### parsing functions
using - https://github.com/tree-sitter-grammars/tree-sitter-glsl
basic usage: https://github.com/tree-sitter/py-tree-sitter

In [1]:
import tree_sitter_glsl as tsglsl
import tree_sitter
from tree_sitter import Language, Parser
from typing import List, Tuple

GLSL_LANGUAGE = Language(tsglsl.language())

parser = Parser(GLSL_LANGUAGE)

In [2]:
# https://gist.github.com/TACIXAT/c5b2db4a80c812c4b4373b65e179a220
def format_sexpression(s, indent_level=0, indent_size=4):
    """ChatGPT + TACIXAT"""
    output = ""
    i = 0
    # Initialize to False to avoid newline for the first token
    need_newline = False
    cdepth = [] # Track colons
    while i < len(s):
        if s[i] == "(":
            output += "\n" + " " * (indent_level * indent_size) + "("
            indent_level += 1
            need_newline = False  # Avoid newline after opening parenthesis
        elif s[i] == ":":
            indent_level += 1
            cdepth.append(indent_level) # Store depth where we saw colon
            output += ":"
        elif s[i] == ")":
            indent_level -= 1
            if len(cdepth) > 0 and indent_level == cdepth[-1]: 
                # Unindent when we return to the depth we saw the last colon
                cdepth.pop()
                indent_level -= 1
            output += ")"
            need_newline = True  # Newline needed after closing parenthesis
        elif s[i] == " ":
            output += " "
        else:
            j = i
            while j < len(s) and s[j] not in ["(", ")", " ", ":"]:
                j += 1
            # Add newline and indentation only when needed
            if need_newline:
                output += "\n" + " " * (indent_level * indent_size)
            output += s[i:j]
            i = j - 1
            need_newline = True  # Next token should start on a new line
        i += 1
    return output

def color_ranges(code:str, func_bytes:list):
    """
    returns a string with ANSI color codes to color parts of functions. Omits some part of the code!
    """
    code_bytes = bytes(code, encoding="utf-8")
    colored_bytes = b"\x1b[0m"
    for start_comment, start_header, end_header, end_docstring, end_function in func_bytes:
        colored_bytes += b"\x1b[32m" + code_bytes[start_comment:start_header]
        colored_bytes += b"\x1b[31m" + code_bytes[start_header:end_header]
        colored_bytes += b"\x1b[33m" + code_bytes[end_header:end_docstring]
        colored_bytes += b"\x1b[34m" + code_bytes[end_docstring:end_function]
        colored_bytes += b"\x1b[0m" #escape back to normal
        colored_bytes += b"\n\n" # for pretty

    return colored_bytes.decode(encoding="utf-8")

In [3]:
example_code = """
// comment directly infront of a function
vec3 red(float intensity){
    // comment inside the function body, but the top
    intensity = max(intensity, 1.0);

    // comment deeper inside the function body
    return vec3(1.0,0.0,0.0) * intensity;
}

vec3 green(float intensity){
    // docstring inside
    // second inside docstring
    intensity = max(intensity, 1.0);

    return vec3(0.0,1.0,0.0) * intensity;
}

// this comment is somewhere in between we don't want it


// comment one before
// comment two before
vec3 blue(float intensity){
    intensity = max(intensity, 1.0);

    return vec3(0.0,0.0,1.0) * intensity;
}

void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
    vec2 uv = fragCoord/iResolution.xy;

    vec3 col = mix(red(0.5), blue(0.3), 0.8);
    col = mix(col, green(1.0), sin(iTime));
    fragColor = vec4(col,1.0);
}
"""

tree = parser.parse(bytes(example_code, encoding="utf-8"))
root_node = tree.root_node
tree

<tree_sitter.Tree at 0x22c78ef5490>

In [4]:
print(format_sexpression(str(root_node), -1, 4))


(translation_unit 
(comment) 
(function_definition 
    type: 
        (type_identifier) 
    declarator: 
        (function_declarator 
            declarator: 
                (identifier) 
            parameters: 
                (parameter_list 
                    (parameter_declaration 
                        type: 
                            (primitive_type) 
                        declarator: 
                            (identifier)))) 
    body: 
        (compound_statement 
            (comment) 
            (expression_statement 
                (assignment_expression 
                    left: 
                        (identifier) 
                    right: 
                        (call_expression 
                            function: 
                                (identifier) 
                            arguments: 
                                (argument_list 
                                    (identifier) 
                                    (number_litera

In [78]:
# https://tree-sitter.github.io/tree-sitter/using-parsers#query-syntax

func_query = GLSL_LANGUAGE.query(
        """
        (function_definition ) @function
    """
    )

comment_query = GLSL_LANGUAGE.query(
    """
    (
        (comment)* @comment.before
        (function_definition 
            .
            type: (primitive_type) @function.type
            declarator: (function_declarator) @function.declarator
            body: (compound_statement
                (comment)? @docstring
                ) @function.body
        )
    )
    """
)

def parse_functions2(code:str) -> List[Tuple[int,int,int,int,int]]:
    tree = parser.parse(bytes(code, encoding="utf-8"))
    root_node = tree.root_node
    funcs = []
    
    matches = comment_query.matches(root_node, end_byte=500)
    print(len(matches))
    for func in matches:
        # func = func[1]["function"]
        # print(func.end_point)
        # matches = comment_query.matches(root_node, end_point=func.end_point)
        print(func)
        # continue
        m = matches[0][1]
        print(m)
        start_comment = start_header = end_header = end_docstring = end_function = None
        start_comment = start_header = m["function.type"].start_byte
        end_function = m["function.body"].end_byte
        end_docstring = end_header = m["function.declarator"].end_byte
        if "comment.before" in m:
            # print(len(m["comment.before"]))
            start_comment = m["comment.before"].start_byte
        if "docstring" in m:
            end_docstring = m["docstring"].end_byte
        
        funcs.append(tuple([start_comment, start_header, end_header, end_docstring, end_function]))
    return funcs

funcs2 = parse_functions2(example_code)
print(funcs2)
colored_funcs2 = color_ranges(example_code, func_bytes=funcs2)
# print(colored_funcs2)


2
(0, {'comment.before': [<Node type=comment, start_point=(1, 0), end_point=(1, 41)>], 'function.type': <Node type=primitive_type, start_point=(29, 0), end_point=(29, 4)>, 'function.declarator': <Node type=function_declarator, start_point=(29, 5), end_point=(29, 55)>, 'function.body': <Node type=compound_statement, start_point=(30, 0), end_point=(36, 1)>})
{'comment.before': [<Node type=comment, start_point=(1, 0), end_point=(1, 41)>], 'function.type': <Node type=primitive_type, start_point=(29, 0), end_point=(29, 4)>, 'function.declarator': <Node type=function_declarator, start_point=(29, 5), end_point=(29, 55)>, 'function.body': <Node type=compound_statement, start_point=(30, 0), end_point=(36, 1)>}


AttributeError: 'list' object has no attribute 'start_byte'

In [101]:
dev_query = GLSL_LANGUAGE.query(
    """
    (   
        [
        (comment)
        (comment)+
        ] @comment
        (function_definition ) @function
    )
    """
)

ms = dev_query.captures(root_node)
print(len(ms))
for c in ms:
    print(c)
    # print(c[1]["commented_function"].text)

16
(<Node type=comment, start_point=(1, 0), end_point=(1, 41)>, 'comment')
(<Node type=comment, start_point=(1, 0), end_point=(1, 41)>, 'comment')
(<Node type=comment, start_point=(1, 0), end_point=(1, 41)>, 'comment')
(<Node type=comment, start_point=(1, 0), end_point=(1, 41)>, 'comment')
(<Node type=function_definition, start_point=(2, 0), end_point=(8, 1)>, 'function')
(<Node type=function_definition, start_point=(10, 0), end_point=(16, 1)>, 'function')
(<Node type=comment, start_point=(18, 0), end_point=(18, 56)>, 'comment')
(<Node type=comment, start_point=(18, 0), end_point=(18, 56)>, 'comment')
(<Node type=comment, start_point=(21, 0), end_point=(21, 21)>, 'comment')
(<Node type=comment, start_point=(21, 0), end_point=(21, 21)>, 'comment')
(<Node type=comment, start_point=(22, 0), end_point=(22, 21)>, 'comment')
(<Node type=comment, start_point=(22, 0), end_point=(22, 21)>, 'comment')
(<Node type=function_definition, start_point=(23, 0), end_point=(27, 1)>, 'function')
(<Node ty

In [109]:

def parse_functions(code:str) -> List[Tuple[int,int,int,int,int]]:
    """
    parses the code using tree-parser-glsl
    returns the **byte-indecies** for before_comment, start header, end header, end docstring, end_function.
    returns a list 5-tupel. If before_comment or docstring aren't found, the indiecies will coinside with the next one.
    """
    tree = parser.parse(bytes(code, encoding="utf-8"))
    root_node = tree.root_node
    funcs = []
    
    # lazy init
    start_comment = start_header = end_header = end_docstring = end_function = None
    comment_line = -2
    for child in root_node.children:
        if child.type == "comment" and comment_line + 1 != child.end_point[0]:
            start_comment = child.start_byte
            comment_line = child.end_point[0]
        if child.type == "function_definition":
            start_header = child.start_byte
            if not start_comment:
                start_comment = start_header
            end_function = child.end_byte
            end_header = child.children[-1].children[0].end_byte
            # inside the function body, past the "{"
            for sub_child in child.children[-1].children[1:]:
                if sub_child.type == "comment":
                    end_docstring = sub_child.end_byte
                else:
                    if not end_docstring:
                        end_docstring = end_header
                    break
                


            funcs.append(tuple([start_comment, start_header, end_header, end_docstring, end_function]))
            start_comment = start_header = end_header = end_docstring = end_function = None
    return funcs


funcs = parse_functions(example_code)
print(funcs)
colored_funcs = color_ranges(example_code, func_bytes=funcs)
print(colored_funcs)


[(1, 43, 69, 122, 251), (253, 253, 281, 336, 418), (479, 523, 550, 550, 632), (634, 634, 691, 691, 855)]
[0m[32m// comment directly infront of a function
[31mvec3 red(float intensity){[33m
    // comment inside the function body, but the top[34m
    intensity = max(intensity, 1.0);

    // comment deeper inside the function body
    return vec3(1.0,0.0,0.0) * intensity;
}[0m

[32m[31mvec3 green(float intensity){[33m
    // docstring inside
    // second inside docstring[34m
    intensity = max(intensity, 1.0);

    return vec3(0.0,1.0,0.0) * intensity;
}[0m

[32m// comment one before
// comment two before
[31mvec3 blue(float intensity){[33m[34m
    intensity = max(intensity, 1.0);

    return vec3(0.0,0.0,1.0) * intensity;
}[0m

[32m[31mvoid mainImage( out vec4 fragColor, in vec2 fragCoord )
{[33m[34m
    vec2 uv = fragCoord/iResolution.xy;

    vec3 col = mix(red(0.5), blue(0.3), 0.8);
    col = mix(col, green(1.0), sin(iTime));
    fragColor = vec4(col,1.0);
}[0m