In [6]:
import sourcy

In [10]:
from sourcy.tokens.doc import Document
import tree_sitter
from tree_sitter.binding import Tree, Node
from typing import List


class Token:
    """
    A single source token together with the annotations attached to it.

    All fields are set once by the constructor and are exposed through
    read-only properties.
    """

    def __init__(self, token: str, annotation=None, block_annotation=None, position=0):
        """
        :param token: The token text
        :param annotation: The annotation of the token itself
        :param block_annotation: The annotation of the block the token belongs to
        :param position: The position of the token (defaults to 0)
        """
        self._token, self._annotation = token, annotation
        self._block, self._position = block_annotation, position

    @property
    def token(self):
        """The token text."""
        return self._token

    @property
    def annotation(self):
        """The annotation of the token itself."""
        return self._annotation

    @property
    def block(self):
        """The annotation of the enclosing block."""
        return self._block

    @property
    def position(self):
        """The position of the token."""
        return self._position

    def __str__(self):
        # Same "text - annotation - block - position" layout used when printing tokens.
        return "{} - {} - {} - {}".format(
            self.token, self.annotation, self.block, self.position
        )


def _extract_token_annotation(code: bytes, node: Node) -> (bytes, str):
    """
    Extract the token string from the code
    :param code: The code textual representation
    :param node: A node containing the start and end positions of the token
    :return:
    """
    token = code[node.start_byte:node.end_byte]
    annotations = node.type
    return token, annotations



def _traverse(code: bytes, tree: Tree) -> List[Token]:
    """
    Collect every leaf node of the tree as a Token, in source order.

    The tree is walked depth-first with an explicit stack. Children are pushed
    left to right, so leaves come off the stack right to left; the collected
    list is reversed before it is returned to restore left-to-right order.

    :param code: A byte string representation of the code
    :param tree: The tree representation of the code
    :return: One Token per leaf node, carrying the leaf's text (decoded as
             UTF-8), the leaf's type, the parent's type and the leaf's start byte
    """
    # A plain list serves as the LIFO stack, so this function does not depend
    # on `deque` having been imported by another cell before it is called.
    stack = [(tree.root_node, None)]

    tokens = []
    while stack:
        current, parent = stack.pop()
        children = current.children

        if children:
            stack.extend((child, current) for child in children)
        elif parent is not None:
            # Only the root has no parent. Skipping on identity (instead of
            # comparing node types with the root's type) keeps every real leaf
            # and still ignores a childless root, e.g. for empty input.
            token, annotation = _extract_token_annotation(code, current)
            tokens.append(Token(token.decode("utf8"), annotation, parent.type, current.start_byte))

    tokens.reverse()
    return tokens

In [11]:
# Read the Java source as raw bytes; the bytes are later handed to the parser
# and sliced using the tokens' byte positions.
# NOTE(review): absolute, machine-specific path — this cell fails wherever that
# location does not exist; consider a configurable, relative path.
with open("/home/sasce/PycharmProjects/CodeGraphClassification/notebooks/test/resources/Lexer.java", 'rb') as f:
    file_content = f.read()

In [14]:
from tree_sitter import Language, Parser
from collections import deque

# Load the 'java' language from the shared library, configure a parser with it
# and parse the bytes read from the Java file into a syntax tree.
# `Language` is the name imported directly from tree_sitter in this cell.
lang = Language('/home/sasce/PycharmProjects/CodeGraphClassification/languages.so', 'java')
parser = Parser()
parser.set_language(lang)
tree = parser.parse(file_content)

In [15]:
# Flatten the parsed tree into the list of leaf tokens used by the cells below.
tokens = _traverse(file_content, tree)

In [16]:
# Print one "text - annotation - block - position" line per token;
# print() applies str() to the token itself.
for t in tokens:
    print(t)

public - public - modifiers - 0
abstract - abstract - modifiers - 7
class - class - class_declaration - 16
Lexer - identifier - class_declaration - 22
extends - extends - superclass - 28
Recognizer - type_identifier - generic_type - 36
< - < - type_arguments - 46
Integer - type_identifier - type_arguments - 47
, - , - type_arguments - 54
LexerATNSimulator - type_identifier - type_arguments - 56
> - > - type_arguments - 73
implements - implements - super_interfaces - 76
TokenSource - type_identifier - type_list - 87
{ - { - class_body - 99
public - public - modifiers - 102
Lexer - identifier - constructor_declaration - 109
( - ( - formal_parameters - 114
CharStream - type_identifier - formal_parameter - 115
input - identifier - formal_parameter - 126
) - ) - formal_parameters - 131
{ - { - constructor_body - 133
this - this - field_access - 137
. - . - field_access - 141
_input - identifier - field_access - 142
= - = - assignment_expression - 149
input - identifier - assignment_expressi

In [41]:
def extract_method_bodies(tokens, code):
    """
    Cut the text of every method (name, parameter list and body) out of the code.

    A method starts at an ``identifier`` token whose block is
    ``method_declaration`` and ends at the ``}`` that balances the first ``{``
    belonging to a ``block``. Comments in between are left out and every run
    of whitespace is collapsed to a single space.

    Declarations without a body (a ``;`` directly under the
    ``method_declaration`` before any block was opened) and declarations whose
    braces never balance are skipped instead of running into the following
    method or past the end of the token list.

    :param tokens: Tokens in source order, each exposing ``annotation``,
                   ``block`` and ``position``
    :param code: The bytes the token positions index into
    :return: A list of ``{'body': text}`` dicts, one per method with a body
    """
    methods = []
    total = len(tokens)
    i = 0
    while i < total:
        head = tokens[i]
        i += 1
        if not (head.annotation == 'identifier' and head.block == 'method_declaration'):
            continue

        # Alternating start/end byte offsets of the spans of code to keep.
        cuts = [head.position]
        opened = closed = 0
        end = None
        while i < total and end is None:
            tok = tokens[i]
            if 'comment' in tok.annotation:
                if i + 1 == total:
                    # Nothing follows the comment, so the method never closes.
                    break
                # Stop the current span at the comment, resume at the next token.
                cuts.extend([tok.position, tokens[i + 1].position])
            elif tok.block == 'block' and tok.annotation == '{':
                opened += 1
            elif tok.block == 'block' and tok.annotation == '}':
                closed += 1
                if opened == closed:
                    # '}' is a single byte, so the method ends right after it.
                    # Only whitespace lies between this point and the next
                    # token, and whitespace is normalised below anyway.
                    end = tok.position + 1
            elif opened == 0 and tok.block == 'method_declaration' and tok.annotation == ';':
                # Declaration without a body: nothing to extract.
                break
            i += 1

        if end is None:
            continue

        cuts.append(end)
        pieces = [code[s:e].decode("utf8") for s, e in zip(cuts[::2], cuts[1::2])]
        methods.append({'body': ' '.join(' '.join(pieces).split())})
    return methods


methods = extract_method_bodies(tokens, file_content)

In [42]:
# Display the extracted method records (one {'body': ...} dict per method).
methods

[{'body': 'reset() { if ( _input !=null ) { _input.seek(0); } _token = null; _type = Token.INVALID_TYPE; _channel = Token.DEFAULT_CHANNEL; _tokenStartCharIndex = -1; _tokenStartCharPositionInLine = -1; _tokenStartLine = -1; _text = null; _hitEOF = false; _mode = Lexer.DEFAULT_MODE; _modeStack.clear(); getInterpreter().reset(); }'},
 {'body': 'nextToken() { if (_input == null) { throw new IllegalStateException("nextToken requires a non-null input stream."); } int tokenStartMarker = _input.mark(); try{ outer: while (true) { if (_hitEOF) { emitEOF(); return _token; } _token = null; _channel = Token.DEFAULT_CHANNEL; _tokenStartCharIndex = _input.index(); _tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine(); _tokenStartLine = getInterpreter().getLine(); _text = null; do { _type = Token.INVALID_TYPE; int ttype; try { ttype = getInterpreter().match(_input, _mode); } catch (LexerNoViableAltException e) { notifyListeners(e); recover(e); ttype = SKIP; } if ( _input.LA(1)==Int

In [30]:
# Number of extracted methods.
# NOTE(review): this cell's execution count (30) is lower than the extraction
# cell's (41), so the value shown below may predate the current extraction
# code — re-run to confirm.
len(methods)

38