## Step 0 - Install the tree-sitter Python grammar first

In [1]:
# Fetch the tree-sitter Python grammar sources into ./tree-sitter-python.
# NOTE(review): presumably TreeSitterParser loads its Python language from this
# checkout — confirm the path it expects.
# Re-running this cell once the directory exists only makes git report that the
# destination path already exists; nothing is re-downloaded.
!git clone https://github.com/tree-sitter/tree-sitter-python

Cloning into 'tree-sitter-python'...
remote: Enumerating objects: 2549, done.
remote: Counting objects: 100% (293/293), done.
remote: Compressing objects: 100% (131/131), done.
remote: Total 2549 (delta 167), reused 270 (delta 151), pack-reused 2256
Receiving objects: 100% (2549/2549), 19.05 MiB | 4.30 MiB/s, done.
Resolving deltas: 100% (1596/1596), done.
Updating files: 100% (44/44), done.


## Step 1 - Import modules

In [3]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer

from core.mappers.concept_mapper import ConceptMapper
from core.parsers.tree_sitter_unparser import TreeSitterParser

## Step 2 - Load the model and tokenizer

In [3]:
# Load the pretrained GPT-2 language model.
# AutoModelForCausalLM is used instead of AutoModelWithLMHead: the latter is
# deprecated in transformers, and for a causal LM such as GPT-2 the documented
# replacement is AutoModelForCausalLM.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Load the matching GPT-2 tokenizer; this is the tokenizer handed to the
# ConceptMapper in Step 3.
tokenizer = AutoTokenizer.from_pretrained("gpt2")



## Step 3 - Use the Mapper

In [4]:
# Sample source code to run through the mapper.
# NOTE(review): `while(r==3) true:` is not valid Python — presumably this is
# intentional, to show how the mapper behaves on malformed input; confirm.
source_code = 'def testing(a,b):\n variable_1=5656556\n variable_2=5656556\n while(r==3) true: return a*b'

# Map the snippet using the GPT-2 tokenizer and the tree-sitter Python language.
# Judging by the rendered result, each row pairs one tokenizer token with a
# concept label and the AST token it belongs to.
output = ConceptMapper.map_ast_families(source_code, tokenizer, TreeSitterParser.PY_LANGUAGE)

# Leave the result as the cell's last expression so Jupyter renders it with its
# rich repr instead of the plain-text print() form.
output

        token     concept   ast_token
0         def         def         def
1    Ġtesting  identifier     testing
2           (           (           (
3           a  identifier           a
4           ,           ,           ,
5           b  identifier           b
6          ):           :           :
7           Ċ        None        None
8   Ġvariable  identifier  variable_1
9           _  identifier  variable_1
10          1  identifier  variable_1
11          =           =           =
12        565     integer     5656556
13        655     integer     5656556
14          6     integer     5656556
15          Ċ        None        None
16  Ġvariable  identifier  variable_2
17          _  identifier  variable_2
18          2  identifier  variable_2
19          =           =           =
20        565     integer     5656556
21        655     integer     5656556
22          6     integer     5656556
23          Ċ        None        None
24     Ġwhile       while       while
25          