In [5]:
# default_exp core

In [1]:
import ast

from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelWithLMHead

from core.aligners.custom_aligner import CustomAligner
from core.aligners.needleman_wunch import NeedlemanWunch
from core.parsers.ast_unparser import UnparserTokenizer

# Demo

In [2]:
# Load the pretrained GPT-2 checkpoint with a language-modeling head.
# AutoModelForCausalLM is the documented replacement for the deprecated
# AutoModelWithLMHead when the checkpoint is a causal LM such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Load the tokenizer that belongs to the same "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")



Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [3]:
# Toy snippet used as input for both tokenizers, written one source line per literal.
# Adjacent string literals are concatenated by Python, so the value is a single string.
# NOTE(review): `true` is lowercase, so Python parses it as an ordinary name rather than
# the constant `True` -- confirm this is intentional for the demo.
source_code = (
    'def testing(a,b):\n'
    ' variable_1=5656556\n'
    ' variable_2=5656556\n'
    ' while true: return a*b'
)

In [4]:
# Create the project's AST-based tokenizer and parse the snippet into a module-level AST
# (mode='exec' parses the string as a sequence of statements).
unparser_tokenizer = UnparserTokenizer()
parsed_ast = ast.parse(source_code, mode='exec')

In [5]:
# Tokenize the raw source text with the GPT-2 tokenizer (the model itself is not run here).
# These are the model-side tokens that the aligners below receive.
model_tokens = tokenizer.tokenize(source_code)

In [6]:
# Extract the AST-side tokens from the parsed tree with the project's UnparserTokenizer.
# NOTE(review): judging by the recorded aligner output, each AST token presumably carries
# a text fragment plus a node "family" (e.g. '_FunctionDef') -- confirm in UnparserTokenizer.
ast_tokens = unparser_tokenizer.find_tokens(parsed_ast)

In [7]:
# Needleman-Wunsch alignment to associate model tokens with AST tokens.
# The aligner is given copies of both token sequences, so the originals stay available
# for the second aligner (presumably the aligner consumes its inputs -- TODO confirm).
needleman_wunch = NeedlemanWunch(model_tokens.copy(), ast_tokens.copy())
needleman_wunch.find_optimal_sequence()

In [8]:
# Custom aligner: the project's alternative way of associating model tokens with AST tokens.
# It is given fresh copies of the same two token sequences used for Needleman-Wunsch.
custom_aligner = CustomAligner(model_tokens.copy(), ast_tokens.copy())
custom_aligner.align_tokens()

In [9]:
# Collect each aligner's result (its queue of model tokens) so the two can be printed
# and compared below.
needleman_output = needleman_wunch.get_model_tokens_queue()
custom_aligner_output = custom_aligner.get_model_tokens_queue()

In [10]:
# Print a banner, then one Needleman-Wunsch result entry per line. Going by the recorded
# output, each entry is a dict holding the model 'token' and its AST 'association'.
print('\n-------------------- NEEDLEMAN WUNCH ALIGNER -------------------\n')
for token in needleman_output:
    print(token)


-------------------- NEEDLEMAN WUNCH ALIGNER -------------------

{'token': 'def', 'association': {'token': '\n', 'family': '_FunctionDef'}}
{'token': 'Ġtesting', 'association': {'token': '\ndef testing(', 'family': '_FunctionDef'}}
{'token': '(', 'association': {'token': 'a', 'family': '_arg'}}
{'token': 'a', 'association': {'token': ', ', 'family': '_arguments'}}
{'token': ',', 'association': {'token': 'b', 'family': '_arg'}}
{'token': 'b', 'association': {'token': ')', 'family': '_FunctionDef'}}
{'token': '):', 'association': {'token': ':', 'family': '_Ident'}}
{'token': 'Ċ', 'association': {'token': '\n    ', 'family': '_Assign'}}
{'token': 'Ġvariable', 'association': {'token': 'variable_1', 'family': '_Name'}}
{'token': '_', 'association': {'token': ' = ', 'family': '_Assign'}}
{'token': '1', 'association': {'token': '5656556', 'family': '_Constant'}}
{'token': '=', 'association': {'token': '\n    ', 'family': '_Assign'}}
{'token': '565', 'association': {'token': 'variable_2', 'f

In [11]:
# Print a banner, then one custom-aligner result entry per line. Going by the recorded
# output, each entry is a dict holding the model 'token' and its AST 'association',
# which can be None when no AST token was matched.
print('\n------------------------- CUSTOM ALIGNER -----------------------\n')
for token in custom_aligner_output:
    print(token)


------------------------- CUSTOM ALIGNER -----------------------

{'token': 'def', 'association': {'token': '\ndef testing(', 'family': '_FunctionDef'}}
{'token': 'Ġtesting', 'association': {'token': '\ndef testing(', 'family': '_FunctionDef'}}
{'token': '(', 'association': {'token': '\ndef testing(', 'family': '_FunctionDef'}}
{'token': 'a', 'association': {'token': 'a', 'family': '_arg'}}
{'token': ',', 'association': {'token': ', ', 'family': '_arguments'}}
{'token': 'b', 'association': {'token': 'b', 'family': '_arg'}}
{'token': '):', 'association': {'token': ':', 'family': '_Ident'}}
{'token': 'Ċ', 'association': None}
{'token': 'Ġvariable', 'association': {'token': 'variable_1', 'family': '_Name'}}
{'token': '_', 'association': {'token': 'variable_1', 'family': '_Name'}}
{'token': '1', 'association': {'token': 'variable_1', 'family': '_Name'}}
{'token': '=', 'association': {'token': ' = ', 'family': '_Assign'}}
{'token': '565', 'association': {'token': '5656556', 'family': '_Con