### Imports

In [1]:
import ast
import os
import time
from statistics import mean, median

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset

import CodeSyntaxConcept.utils as utils
from CodeSyntaxConcept.tokenizer import CodeTokenizer

### Parameters

In [2]:
# Model checkpoint name handed to `CodeTokenizer.from_pretrained` in the Tokenizer cell
# (presumably a Hugging Face Hub model id -- confirm).
checkpoint = "EleutherAI/gpt-neo-125M"
# CSV loaded into `df_actual_ntp`; the file name embeds the checkpoint name with "/" written as "-".
file_path = "output/testbed_base_EleutherAI-gpt-neo-125M.csv"
# CSV whose `parent_type` column is read in the "Parent Node Types" cell.
parent_node_types_path = "output/parent_node_types.csv"
# CSV whose `child_type` column is read in the "Children Node Types" cell.
child_node_types_path = "output/child_node_types.csv"
# Language name passed to `CodeTokenizer.from_pretrained` together with `checkpoint`.
language = "python"

### Tokenizer

In [3]:
# Project tokenizer wrapper built from `checkpoint` and `language`. Later cells use its
# `.tokenizer` (sub-word encoding with offset mappings), `.parser` (source -> syntax tree)
# and `.node_types` (list of node type names used to index per-type buckets).
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Parent Node Types

In [4]:
# Distinct values of the `parent_type` column, loaded straight into a set so the
# name never refers to the intermediate DataFrame.
parent_node_types = set(
    pd.read_csv(parent_node_types_path, index_col=0)['parent_type']
)
print(parent_node_types)

{'string', 'function_definition', 'list_pattern', 'boolean_operator', 'parenthesized_expression', 'yield', 'if_statement', 'list', 'await', 'decorated_definition', 'assignment', 'binary_operator', 'for_statement', 'finally_clause', 'return_statement', 'block', 'break_statement', 'parameters', 'typed_parameter', 'if_clause', 'lambda', 'dictionary', 'module', 'subscript', 'delete_statement', 'conditional_expression', 'raise_statement', 'for_in_clause', 'call', 'class_definition', 'tuple_pattern', 'tuple', 'slice', 'aliased_import', 'decorator', 'with_clause', 'try_statement', 'ERROR', 'interpolation', 'else_clause', 'dictionary_comprehension', 'list_comprehension', 'dotted_name', 'assert_statement', 'print_statement', 'pattern_list', 'global_statement', 'type', 'import_from_statement', 'format_specifier', 'concatenated_string', 'elif_clause', 'dictionary_splat_pattern', 'augmented_assignment', 'nonlocal_statement', 'chevron', 'with_item', 'import_statement', 'with_statement', 'expression

### Children Node Types

In [5]:
# Distinct values of the `child_type` column, loaded straight into a set so the
# name never refers to the intermediate DataFrame.
child_node_types = set(
    pd.read_csv(child_node_types_path, index_col=0)['child_type']
)
print(child_node_types)

{'^', 'finally', 'if', 'string', 'as', '(', 'print', 'yield', 'or', '*=', 'float', '%', 'import', 'await', 'false', 'none', 'assert', '/', '+', '[', '|', 'return', 'in', 'del', 'except', '!=', '//', 'comment', 'lambda', 'true', 'identifier', '%=', '<<', '>=', '<<=', 'async', '~', '}', ']', '^=', '+=', '**', 'integer', '>>=', 'class', 'raise', 'not', 'from', '>>', '->', '&=', '<=', 'for', 'else', 'nonlocal', ':', 'def', '|=', 'try', 'while', 'break', '-', ';', 'global', 'pass', '//=', '/=', '=', '==', ')', '*', 'ellipsis', 'and', '>', 'with', 'is', 'continue', '-=', '@', '<', '.', 'elif', ',', '&', '{'}


### Actual Token Predictions

In [6]:
# Load the testbed CSV at `file_path`; the first CSV column becomes the row index.
# List-like columns (e.g. `model_input_ids`, `actual_prob_case`) arrive as strings and
# are parsed back into Python objects by later cells.
df_actual_ntp = pd.read_csv(file_path, index_col=0)

In [7]:
df_actual_ntp.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[('ers', 0.03083181567490101), ('_', 0.2419087...","[('anwhile', 1.03836617568492e-16), ('ousy', 3...","[('Ġget', 0.0024285861290991306), ('_', 0.2419..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[('ers', 0.03083181567490101), ('_', 0.1853068...","[('anwhile', 1.03836617568492e-16), ('icester'...","[('Ġexecute', 5.97450380155351e-05), ('_', 0.1..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('dec', 0.00133..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('re', 0.001378..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[('ers', 0.03083181567490101), ('_', 0.2842166...","[('anwhile', 1.03836617568492e-16), ('buquerqu...","[('Ġbuild', 0.0005856865900568664), ('_', 0.28..."


### Token Binding

In [8]:
def bind_bpe_tokens(
    node,              #Tree sitter ast tree
    encoding,          #Token encoding
    actual_probs,      #Actual probabilities
    lines              #Source code Snippet
): 
    """Recursively mirror the tree as nested dicts and bind sub-word tokens to each node.

    A token is bound to a node when their character spans overlap, so one token may be
    bound to several nodes (a node and its ancestors, or adjacent leaves that share a
    merged token).

    Args:
        node: tree node exposing `.type`, `.start_point`, `.end_point` and `.children`.
        encoding: tokenizer output exposing `.offset_mapping`, an iterable of
            `(start, end)` spans, one per token.
        actual_probs: sequence indexed by token position; element `i` is appended to the
            bindings of every node that token `i` overlaps.
        lines: source split into lines; forwarded to `utils.convert_to_offset` with the
            node's start/end points (presumably turning a (row, column) point into an
            offset comparable with the token spans -- confirm against `utils`).

    Returns:
        dict with keys `'type'` (the node type), `'children'` (list of dicts of the same
        shape, one per child, in order) and `'bindings'` (list of bound `actual_probs`
        entries, in token order).

    Raises:
        IndexError: if an overlapping token position has no entry in `actual_probs`.
    """
    node_start = utils.convert_to_offset(node.start_point, lines)
    node_end = utils.convert_to_offset(node.end_point, lines)

    bindings = []
    for encoding_index, (token_start, token_end) in enumerate(encoding.offset_mapping):
        # Token begins inside the node span.
        starts_inside = node_start <= token_start < node_end
        # Token finishes inside the node span.
        ends_inside = node_start < token_end <= node_end
        # Token is strictly wider than a non-empty node on both sides (e.g. a merged
        # token "(x)" around the identifier "x"). The two checks above miss this case,
        # which used to leave such nodes without any binding.
        covers_node = token_start < node_start < node_end < token_end
        if starts_inside or ends_inside or covers_node:
            bindings.append(actual_probs[encoding_index])

    children = [
        bind_bpe_tokens(child, encoding, actual_probs, lines)
        for child in node.children
    ]

    return {'type': node.type, 'children': children, 'bindings': bindings}
        

In [9]:
# Sanity check on the first row: re-tokenizing the snippet must yield as many tokens as
# the `model_input_ids` stored in the CSV, otherwise token offsets and stored
# probabilities would not line up.
encoding = tokenizer.tokenizer(df_actual_ntp.iloc[0]['whole_func_string'], return_offsets_mapping=True)
# The column holds the text form of a Python list; `ast.literal_eval` parses literals
# only, so nothing stored in the CSV can be executed (unlike `eval`).
assert len(ast.literal_eval(df_actual_ntp.iloc[0]['model_input_ids'])) == len(encoding['input_ids'])

In [10]:
# Build one bound tree per snippet: parse the source, re-tokenize it with offsets, and
# attach the stored per-token entries to every tree node they overlap.
binded_tree_col = []
for index, row in df_actual_ntp.iterrows():
    source_code = row['whole_func_string']
    tree = tokenizer.parser.parse(bytes(source_code, "utf8"))
    encoding = tokenizer.tokenizer(source_code, return_offsets_mapping=True)
    # The CSV stores Python literals as text; `ast.literal_eval` parses them without
    # executing arbitrary code (unlike `eval`).
    # NOTE: despite the name, these are the (token, probability) pairs of `actual_prob_case`.
    actual_logits = ast.literal_eval(row['actual_prob_case'])
    # Prepend a placeholder for the first token, tagged 'FIRST_TOKEN' instead of a
    # probability, so that list positions match the token positions of `encoding`.
    first_token_id = ast.literal_eval(row['model_input_ids'])[0]
    actual_logits.insert(0, (tokenizer.tokenizer.decode(first_token_id), 'FIRST_TOKEN'))
    binded_tree = bind_bpe_tokens(tree.root_node, encoding, actual_logits, source_code.split('\n'))
    binded_tree_col.append(binded_tree)
df_actual_ntp['binded_tree'] = binded_tree_col

2023-03-01 11:05:51.997291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-01 11:05:56.238894: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64
2023-03-01 11:05:56.239020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64


In [11]:
def process_bindings(
    node: dict,     #Binded AST tree with actual probabilities
) -> None:
    """Annotate `node` and all of its descendants, in place, with probability statistics.

    Only bindings whose second element is a float are considered, which skips the
    'FIRST_TOKEN' placeholder. Each node receives the keys `median_prob`, `max_prob`,
    `min_prob`, `avg_prob` and `std`; all five are None when the node has no float
    probability bound to it.
    """
    stat_keys = ('median_prob', 'max_prob', 'min_prob', 'avg_prob', 'std')
    probs = [entry[1] for entry in node['bindings'] if isinstance(entry[1], float)]

    if probs:
        stat_values = (median(probs), max(probs), min(probs), mean(probs), np.std(probs))
        node.update(zip(stat_keys, stat_values))
    else:
        node.update(dict.fromkeys(stat_keys))

    for child in node['children']:
        process_bindings(child)

In [12]:
# Annotate every bound tree in place with its per-node probability statistics.
for binded_tree in df_actual_ntp['binded_tree']:
    process_bindings(binded_tree)

# Spot-check: median probability of the second child of the first top-level node,
# for the first two snippets.
for row_position in (0, 1):
    print(df_actual_ntp.iloc[row_position]['binded_tree']['children'][0]['children'][1]['median_prob'])

0.0025257335510104895
0.08677685260772705


In [13]:
df_actual_ntp.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case,binded_tree
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[('ers', 0.03083181567490101), ('_', 0.2419087...","[('anwhile', 1.03836617568492e-16), ('ousy', 3...","[('Ġget', 0.0024285861290991306), ('_', 0.2419...","{'type': 'module', 'children': [{'type': 'func..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[('ers', 0.03083181567490101), ('_', 0.1853068...","[('anwhile', 1.03836617568492e-16), ('icester'...","[('Ġexecute', 5.97450380155351e-05), ('_', 0.1...","{'type': 'module', 'children': [{'type': 'func..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('dec', 0.00133...","{'type': 'module', 'children': [{'type': 'func..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('re', 0.001378...","{'type': 'module', 'children': [{'type': 'func..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[('ers', 0.03083181567490101), ('_', 0.2842166...","[('anwhile', 1.03836617568492e-16), ('buquerqu...","[('Ġbuild', 0.0005856865900568664), ('_', 0.28...","{'type': 'module', 'children': [{'type': 'func..."


### Local Analysis (Snippets)

In [14]:
def traverse_tree_and_collect_stds(node: dict, node_types_list: list, std_field: str):
    """Walk a bound tree (node first, then children) and bucket one statistic per node type.

    Args:
        node: bound tree node with keys `'type'`, `'children'` and `std_field`.
        node_types_list: one list per entry of `tokenizer.node_types`, in the same
            order. The inner lists are appended to in place, so they must be distinct
            list objects (e.g. `[[] for _ in tokenizer.node_types]`).
        std_field: key of the statistic to collect, e.g. `'median_prob'`.

    Nodes whose `std_field` is None are skipped. A node type missing from
    `tokenizer.node_types` makes the `.index` lookup raise ValueError.
    """
    value = node[std_field]
    if value is not None:
        # Resolve the bucket once and append in place. The previous form looked the type
        # up twice and rebuilt the whole bucket with `list + [value]` on every hit,
        # which is quadratic when collecting over a full corpus.
        node_types_list[tokenizer.node_types.index(node['type'])].append(value)
    for child in node['children']:
        traverse_tree_and_collect_stds(child, node_types_list, std_field)

In [15]:
def add_std_column(std_field, dataframe):
    """Add a `'concept_' + std_field` column holding, per snippet, the statistic grouped by node type.

    For every tree in `dataframe['binded_tree']` the values of `std_field` are bucketed
    by node type; the cell for that row becomes a list of `(node_type, values)` tuples,
    in `tokenizer.node_types` order, keeping only types that occur in the tree.
    The dataframe is modified in place and nothing is returned.
    """
    per_snippet = []
    for bound_tree in dataframe['binded_tree']:
        buckets = [[] for _ in tokenizer.node_types]
        traverse_tree_and_collect_stds(bound_tree, buckets, std_field)
        per_snippet.append([
            (node_type, values)
            for node_type, values in zip(tokenizer.node_types, buckets)
            if len(values) > 0
        ])
    dataframe['concept_' + std_field] = per_snippet

In [16]:
# Add one per-snippet "statistic grouped by node type" column for each statistic.
for std_field in ('median_prob', 'min_prob'):
    add_std_column(std_field, df_actual_ntp)
df_actual_ntp.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case,binded_tree,concept_median_prob,concept_min_prob
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[('ers', 0.03083181567490101), ('_', 0.2419087...","[('anwhile', 1.03836617568492e-16), ('ousy', 3...","[('Ġget', 0.0024285861290991306), ('_', 0.2419...","{'type': 'module', 'children': [{'type': 'func...","[(if, [0.6537858247756958]), (string, [0.77847...","[(if, [0.6537858247756958]), (string, [0.00227..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[('ers', 0.03083181567490101), ('_', 0.1853068...","[('anwhile', 1.03836617568492e-16), ('icester'...","[('Ġexecute', 5.97450380155351e-05), ('_', 0.1...","{'type': 'module', 'children': [{'type': 'func...","[(string, [0.44025999307632446, 0.266962230205...","[(string, [1.3455122825689614e-05, 0.036820098..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('dec', 0.00133...","{'type': 'module', 'children': [{'type': 'func...","[(if, [0.5241414904594421, 0.5782523155212402]...","[(if, [0.5241414904594421, 0.5782523155212402]..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('re', 0.001378...","{'type': 'module', 'children': [{'type': 'func...","[(string, [0.9896580576896667, 0.0001314329565...","[(string, [1.8510989320930094e-05, 0.000131432..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[('ers', 0.03083181567490101), ('_', 0.2842166...","[('anwhile', 1.03836617568492e-16), ('buquerqu...","[('Ġbuild', 0.0005856865900568664), ('_', 0.28...","{'type': 'module', 'children': [{'type': 'func...","[(if, [0.3932020962238312]), (string, [0.05321...","[(if, [0.3932020962238312]), (string, [2.21859..."


### Global Analysis (AST Elements)

In [17]:
def collect_global_std(std_field, dataframe):
    """Bucket `std_field` by node type across every tree in `dataframe['binded_tree']`.

    Returns a list aligned with `tokenizer.node_types`: element `i` is the list of
    `std_field` values of all nodes of type `tokenizer.node_types[i]` over all snippets
    (empty when that type never carries a value).
    """
    buckets = [[] for _ in range(len(tokenizer.node_types))]
    for bound_tree in dataframe['binded_tree']:
        traverse_tree_and_collect_stds(bound_tree, buckets, std_field)
    return buckets

In [18]:
# Corpus-wide buckets (one list per node type) for the median and the minimum
# probability of each node.
concept_median_prob_list, concept_min_prob_list = (
    collect_global_std(std_field, df_actual_ntp)
    for std_field in ('median_prob', 'min_prob')
)

In [19]:
# One row per node type, aligned by position with `tokenizer.node_types`.
# The frame is built in a single call: growing it row by row with `.loc` re-allocates
# the whole frame on every insertion.
global_concept_dataframe = pd.DataFrame({
    'ast_element': list(tokenizer.node_types),
    'concept_median_prob': concept_median_prob_list,
    'concept_min_prob': concept_min_prob_list,
})

In [20]:
global_concept_dataframe

Unnamed: 0,ast_element,concept_median_prob,concept_min_prob
0,^,[],[]
1,finally,"[0.1794426441192627, 0.2560184895992279, 0.181...","[0.1794426441192627, 0.2560184895992279, 0.181..."
2,_simple_statement,[],[]
3,if,"[0.6537858247756958, 0.5241414904594421, 0.578...","[0.6537858247756958, 0.5241414904594421, 0.578..."
4,string,"[0.7784753441810608, 0.44025999307632446, 0.26...","[0.0022710536140948534, 1.3455122825689614e-05..."
...,...,...,...
191,{,"[0.27034106850624084, 0.06825897842645645, 0.0...","[0.27034106850624084, 0.06825897842645645, 0.0..."
192,lambda_parameters,"[0.0032521618995815516, 0.3915998339653015]","[0.0032521618995815516, 0.006417915225028992]"
193,typed_default_parameter,[],[]
194,expression_statement,"[0.7784753441810608, 0.44025999307632446, 0.07...","[0.0022710536140948534, 1.3455122825689614e-05..."


### Snippet Concept Embeddings

In [21]:
def get_concept_embeddings(node, concepts):
    """Summarise a bound tree as one number per concept.

    For each entry of `concepts` (node type names), the probabilities bound to every
    node of that type in the tree are pooled -- skipping the 'FIRST_TOKEN' placeholder --
    and reduced to their median. A concept with no pooled probability yields 0.

    Returns a list aligned with `concepts`.
    """
    pooled = [[] for _ in concepts]

    def pool_probabilities(current):
        # Children are visited before the node itself.
        for child in current['children']:
            pool_probabilities(child)
        try:
            # First matching position, as with a membership test followed by `.index`.
            slot = concepts.index(current['type'])
        except ValueError:
            return  # node type is not one of the requested concepts
        pooled[slot].extend(
            prob for _, prob in current['bindings'] if prob != 'FIRST_TOKEN'
        )

    pool_probabilities(node)
    return [median(probs) if probs else 0 for probs in pooled]
    

In [22]:
#### MOST FREQUENT CONCEPTS - EXPLORATORY ANALYSIS
most_frequent_leaves = ['identifier', '.', '(', ')', ',', '=', 'string',':','[',']','integer']
most_frequent_parents = ['attribute','expression_statement','argument_list','call','assignment','comparison_operator', 'if_statement','return_statement','for_statement', 'parameters']
concepts = most_frequent_leaves + most_frequent_parents
# One row per snippet, one column per concept. All rows are computed first and the
# frame is built once: growing it row by row with `.loc` re-allocates the whole frame
# on every insertion.
df_concept_embeddings = pd.DataFrame(
    [get_concept_embeddings(binded_tree, concepts) for binded_tree in df_actual_ntp['binded_tree']],
    columns=concepts,
)

In [23]:
df_concept_embeddings

Unnamed: 0,identifier,.,(,),",",=,string,:,[,],...,attribute,expression_statement,argument_list,call,assignment,comparison_operator,if_statement,return_statement,for_statement,parameters
0,0.156971,0.713405,0.147036,0.080469,0.475690,0.000000,0.778475,0.080469,0.000000,0.000000,...,0.477256,0.778475,0.014872,0.343622,0.000000,0.512151,0.999796,0.447364,0.999746,0.080469
1,0.771225,0.008170,0.569995,0.417964,0.668838,0.857300,0.437222,0.582480,0.000000,0.000000,...,0.022576,0.623677,0.992551,0.954952,0.819023,0.000000,0.000000,0.997409,0.000000,0.226456
2,0.258213,0.652287,0.507631,0.406716,0.617457,0.460635,0.364909,0.736973,0.417742,0.680124,...,0.463851,0.493506,0.981943,0.873090,0.372309,0.370858,0.997098,0.293735,0.000000,0.056926
3,0.070280,0.424575,0.626984,0.339046,0.083339,0.711400,0.644779,0.160940,0.008454,0.079043,...,0.448601,0.467311,0.170392,0.417464,0.439323,0.000000,0.000000,0.320331,0.823992,0.019856
4,0.138766,0.536793,0.229903,0.030855,0.440560,0.877260,0.053210,0.788386,0.752652,0.535191,...,0.536353,0.999645,0.999966,0.999963,0.164813,0.518637,0.999280,0.000000,0.998989,0.021539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.359243,0.405570,0.267881,0.102278,0.318372,0.813165,0.353171,0.095666,0.000000,0.000000,...,0.447181,0.338090,0.660784,0.398508,0.314067,0.000000,0.000000,0.398508,0.000000,0.095666
96,0.164901,0.309655,0.505404,0.180601,0.396127,0.925209,0.042850,0.742384,0.082021,0.161671,...,0.195883,0.191853,0.392626,0.262477,0.234001,0.419765,0.999061,0.000000,0.000000,0.157972
97,0.483576,0.984598,0.352837,0.659989,0.479029,0.827725,0.156683,0.849902,0.981576,0.890145,...,0.738280,0.959182,0.997203,0.992732,0.328838,0.739819,0.999051,0.027442,0.000000,0.079162
98,0.328847,0.332634,0.381625,0.200231,0.784930,0.637194,0.803410,0.991581,0.000000,0.000000,...,0.290401,0.653183,0.841836,0.710848,0.584961,0.000000,0.999226,0.000000,0.000000,0.053828
