### Imports

In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
from datasets import load_dataset 
from CodeSyntaxConcept.tokenizer import CodeTokenizer
import CodeSyntaxConcept.utils as utils
from statistics import mean, median
import json

### Parameters

In [2]:
# Model checkpoint identifier and the testbed CSV whose file name is reused
# for the aggregates file below.
checkpoint = "EleutherAI/gpt-neo-125M"
file_path = "output/testbed_base_EleutherAI-gpt-neo-125M.csv"
# CSV files holding the parent and child node types.
parent_node_types_path = "output/parent_node_types.csv"
child_node_types_path = "output/child_node_types.csv"
language = "python"
# The aggregates file shares the testbed's file name. basename() keeps this
# correct if file_path gains or loses directory levels, whereas
# split('/')[1] only works with exactly one leading directory.
aggregates_path = "output/aggregation_function/" + os.path.basename(file_path)

In [3]:
# Load the project's CodeTokenizer for the configured checkpoint and language.
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Parent Node Types

In [4]:
# Unique values of the 'parent_type' column; the CSV's first column is its index.
parent_node_types = set(
    pd.read_csv(parent_node_types_path, index_col=0)['parent_type']
)

### Children Node Types

In [5]:
# Unique values of the 'child_type' column; the CSV's first column is its index.
child_node_types = set(
    pd.read_csv(child_node_types_path, index_col=0)['child_type']
)

### Load Aggregates

In [6]:
# Read the aggregates CSV; its first column becomes the DataFrame index.
df_actual_ntp = pd.read_csv(aggregates_path, index_col=0)

In [7]:
## Convert JSON to dict
# Only string values are parsed, so re-running this cell after the column has
# already been decoded leaves it unchanged instead of raising TypeError from
# json.loads being handed a dict.
df_actual_ntp['binded_tree'] = df_actual_ntp['binded_tree'].map(
    lambda binded_tree: json.loads(binded_tree) if isinstance(binded_tree, str) else binded_tree
)

In [8]:
# Preview the first rows of the loaded aggregates.
df_actual_ntp.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case,binded_tree
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[('ers', 0.03083181567490101), ('_', 0.2419087...","[('anwhile', 1.03836617568492e-16), ('ousy', 3...","[('Ġget', 0.0024285861290991306), ('_', 0.2419...","{'type': 'module', 'children': [{'type': 'func..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[('ers', 0.03083181567490101), ('_', 0.1853068...","[('anwhile', 1.03836617568492e-16), ('icester'...","[('Ġexecute', 5.97450380155351e-05), ('_', 0.1...","{'type': 'module', 'children': [{'type': 'func..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('dec', 0.00133...","{'type': 'module', 'children': [{'type': 'func..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('re', 0.001378...","{'type': 'module', 'children': [{'type': 'func..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[('ers', 0.03083181567490101), ('_', 0.2842166...","[('anwhile', 1.03836617568492e-16), ('buquerqu...","[('Ġbuild', 0.0005856865900568664), ('_', 0.28...","{'type': 'module', 'children': [{'type': 'func..."


### Snippet Concept Embeddings

In [9]:
def get_concept_embeddings(node, concepts):
    """Summarise a bound syntax tree as one median probability per concept.

    Every node in the tree rooted at ``node`` whose ``'type'`` is listed in
    ``concepts`` contributes the probabilities found in its ``'bindings'``
    (a sequence of ``(token, prob)`` pairs). Entries whose probability is the
    ``'FIRST_TOKEN'`` sentinel are skipped.

    Args:
        node: Root of the tree, a dict with a ``'type'`` key and optional
            ``'children'`` (list of nodes) and ``'bindings'`` keys. A missing
            ``'children'`` or ``'bindings'`` key is treated as empty.
        concepts: Sequence of hashable node-type names, one per output slot.

    Returns:
        A list aligned with ``concepts`` holding the median of the collected
        probabilities for each concept, or ``0`` when none were found. A
        concept listed more than once gets the same value in every slot.
    """
    collected = {concept: [] for concept in concepts}
    # Explicit stack instead of recursion: deeply nested trees would otherwise
    # hit Python's recursion limit. Visit order is irrelevant for a median.
    pending = [node]
    while pending:
        current = pending.pop()
        pending.extend(current.get('children') or ())
        # Dict lookup replaces the per-node linear `concepts.index(...)` scan.
        probs = collected.get(current['type'])
        if probs is not None:
            probs.extend(
                prob for _, prob in (current.get('bindings') or ())
                if prob != 'FIRST_TOKEN'
            )
    ##TODO: Bootstrapping
    return [median(collected[concept]) if collected[concept] else 0 for concept in concepts]
    

In [10]:
#### MOST FREQUENT CONCEPTS - EXPLORATORY ANALYSIS
most_frequent_leaves = ['identifier', '.', '(', ')', ',', '=', 'string',':','[',']','integer']
most_frequent_parents = ['attribute','expression_statement','argument_list','call','assignment','comparison_operator', 'if_statement','return_statement','for_statement', 'parameters']
concepts = most_frequent_leaves + most_frequent_parents
# One embedding row per snippet, one column per concept. The rows are
# collected first and the frame is built once, rather than enlarging the
# DataFrame through .loc on every iteration.
df_concept_embeddings = pd.DataFrame(
    [get_concept_embeddings(binded_tree, concepts) for binded_tree in df_actual_ntp['binded_tree']],
    columns=concepts,
)

In [11]:
# Display the concept embeddings: one row per snippet, one column per concept.
df_concept_embeddings

Unnamed: 0,identifier,.,(,),",",=,string,:,[,],...,attribute,expression_statement,argument_list,call,assignment,comparison_operator,if_statement,return_statement,for_statement,parameters
0,0.156971,0.713405,0.147036,0.080469,0.475690,0.000000,0.778475,0.080469,0.000000,0.000000,...,0.477256,0.778475,0.014872,0.343622,0.000000,0.512151,0.999796,0.447364,0.999746,0.080469
1,0.771225,0.008170,0.569995,0.417964,0.668838,0.857300,0.437222,0.582480,0.000000,0.000000,...,0.022576,0.623677,0.992551,0.954952,0.819023,0.000000,0.000000,0.997409,0.000000,0.226456
2,0.258213,0.652287,0.507631,0.406716,0.617457,0.460635,0.364909,0.736973,0.417742,0.680124,...,0.463851,0.493506,0.981943,0.873090,0.372309,0.370858,0.997098,0.293735,0.000000,0.056926
3,0.070280,0.424575,0.626984,0.339046,0.083339,0.711400,0.644779,0.160940,0.008454,0.079043,...,0.448601,0.467311,0.170392,0.417464,0.439323,0.000000,0.000000,0.320331,0.823992,0.019856
4,0.138766,0.536793,0.229903,0.030855,0.440560,0.877260,0.053210,0.788386,0.752652,0.535191,...,0.536353,0.999645,0.999966,0.999963,0.164813,0.518637,0.999280,0.000000,0.998989,0.021539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.359243,0.405570,0.267881,0.102278,0.318372,0.813165,0.353171,0.095666,0.000000,0.000000,...,0.447181,0.338090,0.660784,0.398508,0.314067,0.000000,0.000000,0.398508,0.000000,0.095666
96,0.164901,0.309655,0.505404,0.180601,0.396127,0.925209,0.042850,0.742384,0.082021,0.161671,...,0.195883,0.191853,0.392626,0.262477,0.234001,0.419765,0.999061,0.000000,0.000000,0.157972
97,0.483576,0.984598,0.352837,0.659989,0.479029,0.827725,0.156683,0.849902,0.981576,0.890145,...,0.738280,0.959182,0.997203,0.992732,0.328838,0.739819,0.999051,0.027442,0.000000,0.079162
98,0.328847,0.332634,0.381625,0.200231,0.784930,0.637194,0.803410,0.991581,0.000000,0.000000,...,0.290401,0.653183,0.841836,0.710848,0.584961,0.000000,0.999226,0.000000,0.000000,0.053828
