### Imports

In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
from datasets import load_dataset 
from CodeSyntaxConcept.tokenizer import CodeTokenizer
import CodeSyntaxConcept.utils as utils
from statistics import mean, median
import json

### Parameters

In [2]:
# Checkpoint identifier passed to CodeTokenizer.from_pretrained in the next cell.
# NOTE(review): aggregates_path below names a gpt-neo-2.7B run while this is the
# 125M checkpoint — confirm the mismatch is intentional.
checkpoint = "EleutherAI/gpt-neo-125M"
# CSV files from which the 'parent_type' / 'child_type' columns are read below.
parent_node_types_path = "output/parent_node_types.csv"
child_node_types_path = "output/child_node_types.csv"
# CSV loaded into df_actual_ntp; its 'binded_tree' column holds JSON-encoded trees.
aggregates_path = "output/aggregation_function/codesearch_tesbed_EleutherAI-gpt-neo-2.7B_10000_aggregated.csv"
# Language argument passed to CodeTokenizer.from_pretrained.
language = "python"

In [3]:
# Build the tokenizer for the configured checkpoint and language.
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Parent Node Types

In [4]:
# Distinct values of the 'parent_type' column; the CSV's first column is the index.
parent_node_types = set(
    pd.read_csv(parent_node_types_path, index_col=0)['parent_type']
)

### Children Node Types

In [5]:
# Distinct values of the 'child_type' column; the CSV's first column is the index.
child_node_types = set(
    pd.read_csv(child_node_types_path, index_col=0)['child_type']
)

### Load Aggregates

In [6]:
# Read the aggregated results CSV, using its first column as the index.
df_actual_ntp = pd.read_csv(aggregates_path, index_col=0)

In [7]:
## Convert JSON to dict
# This cell overwrites its own input column, so values that are already decoded
# (dict/list) are passed through unchanged; without that guard a second run of
# the cell would call json.loads on a dict and raise TypeError.
df_actual_ntp['binded_tree'] = df_actual_ntp['binded_tree'].map(
    lambda binded_tree: binded_tree
    if isinstance(binded_tree, (dict, list))
    else json.loads(binded_tree)
)

In [8]:
# Preview the first rows of the loaded aggregates (binded_tree is already decoded here).
df_actual_ntp.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case,binded_tree
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[('ers', 0.03083181567490101), ('_', 0.2419087...","[('anwhile', 1.03836617568492e-16), ('ousy', 3...","[('Ġget', 0.0024285861290991306), ('_', 0.2419...","{'type': 'module', 'children': [{'type': 'func..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[('ers', 0.03083181567490101), ('_', 0.1853068...","[('anwhile', 1.03836617568492e-16), ('icester'...","[('Ġexecute', 5.97450380155351e-05), ('_', 0.1...","{'type': 'module', 'children': [{'type': 'func..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('dec', 0.00133...","{'type': 'module', 'children': [{'type': 'func..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[('ers', 0.03083181567490101), ('(', 0.0166473...","[('anwhile', 1.03836617568492e-16), ('ousy', 4...","[('Ġ_', 0.001834857277572155), ('re', 0.001378...","{'type': 'module', 'children': [{'type': 'func..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[('ers', 0.03083181567490101), ('_', 0.2842166...","[('anwhile', 1.03836617568492e-16), ('buquerqu...","[('Ġbuild', 0.0005856865900568664), ('_', 0.28...","{'type': 'module', 'children': [{'type': 'func..."


### Snippet Concept Embeddings

In [15]:
def get_concept_embeddings(node, concepts):
    """Summarise the token probabilities bound to each concept in a tree.

    The tree rooted at ``node`` is walked depth-first, children before the
    node itself. Whenever a node's ``'type'`` is one of ``concepts``, the
    probabilities of its ``'bindings'`` pairs are collected for that concept;
    pairs whose probability is the ``'FIRST_TOKEN'`` marker are skipped.

    Args:
        node: Dict with a ``'type'`` key and, optionally, a ``'children'``
            list of dicts of the same shape (a missing or empty ``'children'``
            is treated as a leaf). Nodes whose type is in ``concepts`` must
            also carry ``'bindings'``, an iterable of ``(token, prob)`` pairs.
        concepts: Sequence of node-type names. Its order fixes the order of
            the returned values; if a name is repeated, only its first
            position receives data.

    Returns:
        A list with one entry per concept: the median of
        ``utils.bootstrapping(probs, np.mean, size=500)`` over the collected
        probabilities, or ``0`` when nothing was collected for that concept.

    Raises:
        KeyError: If a node has no ``'type'``, or a node whose type is in
            ``concepts`` has no ``'bindings'``.
    """
    concepts = list(concepts)

    # Resolve each concept to its first position once, instead of scanning the
    # list with ``in`` and ``.index`` for every node of the tree.
    slot_of = {}
    for position, concept in enumerate(concepts):
        slot_of.setdefault(concept, position)

    collected = [[] for _ in concepts]

    def _collect(current):
        # Post-order walk, so probabilities are gathered in the same order as
        # a children-first recursive traversal.
        for child in current.get('children') or ():
            _collect(child)
        slot = slot_of.get(current['type'])
        if slot is not None:
            collected[slot].extend(
                prob for _token, prob in current['bindings'] if prob != 'FIRST_TOKEN'
            )

    _collect(node)

    embedding = []
    for probs in collected:
        if probs:
            # Bootstrap the mean, then take the median of the resampled means.
            # NOTE(review): reproducibility depends on how utils.bootstrapping
            # draws its samples — confirm whether it is seeded.
            resampled_means = utils.bootstrapping(probs, np.mean, size=500).tolist()
            embedding.append(median(resampled_means))
        else:
            # No probabilities for this concept in the tree.
            embedding.append(0)
    return embedding
    

In [16]:
#### MOST FREQUENT CONCEPTS - EXPLORATORY ANALYSIS
# Node types to embed: leaf types first, then parent types. The combined order
# defines the column order of df_concept_embeddings.
most_frequent_leaves = ['identifier', '.', '(', ')', ',', '=', 'string', ':', '[', ']', 'integer']
most_frequent_parents = [
    'attribute', 'expression_statement', 'argument_list', 'call', 'assignment',
    'comparison_operator', 'if_statement', 'return_statement', 'for_statement', 'parameters',
]
concepts = most_frequent_leaves + most_frequent_parents

# Collect one embedding row per tree and build the frame once. Enlarging a
# DataFrame with .loc[len(df)] re-allocates on every inserted row.
concept_embedding_rows = []
for binded_tree in df_actual_ntp['binded_tree']:
    concept_embedding_rows.append(get_concept_embeddings(binded_tree, concepts))
df_concept_embeddings = pd.DataFrame(concept_embedding_rows, columns=concepts)

In [18]:
# Display the concept embedding table: one row per binded tree, one column per concept.
df_concept_embeddings

Unnamed: 0,identifier,.,(,),",",=,string,:,[,],...,attribute,expression_statement,argument_list,call,assignment,comparison_operator,if_statement,return_statement,for_statement,parameters
0,0.247978,0.713405,0.147036,0.047671,0.475690,0.000000,0.616789,0.237462,0.000000,0.000000,...,0.476376,0.623299,0.014872,0.387745,0.000000,0.520069,0.827289,0.447364,0.805238,0.180006
1,0.553572,0.301083,0.544647,0.355568,0.590923,0.701462,0.503896,0.409560,0.000000,0.000000,...,0.247976,0.535017,0.767892,0.721437,0.615278,0.000000,0.000000,0.810869,0.000000,0.335569
2,0.411638,0.494048,0.447135,0.394196,0.655803,0.448934,0.480014,0.711429,0.490898,0.717164,...,0.454433,0.515587,0.711673,0.664521,0.409602,0.429127,0.744368,0.386768,0.000000,0.205367
3,0.289304,0.478765,0.592088,0.425479,0.083339,0.608954,0.578598,0.160940,0.008454,0.079043,...,0.496723,0.496207,0.385763,0.439849,0.438232,0.000000,0.000000,0.320331,0.602646,0.100654
4,0.369156,0.535992,0.229903,0.030855,0.439524,0.877260,0.176193,0.556677,0.752652,0.535191,...,0.507497,0.746921,0.851714,0.839427,0.255438,0.495044,0.814654,0.000000,0.791884,0.138418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.442776,0.405570,0.343751,0.167023,0.318372,0.545017,0.458943,0.095666,0.000000,0.000000,...,0.560354,0.455111,0.604727,0.487616,0.457105,0.000000,0.000000,0.476267,0.000000,0.293313
96,0.275039,0.344783,0.548590,0.436379,0.502031,0.720402,0.096400,0.637414,0.180578,0.340318,...,0.313048,0.305562,0.457356,0.389555,0.332187,0.301415,0.685028,0.000000,0.000000,0.226816
97,0.470578,0.911941,0.454986,0.660920,0.501757,0.810675,0.349007,0.719806,0.562791,0.769002,...,0.609300,0.683370,0.800504,0.775895,0.429285,0.585311,0.814245,0.027442,0.000000,0.184580
98,0.371598,0.485130,0.345332,0.324572,0.784930,0.637194,0.636103,0.646096,0.000000,0.000000,...,0.369177,0.609379,0.719711,0.620839,0.565530,0.000000,0.810176,0.000000,0.000000,0.127877
