### Imports

In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
from datasets import load_dataset 
from CodeSyntaxConcept.tokenizer import CodeTokenizer
import CodeSyntaxConcept.utils as utils
from statistics import mean, median
import json

  from .autonotebook import tqdm as notebook_tqdm


### Parameters

In [2]:
# Model checkpoint handed to the tokenizer.
checkpoint = "EleutherAI/gpt-neo-1.3B"
# Single place to repoint the notebook when the experiment outputs move;
# every input path below is derived from it.
output_dir = "/scratch1/XXXX/CodeSyntaxConcept/experimental_notebooks/output"
parent_node_types_path = f"{output_dir}/parent_node_types.csv"
child_node_types_path = f"{output_dir}/child_node_types.csv"
aggregates_path = f"{output_dir}/aggregation_function/out_astevalverticalfiltered_c2.csv"
# Programming language handed to the tokenizer.
language = "python"

In [3]:
# Instantiate the project's CodeTokenizer for the configured checkpoint and language.
# NOTE(review): `tokenizer` is not referenced again in the visible cells — confirm it is still needed.
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Parent Node Types

In [4]:
# Distinct values of the `parent_type` column, read straight into a set so the
# name is never bound to an intermediate DataFrame.
parent_node_types = set(
    pd.read_csv(parent_node_types_path, index_col=0)['parent_type']
)

### Children Node Types

In [5]:
# Distinct values of the `child_type` column, read straight into a set so the
# name is never bound to an intermediate DataFrame.
child_node_types = set(
    pd.read_csv(child_node_types_path, index_col=0)['child_type']
)

### Load Aggregates

In [6]:
# Load the aggregated per-snippet results; the first CSV column becomes the row index.
df_actual_ntp = pd.read_csv(aggregates_path, index_col=0)

In [7]:
## Convert JSON to dict
# Only values that are still strings are parsed, so re-running this cell on an
# already-converted column is a no-op instead of a TypeError from json.loads.
df_actual_ntp['binded_tree'] = df_actual_ntp['binded_tree'].map(
    lambda binded_tree: json.loads(binded_tree) if isinstance(binded_tree, str) else binded_tree
)

In [8]:
# Preview the first rows of the aggregates frame.
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss,binded_tree
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[('module', 0.33433786034584045), ('_', 0.3124...","[(' Premiership', 8.584324521837203e-14), (' p...","[(' run', 4.150300082983449e-05), ('_', 0.3124...",1.401886,"{'type': 'module', 'children': [{'type': 'func..."
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[('module', 0.33433952927589417), ('_', 0.5017...","[(' Premiership', 8.584466145745984e-14), (' E...","[(' get', 0.005219809710979462), ('_', 0.50174...",0.970864,"{'type': 'module', 'children': [{'type': 'func..."
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[('module', 0.33433714509010315), ('_', 0.8443...","[(' Premiership', 8.58435637027602e-14), (' Un...","[(' test', 0.009839195758104324), ('_', 0.8443...",0.610186,"{'type': 'module', 'children': [{'type': 'func..."
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[('module', 0.33433660864830017), ('_', 0.8443...","[(' Premiership', 8.584030431997916e-14), (' U...","[(' test', 0.009839048609137535), ('_', 0.8443...",0.38386,"{'type': 'module', 'children': [{'type': 'func..."
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[('module', 0.3343401551246643), ('_', 0.84433...","[(' Premiership', 8.58433468623257e-14), (' Un...","[(' test', 0.009838951751589775), ('_', 0.8443...",0.536392,"{'type': 'module', 'children': [{'type': 'func..."


### Snippet Concept Embeddings

In [9]:
def get_concept_embeddings(node, concepts):
    """Summarise a binded syntax tree as one value per concept.

    The tree is walked depth-first (children before their parent). For every
    node whose ``type`` is one of ``concepts``, the probabilities found in the
    node's ``bindings`` are collected, skipping entries whose probability is
    the ``'FIRST_TOKEN'`` marker. Per concept, the collected probabilities are
    resampled with ``utils.bootstrapping(..., np.mean, size=500)`` and the
    median of the resampled values is used as the embedding component.

    Parameters
    ----------
    node : dict
        Root of the tree. Each node needs a ``'type'`` key; nodes whose type
        is in ``concepts`` also need ``'bindings'``, an iterable of
        ``(token, prob)`` pairs. ``'children'`` may be missing or ``None`` on
        leaf nodes.
    concepts : sequence
        Node types to embed. If a type is listed more than once, only its
        first position receives values.

    Returns
    -------
    list
        One entry per element of ``concepts``, in the same order. A concept
        with no collected probabilities gets ``0``.
    """
    # Map each concept to its first position once, rather than scanning the
    # sequence with `in` / `.index` on every node of the tree.
    concept_positions = {}
    for position, concept in enumerate(concepts):
        concept_positions.setdefault(concept, position)

    bindings = [[] for _ in concepts]

    def get_concept_bindings(current):
        # Leaf nodes may omit 'children' (or carry null); treat both as "no children".
        for child in current.get('children') or ():
            get_concept_bindings(child)
        position = concept_positions.get(current['type'])
        if position is not None:
            bindings[position].append(
                [prob for _token, prob in current['bindings'] if prob != 'FIRST_TOKEN']
            )

    get_concept_bindings(node)

    embedding = []
    for prob_list in bindings:
        flat_prob_list = [prob for sublist in prob_list for prob in sublist]
        if flat_prob_list:
            ### BOOTSTRAPPING to calculate median in embeddings.
            # NOTE(review): no seed is passed here, so results presumably differ
            # between runs unless utils.bootstrapping seeds internally — confirm.
            resampled = utils.bootstrapping(flat_prob_list, np.mean, size=500).tolist()
            embedding.append(median(resampled))
        else:
            embedding.append(0)
    return embedding
    

In [10]:
#### MOST FREQUENT CONCEPTS - EXPLORATORY ANALYSIS
# Earlier exploratory concept set, kept for reference only. The saved table output
# further down still shows columns from this set ('attribute', 'parameters', ...),
# i.e. it was produced before `concepts` was switched to the list below.
#most_frequent_leaves = ['identifier', '.', '(', ')', ',', '=', 'string',':','[',']','integer']
#most_frequent_parents = ['attribute','expression_statement','argument_list','call','assignment','comparison_operator', 'if_statement','return_statement','for_statement', 'parameters']
#concepts = most_frequent_leaves + most_frequent_parents
concepts = ['for_statement', 'while_statement', 'return_statement', ']', ')', 'if_statement', 'comparison_operator', 'boolean_operator', 'for_in_clause', 'if_clause', 'list_comprehension', 'lambda', 'identifier', 'string']

# Compute every row first and build the frame once: appending through
# `.loc[len(...)]` re-allocates the frame on every iteration and leaves the
# columns with object dtype.
concept_embedding_rows = [
    get_concept_embeddings(binded_tree, concepts)
    for binded_tree in df_actual_ntp['binded_tree']
]
df_concept_embeddings = pd.DataFrame(concept_embedding_rows, columns=concepts)

In [None]:
# Re-key the embeddings with the aggregates' index so the column copies below
# align row-for-row.
df_concept_embeddings = df_concept_embeddings.set_index(df_actual_ntp.index)
# Attach one column per concept to the aggregates frame.
for concept_name in map(str, concepts):
    df_actual_ntp[concept_name] = df_concept_embeddings[concept_name]

In [11]:
# Show a bounded preview instead of dumping the whole frame into the notebook output.
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,...,attribute,expression_statement,argument_list,call,assignment,comparison_operator,if_statement,return_statement,for_statement,parameters
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,...,0.519795,0.546816,0.570526,0.575901,0.544558,0.2779,0.650242,0.0,0.0,0.365225
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,...,0.506149,0.501393,0.62305,0.556906,0.41463,0.644424,0.777837,0.828828,0.0,0.041225
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,...,0.679338,0.798644,0.797762,0.77405,0.51929,0.719165,0.0,0.0,0.0,0.220137
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,...,0.725916,0.864241,0.837326,0.811349,0.871812,0.69918,0.0,0.0,0.0,0.259359
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,...,0.714718,0.82854,0.803874,0.788511,0.817927,0.738143,0.0,0.0,0.0,0.396442


In [12]:
# Sanity check: there must be exactly one embedding row per snippet.
# (The previous check indexed the 'attribute' column, which is not part of the
# active `concepts` list and therefore raises KeyError on a fresh run.)
assert len(df_concept_embeddings) == len(df_actual_ntp), "embedding rows do not match snippet rows"
len(df_actual_ntp)

5