### Imports

In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
from datasets import load_dataset 
from CodeSyntaxConcept.tokenizer import CodeTokenizer
import CodeSyntaxConcept.utils as utils
from statistics import mean, median
import json

  from .autonotebook import tqdm as notebook_tqdm


### Parameters

In [2]:
# Checkpoint identifier: passed to CodeTokenizer.from_pretrained and recorded in
# the 'model' column of the global results table.
checkpoint = "EleutherAI/gpt-neo-1.3B"

# Directory holding the CSV artifacts read by this notebook. The default is the
# original absolute location; set CODESYNTAXCONCEPT_OUTPUT_DIR to point the
# notebook at another copy of the data without editing this cell.
output_dir = os.environ.get(
    "CODESYNTAXCONCEPT_OUTPUT_DIR",
    "/scratch1/svelascodimate/CodeSyntaxConcept/experimental_notebooks/output",
)
parent_node_types_path = os.path.join(output_dir, "parent_node_types.csv")
child_node_types_path = os.path.join(output_dir, "child_node_types.csv")
aggregates_path = os.path.join(
    output_dir, "aggregation_function", "out_astevalverticalfiltered_c2.csv"
)

# Language name passed to the tokenizer.
language = "python"

In [3]:
# Build the project tokenizer for `checkpoint` and `language`; its `node_types`
# list is what the analysis functions below use to index per-node-type buckets.
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Parent Node Types

In [4]:
# Distinct values of the 'parent_type' column, kept as a set; it is used later
# for membership tests when labelling AST elements as 'parent' or 'leaf'.
parent_node_types = set(
    pd.read_csv(parent_node_types_path, index_col=0)['parent_type']
)

### Child Node Types

In [5]:
# Distinct values of the 'child_type' column, kept as a set.
child_node_types = set(
    pd.read_csv(child_node_types_path, index_col=0)['child_type']
)

### Load Aggregates

In [6]:
# Load the aggregated results table; the first CSV column becomes the index.
df_actual_ntp = pd.read_csv(aggregates_path, index_col=0)

In [7]:
# Convert the JSON-encoded 'binded_tree' strings to dicts.
# Values that are already dicts are passed through unchanged so that re-running
# this cell is harmless (json.loads raises TypeError when handed a dict).
df_actual_ntp['binded_tree'] = df_actual_ntp['binded_tree'].map(
    lambda binded_tree: binded_tree if isinstance(binded_tree, dict) else json.loads(binded_tree)
)

In [8]:
# Preview the first rows of the loaded table (including the parsed 'binded_tree' column).
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss,binded_tree
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[('module', 0.33433786034584045), ('_', 0.3124...","[(' Premiership', 8.584324521837203e-14), (' p...","[(' run', 4.150300082983449e-05), ('_', 0.3124...",1.401886,"{'type': 'module', 'children': [{'type': 'func..."
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[('module', 0.33433952927589417), ('_', 0.5017...","[(' Premiership', 8.584466145745984e-14), (' E...","[(' get', 0.005219809710979462), ('_', 0.50174...",0.970864,"{'type': 'module', 'children': [{'type': 'func..."
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[('module', 0.33433714509010315), ('_', 0.8443...","[(' Premiership', 8.58435637027602e-14), (' Un...","[(' test', 0.009839195758104324), ('_', 0.8443...",0.610186,"{'type': 'module', 'children': [{'type': 'func..."
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[('module', 0.33433660864830017), ('_', 0.8443...","[(' Premiership', 8.584030431997916e-14), (' U...","[(' test', 0.009839048609137535), ('_', 0.8443...",0.38386,"{'type': 'module', 'children': [{'type': 'func..."
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[('module', 0.3343401551246643), ('_', 0.84433...","[(' Premiership', 8.58433468623257e-14), (' Un...","[(' test', 0.009838951751589775), ('_', 0.8443...",0.536392,"{'type': 'module', 'children': [{'type': 'func..."


### Local Analysis (Snippets)

In [9]:
def traverse_tree_and_collect_stds(node: dict, node_types_list: list, std_field: str):
    """Collect the `std_field` value of every node in a tree, bucketed by node type.

    Nodes are visited in pre-order (a node before its children, children in
    list order). For each node whose `std_field` value is not None, the value
    is appended to the bucket located at the position of the node's type in
    `tokenizer.node_types`.

    Args:
        node: Root of the (sub)tree. Every node is a dict with the keys
            'type', 'children' (a list of child nodes) and `std_field`.
        node_types_list: One bucket (list) per entry of `tokenizer.node_types`.
            Buckets are appended to in place, so each bucket must be a
            distinct list object (e.g. built with `[[] for _ in ...]`).
        std_field: Key of the per-node value to collect.

    Returns:
        None. Results are accumulated in `node_types_list`.

    Raises:
        KeyError: If a node lacks `std_field`, 'type' or 'children'.
        ValueError: If a node's type is not present in `tokenizer.node_types`.
    """
    # An explicit stack replaces recursion so that deep trees cannot exceed
    # the interpreter's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        value = current[std_field]
        if value is not None:
            # Single index lookup and an in-place append, instead of looking
            # the bucket up twice and rebuilding it with `bucket + [value]`.
            node_types_list[tokenizer.node_types.index(current['type'])].append(value)
        # Push children reversed so they are popped (visited) in list order.
        pending.extend(reversed(current['children']))

In [10]:
def add_statistic_column(std_field, dataframe):
    """Add a 'concept_<std_field>' column with per-snippet values grouped by node type.

    For every tree in `dataframe['binded_tree']`, the `std_field` values of its
    nodes are collected with `traverse_tree_and_collect_stds`. Each row of the
    new column is a list of `(node_type, values)` tuples, ordered as in
    `tokenizer.node_types` and limited to node types with at least one value.

    Args:
        std_field: Key of the per-node value to collect (e.g. 'median_prob').
        dataframe: Table with a 'binded_tree' column of tree dicts; it is
            modified in place.

    Returns:
        None.
    """
    def grouped_values(tree):
        # Fresh, independent buckets for each snippet.
        buckets = [[] for _ in tokenizer.node_types]
        traverse_tree_and_collect_stds(tree, buckets, std_field)
        return [
            (node_type, values)
            for node_type, values in zip(tokenizer.node_types, buckets)
            if values
        ]

    dataframe['concept_' + std_field] = [
        grouped_values(tree) for tree in dataframe['binded_tree']
    ]

In [11]:
# Attach one 'concept_<statistic>' column per statistic, then preview the result.
for statistic in ('median_prob', 'min_prob', 'max_prob'):
    add_statistic_column(statistic, df_actual_ntp)
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss,binded_tree,concept_median_prob,concept_min_prob,concept_max_prob
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[('module', 0.33433786034584045), ('_', 0.3124...","[(' Premiership', 8.584324521837203e-14), (' p...","[(' run', 4.150300082983449e-05), ('_', 0.3124...",1.401886,"{'type': 'module', 'children': [{'type': 'func...","[(string, [0.18092918285401538, 0.451572136904...","[(string, [0.00843026242884142, 0.189425867884...","[(string, [0.6264417905892644, 0.7172050918452..."
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[('module', 0.33433952927589417), ('_', 0.5017...","[(' Premiership', 8.584466145745984e-14), (' E...","[(' get', 0.005219809710979462), ('_', 0.50174...",0.970864,"{'type': 'module', 'children': [{'type': 'func...","[(string, [0.20834349759388715, 0.416128066368...","[(string, [0.012166319725414118, 0.06110522802...","[(string, [0.7232807005445162, 0.9284585913022..."
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[('module', 0.33433714509010315), ('_', 0.8443...","[(' Premiership', 8.58435637027602e-14), (' Un...","[(' test', 0.009839195758104324), ('_', 0.8443...",0.610186,"{'type': 'module', 'children': [{'type': 'func...","[(string, [0.4690643052260081, 0.8965929349263...","[(string, [0.23860621452331543, 0.795861423015...","[(string, [0.8675042390823364, 0.9781620502471..."
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[('module', 0.33433660864830017), ('_', 0.8443...","[(' Premiership', 8.584030431997916e-14), (' U...","[(' test', 0.009839048609137535), ('_', 0.8443...",0.38386,"{'type': 'module', 'children': [{'type': 'func...","[(string, [0.2514445201183359, 0.4976739316371...","[(string, [0.00943030882626772, 4.256641477695...","[(string, [0.619777500629425, 0.96614074707031..."
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[('module', 0.3343401551246643), ('_', 0.84433...","[(' Premiership', 8.58433468623257e-14), (' Un...","[(' test', 0.009838951751589775), ('_', 0.8443...",0.536392,"{'type': 'module', 'children': [{'type': 'func...","[(from, [0.01963830552995205, 0.26718652248382...","[(from, [0.01963830552995205, 0.26718652248382...","[(from, [0.01963830552995205, 0.26718652248382..."


### Global Analysis (AST Elements)

In [12]:
def collect_global_std(std_field, dataframe):
    """Pool the `std_field` values of all trees in the table, grouped by node type.

    Args:
        std_field: Key of the per-node value to collect (e.g. 'median_prob').
        dataframe: Table with a 'binded_tree' column of tree dicts.

    Returns:
        A list with one bucket per entry of `tokenizer.node_types` (same
        order); each bucket holds the values collected across every tree.
    """
    # One shared set of buckets, filled by every tree in turn.
    buckets = [[] for _ in tokenizer.node_types]
    for snippet_tree in dataframe['binded_tree']:
        traverse_tree_and_collect_stds(snippet_tree, buckets, std_field)
    return buckets

In [13]:
# Pool each statistic over every snippet: one bucket per node type, in the
# order of tokenizer.node_types.
(concept_median_prob_list,
 concept_min_prob_list,
 concept_max_prob_list) = [
    collect_global_std(statistic, df_actual_ntp)
    for statistic in ('median_prob', 'min_prob', 'max_prob')
]

In [14]:
# Build one row per AST element that has at least one collected median value.
# The rows are gathered first and the DataFrame is constructed once, which
# avoids growing the frame one row at a time with .loc[len(df)].
global_concept_columns = ['ast_element', 'node_type', 'concept_median_prob', 'concept_min_prob', 'concept_max_prob']
global_concept_rows = [
    {
        'ast_element': ast_element,
        # Elements listed in parent_node_types are labelled 'parent'; all others 'leaf'.
        'node_type': 'parent' if ast_element in parent_node_types else 'leaf',
        'concept_median_prob': median_values,
        'concept_min_prob': min_values,
        'concept_max_prob': max_values,
    }
    for ast_element, median_values, min_values, max_values in zip(
        tokenizer.node_types,
        concept_median_prob_list,
        concept_min_prob_list,
        concept_max_prob_list,
    )
    if len(median_values) > 0
]
# Passing `columns` fixes the column order and keeps the columns present even
# when no element qualifies.
global_concept_dataframe = pd.DataFrame(global_concept_rows, columns=global_concept_columns)

In [15]:
# Record the checkpoint identifier in a 'model' column (same value on every
# row), then display the table.
global_concept_dataframe['model'] = checkpoint
global_concept_dataframe

Unnamed: 0,ast_element,node_type,concept_median_prob,concept_min_prob,concept_max_prob,model
0,from,leaf,"[0.01963830552995205, 0.2671865224838257]","[0.01963830552995205, 0.2671865224838257]","[0.01963830552995205, 0.2671865224838257]",EleutherAI/gpt-neo-1.3B
1,string,parent,"[0.18092918285401538, 0.4515721369048151, 0.39...","[0.00843026242884142, 0.1894258678848928, 0.10...","[0.6264417905892644, 0.7172050918452442, 0.744...",EleutherAI/gpt-neo-1.3B
2,default_parameter,parent,[0.46895369962051825],[0.001325523965836813],[0.9738041957219442],EleutherAI/gpt-neo-1.3B
3,return,leaf,[0.3585938811302185],[0.3585938811302185],[0.3585938811302185],EleutherAI/gpt-neo-1.3B
4,"""",leaf,"[0.09370232373476028, 0.09742529690265656, 0.2...","[0.09370232373476028, 0.09742529690265656, 0.2...","[0.09370232373476028, 0.09742529690265656, 0.2...",EleutherAI/gpt-neo-1.3B
5,integer,leaf,"[0.49849268794059753, 0.7268467545509338, 0.97...","[0.49849268794059753, 0.7268467545509338, 0.97...","[0.49849268794059753, 0.7268467545509338, 0.97...",EleutherAI/gpt-neo-1.3B
6,not_operator,parent,[0.4261011965572834],[0.08066985756158829],[0.7715325355529785],EleutherAI/gpt-neo-1.3B
7,assignment,parent,"[0.5641642529517412, 0.3715283669929098, 0.766...","[0.13901309482753277, 0.11126363949733786, 0.3...","[0.8793553709983826, 0.6793163788194457, 0.953...",EleutherAI/gpt-neo-1.3B
8,:,leaf,"[0.5750448703765869, 0.9328832030296326, 0.999...","[0.5750448703765869, 0.9328832030296326, 0.999...","[0.5750448703765869, 0.9328832030296326, 0.999...",EleutherAI/gpt-neo-1.3B
9,argument_list,parent,"[0.3742155928864478, 0.5522209473631599, 0.661...","[0.17043522462431018, 0.2267781330103224, 0.41...","[0.620731327819395, 0.8474852605299517, 0.8754...",EleutherAI/gpt-neo-1.3B
