### Imports

In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
from datasets import load_dataset 
from CodeSyntaxConcept.tokenizer import CodeTokenizer
import CodeSyntaxConcept.utils as utils
from statistics import mean, median
import json

  from .autonotebook import tqdm as notebook_tqdm


### Parameters

In [2]:
# Model checkpoint identifier handed to CodeTokenizer.from_pretrained below.
checkpoint = "EleutherAI/gpt-neo-1.3B"
# Input CSV holding the per-snippet token predictions read by this notebook.
file_path = "/scratch1/XXXX/CodeSyntaxConcept/experimental_notebooks/output/out_astevalverticalfiltered_c2.csv"
# Language handed to CodeTokenizer.from_pretrained together with the checkpoint.
language = "python"
# Output CSV: same file name as the input, stored in the "aggregation_function"
# sub-folder next to it. Derived from file_path so that changing the input file
# cannot leave a stale, mismatched output name behind.
aggregates_path = os.path.join(
    os.path.dirname(file_path), "aggregation_function", os.path.basename(file_path)
)

### Tokenizer

In [3]:
# Project tokenizer wrapper built from `checkpoint` and `language`. The cells
# below call its `.tokenizer` attribute to encode/decode snippets and its
# `.parser` attribute to parse them into a syntax tree.
tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)

### Actual Token Predictions

In [4]:
# Read the prediction table (first CSV column becomes the index) and keep only
# its first five rows.
# NOTE(review): because of .head(), every later step -- including the CSV that
# is written at the end of the notebook -- covers just those rows; confirm
# this truncation is intended and not a debugging leftover.
df_actual_ntp = pd.read_csv(file_path, index_col=0).head()

In [5]:
# Preview of the loaded rows (the columns shown include 'ids', 'code' and
# 'actual_prob', which the binding cells below read).
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[('module', 0.33433786034584045), ('_', 0.3124...","[(' Premiership', 8.584324521837203e-14), (' p...","[(' run', 4.150300082983449e-05), ('_', 0.3124...",1.401886
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[('module', 0.33433952927589417), ('_', 0.5017...","[(' Premiership', 8.584466145745984e-14), (' E...","[(' get', 0.005219809710979462), ('_', 0.50174...",0.970864
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[('module', 0.33433714509010315), ('_', 0.8443...","[(' Premiership', 8.58435637027602e-14), (' Un...","[(' test', 0.009839195758104324), ('_', 0.8443...",0.610186
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[('module', 0.33433660864830017), ('_', 0.8443...","[(' Premiership', 8.584030431997916e-14), (' U...","[(' test', 0.009839048609137535), ('_', 0.8443...",0.38386
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[('module', 0.3343401551246643), ('_', 0.84433...","[(' Premiership', 8.58433468623257e-14), (' Un...","[(' test', 0.009838951751589775), ('_', 0.8443...",0.536392


### Token Binding

In [6]:
def bind_bpe_tokens(
    node,              #Tree sitter ast tree
    encoding,          #Token encoding
    actual_probs,      #Actual probabilities
    lines              #Source code Snippet
): 
    """Mirror the AST rooted at `node` as nested dicts with bound token entries.

    Every returned dict has three keys, in this order:
      'type'     -- `node.type`
      'children' -- the same structure built recursively for `node.children`
      'bindings' -- the items of `actual_probs` whose token span touches the
                    node's span

    The node span is obtained by passing `node.start_point` / `node.end_point`
    together with `lines` to `utils.convert_to_offset`. Token spans come from
    `encoding.offset_mapping`; the i-th span selects `actual_probs[i]`, so both
    must be aligned position by position.
    """
    span_start = utils.convert_to_offset(node.start_point, lines)
    span_end = utils.convert_to_offset(node.end_point, lines)

    def touches_node(token_span):
        # A token is bound when it starts inside the node, ends inside the
        # node, or completely covers the node.
        return (
            span_start <= token_span[0] < span_end
            or span_start < token_span[1] <= span_end
            or (token_span[0] <= span_start and span_end <= token_span[1])
        )

    bound_entries = [
        actual_probs[position]
        for position, token_span in enumerate(encoding.offset_mapping)
        if touches_node(token_span)
    ]
    child_trees = [
        bind_bpe_tokens(child, encoding, actual_probs, lines)
        for child in node.children
    ]

    # Key order is kept as type/children/bindings because the tree is later
    # serialised as-is.
    return {
        'type': node.type,
        'children': child_trees,
        'bindings': bound_entries,
    }
        

In [7]:
# Sanity check on the first snippet only: re-tokenizing its code must give as
# many token ids as the 'ids' list stored in the table.
# NOTE(review): eval() executes the stored cell text as Python code; if the CSV
# is not fully trusted, ast.literal_eval is the safer way to parse this list.
first_row = df_actual_ntp.iloc[0]
encoding = tokenizer.tokenizer(first_row['code'], return_offsets_mapping=True)
stored_ids = eval(first_row['ids'])
assert len(stored_ids) == len(encoding['input_ids'])

In [8]:
# For every snippet: parse it, re-tokenize it with offsets, and bind the stored
# per-token probabilities to the nodes of its syntax tree.
# NOTE(review): eval() executes the text stored in the CSV cells as Python
# code; if that file is not fully trusted, ast.literal_eval is the safer
# parser for these list literals.
binded_tree_col = []
for index, row in df_actual_ntp.iterrows():
    source_code = row['code']
    tree = tokenizer.parser.parse(source_code.encode("utf8"))
    encoding = tokenizer.tokenizer(source_code, return_offsets_mapping=True)
    actual_logits = eval(row['actual_prob'])
    # A ('<first token text>', 'FIRST_TOKEN') placeholder is put in front of
    # the stored list -- presumably because it has no entry for the first
    # token -- so that positions line up with the encoding's offsets.
    first_token_text = tokenizer.tokenizer.decode(eval(row['ids'])[0])
    actual_logits.insert(0, (first_token_text, 'FIRST_TOKEN'))
    binded_tree = bind_bpe_tokens(
        tree.root_node,
        encoding,
        actual_logits,
        source_code.split('\n'),
    )
    binded_tree_col.append(binded_tree)
df_actual_ntp['binded_tree'] = binded_tree_col

In [9]:
def process_bindings(
    node: dict,     #Binded AST tree with actual probabilities
) -> None:
    """Annotate `node` and all of its descendants, in place, with summary
    statistics of the probabilities bound to each of them.

    For every dict in the tree the keys 'median_prob', 'max_prob', 'min_prob',
    'avg_prob' and 'std' are added (in that order). They stay None when the
    node has no float-valued binding, e.g. when its only binding is the
    'FIRST_TOKEN' placeholder. Otherwise the float values are first passed
    through `utils.bootstrapping(values, np.mean, size=500)` and the
    statistics are computed over the list it returns (presumably 500
    resampled means -- confirm in `utils`).

    Nodes are visited parent first, then children left to right, which is the
    same order a recursive walk would use.
    """
    stat_keys = ('median_prob', 'max_prob', 'min_prob', 'avg_prob', 'std')
    pending = [node]
    while pending:
        current = pending.pop()
        # Only float entries count; string markers such as 'FIRST_TOKEN' are skipped.
        float_probs = [entry[1] for entry in current['bindings'] if isinstance(entry[1], float)]
        for key in stat_keys:
            current[key] = None
        if float_probs:
            resampled = utils.bootstrapping(float_probs, np.mean, size=500).tolist()
            summaries = (
                median(resampled),
                max(resampled),
                min(resampled),
                mean(resampled),
                np.std(resampled),
            )
            current.update(zip(stat_keys, summaries))
        # Reversed push so that the leftmost child is popped (visited) first.
        pending.extend(reversed(current['children']))

In [10]:
# Annotate every bound tree in place with its per-node statistics.
for binded_tree in df_actual_ntp['binded_tree']:
    process_bindings(binded_tree)

# Spot check on the first two snippets: median probability of the second child
# of the tree's first top-level node.
# NOTE(review): these values go through utils.bootstrapping; if that helper is
# not seeded, the printed numbers will differ between runs.
for row_position in (0, 1):
    first_top_level = df_actual_ntp.iloc[row_position]['binded_tree']['children'][0]
    print(first_top_level['children'][1]['median_prob'])

0.14563976032804932
0.26148256649644785


In [11]:
# Preview again: the table now carries the extra 'binded_tree' column.
df_actual_ntp.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss,binded_tree
50971,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c2,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[('module', 0.33433786034584045), ('_', 0.3124...","[(' Premiership', 8.584324521837203e-14), (' p...","[(' run', 4.150300082983449e-05), ('_', 0.3124...",1.401886,"{'type': 'module', 'children': [{'type': 'func..."
50972,176,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c2,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[('module', 0.33433952927589417), ('_', 0.5017...","[(' Premiership', 8.584466145745984e-14), (' E...","[(' get', 0.005219809710979462), ('_', 0.50174...",0.970864,"{'type': 'module', 'children': [{'type': 'func..."
50973,293,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c2,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[('module', 0.33433714509010315), ('_', 0.8443...","[(' Premiership', 8.58435637027602e-14), (' Un...","[(' test', 0.009839195758104324), ('_', 0.8443...",0.610186,"{'type': 'module', 'children': [{'type': 'func..."
50974,742,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 220...",c2,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[('module', 0.33433660864830017), ('_', 0.8443...","[(' Premiership', 8.584030431997916e-14), (' U...","[(' test', 0.009839048609137535), ('_', 0.8443...",0.38386,"{'type': 'module', 'children': [{'type': 'func..."
50975,930,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c2,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[('module', 0.3343401551246643), ('_', 0.84433...","[(' Premiership', 8.58433468623257e-14), (' Un...","[(' test', 0.009838951751589775), ('_', 0.8443...",0.536392,"{'type': 'module', 'children': [{'type': 'func..."


In [12]:
## Serialise every bound tree to a JSON string so it is stored as text.
## Re-running this cell would JSON-encode the already encoded strings again.
df_actual_ntp['binded_tree'] = df_actual_ntp['binded_tree'].map(json.dumps)

In [13]:
# Write the table, including the JSON-encoded 'binded_tree' column, to disk.
# The target folder is created first: to_csv does not create missing
# directories, and failing here would discard all the work done above.
output_dir = os.path.dirname(aggregates_path)
if output_dir:  # empty when aggregates_path is a bare file name
    os.makedirs(output_dir, exist_ok=True)
df_actual_ntp.to_csv(aggregates_path)