# Statistical Module (Bootstrapping)

>
> Aggregation Module with Bootstrapping Algorithms.
>

In [None]:
#| default_exp statistics

In [None]:
#| export
import pandas as pd
import os
import time
import numpy as np
import torch
import gc

In [None]:
#| export

import logging
logging.basicConfig(
    filename="logger_data_engineering.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [None]:
#| export

import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

# Data Engineering

In [None]:
#| export

def c_eleuther( returnModel = False, model_type =  'EleutherAI/gpt-neo-125m'):
    '''Load the tokenizer (and optionally the causal LM) for `model_type`.

    EleutherAI, Salesforce and CodeParrot checkpoints all go through the same
    `AutoTokenizer` / `AutoModelForCausalLM` entry points.

    Returns a `(tokenizer, model)` tuple; `model` is an empty list when
    `returnModel` is False.
    '''
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(model_type)
    logging.info("Tokenizer Loaded")

    # The model is only instantiated on request; otherwise an empty placeholder is returned.
    model = []
    if returnModel:
        model = AutoModelForCausalLM.from_pretrained(model_type)
        logging.info("Model Loaded")

    logging.info(model_type)
    return tokenizer, model
    
    

def init_model_args( current_case = 'c1', returnModel = False ):
    """Resolve a case id (e.g. 'c17') to its checkpoint name and load it.

    Returns `(model_type, tokenizer, model)`, where `tokenizer` and `model`
    are whatever `c_eleuther` returns for that checkpoint.
    Raises KeyError when `current_case` is not a known case id.
    """
    checkpoints = {
        # Basic (on Pile) GPT-3/J
        'c1': 'EleutherAI/gpt-neo-125m',
        'c2': 'EleutherAI/gpt-neo-1.3B',
        'c3': 'EleutherAI/gpt-neo-2.7B',
        'c4': 'EleutherAI/gpt-j-6b',
        # Basic (on Pile) codegen
        'c5': 'Salesforce/codegen-350M-nl',
        'c6': 'Salesforce/codegen-2B-nl',
        'c7': 'Salesforce/codegen-6B-nl',
        'c8': 'Salesforce/codegen-16B-nl',
        # multi-Language
        'c9': 'codeparrot/codeparrot-small-multi',
        'c10': 'Salesforce/codegen-350M-multi',
        'c11': 'Salesforce/codegen-2B-multi',
        'c12': 'Salesforce/codegen-6B-multi',
        'c13': 'Salesforce/codegen-16B-multi',
        # mono-Language
        'c14': 'codeparrot/codeparrot-small',
        'c15': 'codeparrot/codeparrot',
        'c16': 'Salesforce/codegen-350M-mono',
        'c17': 'Salesforce/codegen-2B-mono',
        'c18': 'Salesforce/codegen-6B-mono',
        'c19': 'Salesforce/codegen-16B-mono',
    }

    model_type = checkpoints[current_case]
    loaded_tokenizer, loaded_model = c_eleuther( returnModel = returnModel, model_type = model_type )
    return model_type, loaded_tokenizer, loaded_model

## Init Parameters

In [None]:
#| hide
#| eval: false
# [WARNING] Hyperparameters change between runs; tune them before executing
CODEMODEL =  'c17'
params = {
    'codemodel' : CODEMODEL,
    'numpy_files_logits_path': f'../datax/np_files_logits/{CODEMODEL}',
    'testbeds_path' : '../datax/testbeds/AstEvalVerticalFiltered.json',
    'outputs' : f'../data/ds_raw_logits/out_astevalverticalfiltered_{CODEMODEL}.csv'
}

In [None]:
#| hide
#| eval: false
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c17.csv'

In [None]:
#| hide
#| eval: false
#Uploading Model Under Analysis
name, tokenizer, model = init_model_args(
    current_case = params['codemodel'], 
    returnModel = False #[WARNING!] Check the parameters before calling it. 
    )

In [None]:
#| hide
#| eval: false
#Testing data loads
data_pd = pd.read_json( params['testbeds_path'] )
data_pd.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
0,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c1,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


# Softmax Normalization and Data Engineering

In [None]:
#| hide
#| eval: false
filtered_prompts_ids = data_pd[data_pd['m_name']==params['codemodel']]['ids'].values

In [None]:
#| hide
#| eval: false
tf_input_ids = [torch.tensor(  input_ids, dtype = torch.int) for input_ids in filtered_prompts_ids ]

### Testing Tokenizer

In [None]:
#| hide
#| eval: false
#Testing Sample to validate minmax functions
sample_id = 0
data_pd[data_pd['m_name']==params['codemodel']].iloc[sample_id]

size                                                            225
ids               [4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...
m_name                                                          c17
code              def run_python_tests(test_modules, parallelism...
ast_errors                                                       []
n_ast_errors                                                      0
ast_levels                                                       13
n_whitespaces_                                                   67
complexity                                                        7
nloc                                                             15
token_counts                                                    120
n_ast_nodes                                                     154
Name: 815536, dtype: object

In [None]:
#| hide
#| eval: false
tf_input_ids[sample_id]

tensor([ 4299,  1057,    62, 29412,    62, 41989,     7,  9288,    62, 18170,
           11, 10730,  1042,    11,   351,    62,  1073,  1857,    28, 25101,
         2599,   198, 50284,  2617,    62,  7839,    62,   392,    62,  9967,
         7203, 28768,  9485,  4561,   668,  5254,  1600,   366,  9148, 11290,
           62,    47,    56,  4303, 14175,    62,  4944,  2043,    62,    51,
         1546,  4694,  4943,   628, 50284,   361,   351,    62,  1073,  1857,
           25,   198, 50280,     2, 33998,  1838,   262,  9485,  4561,   668,
         5254,   781, 15492,  2233,   284,  4334, 10730,  1042,    13,   198,
        50280,     2,  1649,   356,  1057,  9485,  4561,   668,  5254,   351,
         5197,    11,   340,  3544,   604,   329,   783,   355,   198, 50280,
            2, 46513,    13,   198, 50280,  1845, 29363,  1042,   796,   604,
          198, 50280, 12048,   796,   366,  5143,    12, 41989,    12,  4480,
           12,  1073,  1857,     1,   198, 50284, 17772,    25, 

In [None]:
#| hide
#| eval: false
# Loading Logits From Files
out= np.load( params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy' )
soft = torch.nn.Softmax( dim = 0 ) #Flattening normalization
first_token_distribution = soft( torch.from_numpy(out[0]) ) #Flattening normalization for first token
# Sanity check: a softmax output must sum to 1. `round(...) == 1.0` would accept any
# total in [0.5, 1.5), so compare against an explicit tolerance instead.
first_token_total = first_token_distribution.sum().item()
assert abs( first_token_total - 1.0 ) < 1e-4, f"softmax output sums to {first_token_total}, expected 1.0"


In [None]:
#| hide
#| eval: false
first_token_distribution[0]

tensor(1.5081e-05)

In [None]:
#| hide
#| eval: false
tokenizer.decode([7881])

' register'

In [None]:
#| hide
#| eval: false
tokenizer.vocab.keys()



In [None]:
#| hide
#| eval: false
tokenizer.get_vocab()

{'ĠPubMed': 32131,
 'elvet': 32667,
 'erential': 33369,
 '41': 3901,
 'scroll': 48728,
 'Ġacquiring': 22488,
 'XT': 25010,
 'Laughs': 34610,
 'Ġelectroly': 39450,
 'Ġsaints': 31728,
 'ĠLaw': 3854,
 'steps': 20214,
 '1996': 22288,
 'Ġheck': 22574,
 'ĠDot': 22875,
 'ĠMate': 24787,
 'ĠToken': 29130,
 'Pri': 34487,
 'grounds': 40520,
 'ĠTrader': 41956,
 '426': 42780,
 'Ġbiologically': 44479,
 'Ġpens': 29707,
 'ube': 3266,
 'onnaissance': 31539,
 'ĠFixed': 10832,
 'wage': 21482,
 'liber': 33203,
 'ĠPBS': 30051,
 'Ġspheres': 34126,
 'Vo': 42144,
 'ĠAnd': 843,
 'ĠDodgers': 23576,
 'Ġworkouts': 27197,
 'ĠNish': 48438,
 'Ġheats': 37876,
 'ĠLAT': 42355,
 'ĠHO': 40115,
 'Ġmor': 2146,
 'ĠTrailer': 36923,
 'Ġconce': 8571,
 'ands': 1746,
 'Only': 10049,
 'Ġmil': 1465,
 'ichick': 38448,
 'Ġworkflow': 30798,
 'Ġdefenders': 16355,
 'ĠARE': 15986,
 'retch': 22592,
 'ĠBrass': 32309,
 'Ġindu': 9318,
 'ĠBrist': 23072,
 'ĠLDL': 37654,
 'ichen': 41437,
 'ĠReconstruction': 45060,
 'ĠMET': 31243,
 'arkable': 4

## Logit Uploading and Flattening

In [None]:
#| hide
#| eval: false
first_token_distribution.topk(k=1, largest =True) #Returns the most probable token for that position

torch.return_types.topk(
values=tensor([0.1579]),
indices=tensor([1332]))

In [None]:
#| export

def topk_tuple( logit_vocab_tensor, largest, tokenizer_fn):
    "Return the `(decoded token, value)` pair of the single largest (or smallest) entry of a vocabulary vector"
    # Only one element is requested for now. TODO K number of elements can be extended
    extreme_values, extreme_indices = logit_vocab_tensor.topk( k=1, largest=largest )
    decoded_token = tokenizer_fn.decode( extreme_indices )
    return ( decoded_token, extreme_values.item() )

def min_max_logits( logit_vocab_sample_tensor, tokenizer_fn ):
    "For every position of a sample, collect the top-1 and bottom-1 `(token, value)` pairs"
    # One pass over the sample: each position yields its (max, min) pair, max first.
    paired_cases = [
        (
            topk_tuple( logit_vocab_tensor = position_vector, largest = True, tokenizer_fn = tokenizer_fn ),  #TST Max Logit
            topk_tuple( logit_vocab_tensor = position_vector, largest = False, tokenizer_fn = tokenizer_fn ), #TST Min Logit
        )
        for position_vector in logit_vocab_sample_tensor
    ]
    max_cases = [ highest for highest, _ in paired_cases ]
    min_cases = [ lowest for _, lowest in paired_cases ]
    return max_cases, min_cases

def actual_logit( 
                 logit_vocab_sample_tensor, 
                 tokenized_prompt, 
                 tokenizer_fn,
                 ):
    "Return, for each prompt token after the first, its decoded text and the value stored for it in the sample"
    # The vector at position i is read at the id of prompt token i+1, so the
    # first prompt token is skipped (no value is looked up for it).
    following_token_ids = tokenized_prompt[1:]
    return [
        (
            tokenizer_fn.decode( int(token_id) ),                           # token text for this id
            logit_vocab_sample_tensor[position][int(token_id)].item()       # value at (sequence position, vocab id)
        )
        for position, token_id in enumerate( following_token_ids )
    ]

In [None]:
#| hide
#| eval: false
max_case,min_case = min_max_logits(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out[:4]],
    tokenizer_fn= tokenizer
    ) #Sentence Level minmax (reduced to 3 tokens)
print(max_case)
print(min_case)

[(' test', 0.15791159868240356), ('_', 0.410830557346344), ('test', 0.05332261323928833), ('_', 0.609906017780304)]
[('ヘラ', 4.97558140521992e-11), ('��', 7.214983938281833e-22), (' DevOnline', 2.6005836363079656e-15), ('��', 2.25670669779587e-23)]


In [None]:
#| hide
#| eval: false
sample_id

0

In [None]:
#| hide
#| eval: false
actual_cases = actual_logit(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out] , #Out is a complete sequence
    tokenized_prompt = tf_input_ids[ sample_id ],
    tokenizer_fn = tokenizer
    )

In [None]:
#| hide
#| eval: false
actual_cases

[(' run', 0.012154975906014442),
 ('_', 0.410830557346344),
 ('python', 0.0014204250182956457),
 ('_', 0.609906017780304),
 ('tests', 0.06661944091320038),
 ('(', 0.7821821570396423),
 ('test', 0.05372804030776024),
 ('_', 0.6518159508705139),
 ('modules', 0.00809108093380928),
 (',', 0.5366635918617249),
 (' parallel', 0.008699917234480381),
 ('ism', 0.023772072046995163),
 (',', 0.3085222840309143),
 (' with', 0.0006042752647772431),
 ('_', 0.9571990370750427),
 ('co', 0.17824891209602356),
 ('verage', 0.9994751811027527),
 ('=', 0.20326749980449677),
 ('False', 0.8346660137176514),
 ('):', 0.6203358173370361),
 ('\n', 0.9075515866279602),
 ('    ', 0.9226301908493042),
 ('set', 0.00017469270096626133),
 ('_', 0.901804506778717),
 ('title', 0.0003461785090621561),
 ('_', 0.1248159408569336),
 ('and', 0.3271429240703583),
 ('_', 0.9998694658279419),
 ('block', 5.6533168390160426e-05),
 ('("', 0.1712595373392105),
 ('Running', 0.7192416191101074),
 (' Py', 0.02002798579633236),
 ('Sp',

# Logits Execution

In [None]:
#| hide
#| eval: false
len(filtered_prompts_ids)
#params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy'

50971

In [None]:
#| export

def batching_logits( size, logits_path = None, model_id = None, tokenizer_fn = None, input_ids = None ):
    """Compute max / min / actual token probabilities for the first `size` stored samples.

    For every sample index in `range(size)` the file
    `logits_tensor[i]_batch[i]_model[<model_id>].npy` is loaded from `logits_path`,
    each row is normalized with a softmax, and the helpers `min_max_logits`
    and `actual_logit` are applied.

    Parameters
    ----------
    size : number of samples (files) to process.
    logits_path : directory holding the `.npy` files; defaults to
        `params['numpy_files_logits_path']`.
    model_id : case id used in the file names; defaults to `CODEMODEL`.
    tokenizer_fn : tokenizer used to decode token ids; defaults to the
        module-level `tokenizer`.
    input_ids : per-sample tokenized prompts; defaults to the module-level
        `tf_input_ids`.

    The optional arguments exist because the defaults are notebook-level
    variables that are not part of the exported module.

    Returns
    -------
    (max_logit_token_prompt, min_logit_token_prompt, actual_logit_token_prompt),
    three lists with one entry per processed sample.
    """
    # Fall back to the notebook-level configuration when nothing explicit is given.
    if logits_path is None:
        logits_path = params['numpy_files_logits_path']
    if model_id is None:
        model_id = CODEMODEL
    if tokenizer_fn is None:
        tokenizer_fn = tokenizer
    if input_ids is None:
        input_ids = tf_input_ids

    max_logit_token_prompt = []
    min_logit_token_prompt = []
    actual_logit_token_prompt = []
    
    soft = torch.nn.Softmax( dim = 0 )                          #Flattening normalization
    
    for file in range( size ):
        out = np.load( f'{logits_path}/logits_tensor[{file}]_batch[{file}]_model[{model_id}].npy' )
        next_tokens_distribution = [ soft( torch.from_numpy(token) ) for token in out]  #Flattening normalization
        
        max_cases,min_cases = min_max_logits(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenizer_fn = tokenizer_fn
            )

        actual_cases = actual_logit(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenized_prompt = input_ids[ file ],
            tokenizer_fn = tokenizer_fn
            )
        
        max_logit_token_prompt.append( max_cases )
        min_logit_token_prompt.append( min_cases )
        actual_logit_token_prompt.append( actual_cases )
        
        logging.info(file)  # progress marker: index of the sample just processed
    return max_logit_token_prompt,min_logit_token_prompt,actual_logit_token_prompt

In [None]:
#| hide
#| eval: false
#max_logit_token_prompt,min_logit_token_prompt,actual_logit_token_prompt = batching_logits( size = 2 )
max_logit_token_prompt, min_logit_token_prompt, actual_logit_token_prompt = batching_logits(
    size = len(filtered_prompts_ids)
    ) #<---WARNING TIME Consuming

In [None]:
#| hide
#| eval: false
dataframe_to_save = data_pd[data_pd['m_name']==params['codemodel']].copy()


In [None]:
#| hide
#| eval: false
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
815536,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c17,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


In [None]:
#| hide
#| eval: false
dataframe_to_save['max_prob'] = max_logit_token_prompt
dataframe_to_save['min_prob'] = min_logit_token_prompt
dataframe_to_save['actual_prob'] = actual_logit_token_prompt

In [None]:
#| hide
#| eval: false
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
815536,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c17,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[( test, 0.15791159868240356), (_, 0.410830557...","[(ヘラ, 4.97558140521992e-11), (��, 7.2149839382...","[( run, 0.012154975906014442), (_, 0.410830557..."


In [None]:
#| hide
#| eval: false
dataframe_to_save.shape

(50971, 15)

In [None]:
#| hide
#| eval: false
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c17.csv'

In [None]:
#| hide
#| eval: false
## Saving CheckPoint
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
#| hide
#| eval: false
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
#| hide
#| eval: false
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
815536,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c17,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.15791159868240356), ('_', 0.41083...","[('ヘラ', 4.97558140521992e-11), ('��', 7.214983...","[(' run', 0.012154975906014442), ('_', 0.41083..."


In [None]:
#| hide
#| eval: false
dataframe_to_save.shape

(50971, 15)

## Loss Retrieval

In [None]:
#| export

def batching_loss( size, logits_path = None, model_id = None ):
    """Read the stored loss of the first `size` batches.

    For every batch index in `range(size)` the file
    `loss_batch[i]_model[<model_id>].npy` is loaded from `logits_path` and its
    single value is appended to the result.

    Parameters
    ----------
    size : number of batches (files) to read.
    logits_path : directory holding the `.npy` files; defaults to
        `params['numpy_files_logits_path']`.
    model_id : case id used in the file names; defaults to `CODEMODEL`.

    The optional arguments exist because the defaults are notebook-level
    variables that are not part of the exported module.

    Returns
    -------
    list with one Python scalar per batch, in batch order.
    """
    # Fall back to the notebook-level configuration when nothing explicit is given.
    if logits_path is None:
        logits_path = params['numpy_files_logits_path']
    if model_id is None:
        model_id = CODEMODEL

    output_loss = []
    for current_batch in range(size):
        out = np.load( f'{logits_path}/loss_batch[{current_batch}]_model[{model_id}].npy' )
        output_loss.append( out.item() ) #.item() for numpy library
        logging.info(current_batch)  # progress marker: index of the batch just read
    return output_loss

In [None]:
#| hide
#| eval: false
output_loss = batching_loss(size = dataframe_to_save.shape[0] ) #[WAENING!] Takes Time

In [None]:
#| hide
#| eval: false
dataframe_to_save['loss'] = output_loss
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
815536,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c17,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.15791159868240356), ('_', 0.41083...","[('ヘラ', 4.97558140521992e-11), ('��', 7.214983...","[(' run', 0.012154975906014442), ('_', 0.41083...",1.509103


In [None]:
#| hide
#| eval: false
## Saving CheckPoint 2
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
#| hide
#| eval: false
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
#| hide
#| eval: false
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
815536,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c17,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.15791159868240356), ('_', 0.41083...","[('ヘラ', 4.97558140521992e-11), ('��', 7.214983...","[(' run', 0.012154975906014442), ('_', 0.41083...",1.509103


# Bootstrapping Analysis

## Combining all datasets

In [None]:
#| export

models = ['c1','c2','c3','c5','c6','c9','c10','c11','c14','c15','c16','c17']

In [None]:
#| export

new_columns = {
    'c1':('gpt-3','125M'),
    'c2':('gpt-3','1.3B'),
    'c3':('gpt-3','2.7B'),
    'c5':('codegen-nl','350M'),
    'c6':('codegen-nl','2B'),
    'c9':('multi-lang','110M'),
    'c10':('multi-lang','350M'),
    'c11':('multi-lang','2B'),
    'c14':('mono-lang','110M'),
    'c15':('mono-lang','1.5B'),
    'c16':('mono-lang','350M'),
    'c17':('mono-lang','2B')
}

In [None]:
#| hide
#| eval: false
pd_combined_models = pd.read_csv( '/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_c1.csv'  , index_col=0)

In [None]:
#| hide
#| eval: false
pd_combined_models.shape

(174, 6)

In [None]:
#| hide
#| eval: false
new_columns['c1']

('gpt-3', '125M')

In [None]:
#| hide
#| eval: false
pd_combined_models['id'] = new_columns['c1'][0] + " [" + new_columns['c1'][1] + "]"
pd_combined_models['type_model'] = new_columns['c1'][0]
pd_combined_models['size_model'] = new_columns['c1'][1]

In [None]:
#| hide
#| eval: false
pd_combined_models.head(1)

Unnamed: 0,ast_element,node_type,concept_median_prob,concept_min_prob,concept_max_prob,model,id,type_model,size_model
0,with_clause,parent,"[0.3349215416126299, 0.48713837044152364, 0.98...","[0.11321202570206879, 0.2244944909784513, 0.96...","[0.6549162715673447, 0.6862596067098471, 0.996...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M


In [None]:
#| hide
#| eval: false
# Load the table of every remaining model, tag it with its model metadata, and
# concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration.
frames_global = [ pd_combined_models ]  # starts with the already-tagged first model
for m in models[1:]:
    pd_model = pd.read_csv( f'/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_{m}.csv'  , index_col=0)
    
    pd_model['id'] = new_columns[m][0] + " [" + new_columns[m][1] + "]"
    pd_model['type_model'] = new_columns[m][0]
    pd_model['size_model'] = new_columns[m][1]
    
    frames_global.append( pd_model )
    print(f"completed-{m}")

pd_combined_models = pd.concat( frames_global, ignore_index=True ) #vertical concatenation

completed-c2
completed-c3
completed-c5
completed-c6
completed-c9
completed-c10
completed-c11
completed-c14
completed-c15
completed-c16
completed-c17


In [None]:
#| hide
#| eval: false
set(pd_combined_models.model.values)

{'EleutherAI/gpt-neo-1.3B',
 'EleutherAI/gpt-neo-125m',
 'EleutherAI/gpt-neo-2.7B',
 'Salesforce/codegen-2B-mono',
 'Salesforce/codegen-2B-multi',
 'Salesforce/codegen-2B-nl',
 'Salesforce/codegen-350M-mono',
 'Salesforce/codegen-350M-multi',
 'Salesforce/codegen-350M-nl',
 'codeparrot/codeparrot',
 'codeparrot/codeparrot-small',
 'codeparrot/codeparrot-small-multi'}

In [None]:
#| hide
#| eval: false
pd_combined_models.to_csv('/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_global.csv')  

In [None]:
#| hide
#| eval: false
pd_combined_models = pd.read_csv( '/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_global.csv'  , index_col=0)

In [None]:
#| hide
#| eval: false
pd_combined_models.shape

(2088, 9)

## Median Bootstrapping

In [None]:
#| hide
#| eval: false
filtered_concepts_local = [
    'for_statement', #Iterative
    'while_statement', #Iterative
    'return_statement', #Ending
    ']', #Ending
    ')', #Ending
    'if_statement', #Decision
    'comparison_operator', #Boolean
    'boolean_operator', #Boolean
    'for_in_clause', #Funct
    'if_clause', #Funct
    'list_comprehension', #Funct
    'lambda',#Funct
    'identifier', #NL
    'string', #NL
]

In [None]:
#| hide
#| eval: false
filtered_concepts_global = [
    'for_statement', #Iterative
    'while_statement', #Iterative
    'return_statement', #Ending
    ']', #Ending
    ')', #Ending
    '}', #Ending
    ':', #Ending
    'if_statement', #Decision
    'elif',  #Decision
    'else',  #Decision
    'comparison_operator', #Operator
    'boolean_operator', #Operator
    'binary_operator', #Operator
    'unary_operator', #Operator
    'for_in_clause', #Funct
    'if_clause', #Funct
    'list_comprehension', #Funct
    'lambda',#Funct
    'identifier', #NL
    'string', #NL
    'comment', #NL
    'tuple', #Data
    'dictionary', #Data
    'list', #Data
    'set', #Data
    'try_statement', #Exceptions
    'except', #Exceptions
    'raise', #Exceptions
    'finally', #Exceptions
    'assert_statement', #Testing
    'integer', #types
    'float'#types
]

In [None]:
#| hide
#| eval: false
#https://datagy.io/filter-pandas/
pd_combined_models_filtered_global = pd_combined_models[pd_combined_models['ast_element'].isin( filtered_concepts_global )].copy()

In [None]:
#| hide
#| eval: false
pd_combined_models_filtered_global.shape

(384, 9)

In [None]:
#| hide
#| eval: false
pd_combined_models_filtered_global.head(1)

Unnamed: 0,ast_element,node_type,concept_median_prob,concept_min_prob,concept_max_prob,model,id,type_model,size_model
4,],leaf,"[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M


In [None]:
#| export

def bootstrapping( np_data, np_func, size, rng = None ):
    """Create bootstrap replicates of a statistic.

    Each replicate is `np_func` applied to a resample of `np_data` drawn with
    replacement (any observation may be picked more than once) that is as long
    as the original data. For instance, a bootstrap sample of means or medians.

    Parameters
    ----------
    np_data : 1-D sequence or array of observations.
    np_func : function reducing a 1-D array to one value (e.g. `np.mean`, `np.median`).
    size : number of bootstrap replicates to compute.
    rng : optional seed or `numpy.random.Generator` for reproducible resampling.
        When omitted, the global NumPy random state (`np.random.choice`) is used.

    Returns
    -------
    np.ndarray holding the `size` replicates.
    """
    
    #Cleaning NaNs
    #np_data_clean = np_data[ np.logical_not( np.isnan(np_data) ) ] 
    
    # Convert once instead of on every draw.
    observations = np.asarray( np_data )
    n_observations = len( observations )

    # default_rng returns an existing Generator unchanged and seeds a new one otherwise.
    draw = np.random.choice if rng is None else np.random.default_rng( rng ).choice

    #The number of bootstrap replicates is `size`
    #Each replicate resamples as many observations as the original data holds
    #This strategy might work as imputation 
    bootstrap_repl = [ np_func( draw( observations, size=n_observations ) ) for _ in range( size ) ]
    
    #logging.info("Covariate: " + cov) #Empirical Mean
    #logging.info("Empirical Mean: " + str(np.mean(np_data_clean))) #Empirical Mean
    #logging.info("Bootstrapped Mean: " + str( np.mean(bootstrap_repl) ) ) #Bootstrapped Mean
    
    return np.array( bootstrap_repl )

In [None]:
#| export

#see https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def confidence_intervals_v2(data, confidence=0.95):
    """Normal-approximation confidence interval around the sample mean of `data`.

    A `NormalDist` is fitted to the samples; the half-width is the fitted
    standard deviation times the two-sided z quantile for `confidence`,
    divided by `sqrt(len(data) - 1)`.

    Returns `(lower, upper, half_width)`.
    """
    fitted = NormalDist.from_samples(data)
    upper_tail_probability = (1 + confidence) / 2.
    z_quantile = NormalDist().inv_cdf(upper_tail_probability)
    half_width = fitted.stdev * z_quantile / ((len(data) - 1) ** .5)
    center = fitted.mean
    lower_bound = center - half_width
    upper_bound = center + half_width
    return lower_bound, upper_bound, half_width

#test_confidence = confidence_intervals_v2(data = test_boots, confidence=0.95)

In [None]:
#| export

def standard_error(bootstrapped_data):
    "Standard deviation of the bootstrap replicates (`np.std` with its default settings)"
    replicate_spread = np.std( bootstrapped_data )
    return replicate_spread

In [None]:
#| hide
#| eval: false
# [WARNING] Bootstrapping Takes Time
# For every row: parse the stringified list stored in `concept_median_prob`,
# draw 500 bootstrap replicates of its median, and keep the median of those
# replicates as `median_bootstrap`.
# NOTE(review): `eval` runs whatever expression the CSV cell contains; since the
# column only needs to hold a list literal, `ast.literal_eval` would be the safer
# parser -- confirm the column never holds anything else before switching.
# NOTE(review): `bootstrapping` resamples with `np.random.choice` and no seed is
# set in this cell, so the values change between runs.
pd_combined_models_filtered_global['median_bootstrap'] = pd_combined_models_filtered_global.apply(
        lambda row :
                np.median(
                        bootstrapping( 
                                np_data = eval(row.concept_median_prob),
                                np_func = np.median,
                                size = 500
                        )
                ) ,axis=1
    )

In [None]:
#| hide
#| eval: false
pd_combined_models_filtered_global.head(2)

Unnamed: 0,ast_element,node_type,concept_median_prob,concept_min_prob,concept_max_prob,model,id,type_model,size_model,median_bootstrap
4,],leaf,"[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M,0.51997
20,},leaf,"[0.054765164852142334, 0.2852576673030853, 0.1...","[0.054765164852142334, 0.2852576673030853, 0.1...","[0.054765164852142334, 0.2852576673030853, 0.1...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M,0.407857


In [None]:
#| hide
#| eval: false
pd_combined_models_filtered_global.to_csv('/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_global_filtered_bts.csv')  

In [None]:
#| hide
#| eval: false
# For every row: parse the stringified list stored in `concept_median_prob`,
# draw 500 bootstrap replicates of its median, and store their standard
# deviation as `median_standard_error`.
# NOTE(review): this call draws a fresh set of replicates, independent of the ones
# used for the `median_bootstrap` column, so the two columns do not describe the
# same resamples.
# NOTE(review): `eval` runs whatever expression the CSV cell contains;
# `ast.literal_eval` would be the safer parser for a list literal.
pd_combined_models_filtered_global['median_standard_error'] = pd_combined_models_filtered_global.apply(
        lambda row :
                standard_error(
                        bootstrapping( 
                                np_data = eval(row.concept_median_prob),
                                np_func = np.median,
                                size = 500
                        )
                ) ,axis=1
    )

In [None]:
#| hide
#| eval: false
pd_combined_models_filtered_global.head(2)

Unnamed: 0,ast_element,node_type,concept_median_prob,concept_min_prob,concept_max_prob,model,id,type_model,size_model,median_bootstrap,median_standard_error
4,],leaf,"[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...","[0.5011640191078186, 0.4131472706794739, 0.199...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M,0.51997,0.002649
20,},leaf,"[0.054765164852142334, 0.2852576673030853, 0.1...","[0.054765164852142334, 0.2852576673030853, 0.1...","[0.054765164852142334, 0.2852576673030853, 0.1...",EleutherAI/gpt-neo-125m,gpt-3 [125M],gpt-3,125M,0.407857,0.004083


In [None]:

#| hide
#| eval: false
pd_combined_models_filtered_global.to_csv('/workspaces/CodeSyntaxConcept/data/ds_processed_logits_global/out_astevalverticalfiltered_global_filtered_bts.csv')  

## Local Combination

In [None]:
#| hide
#| eval: false
pd_combined_models_local = pd.read_csv( '/workspaces/CodeSyntaxConcept/data/ds_processed_logits_local/out_astevalverticalfiltered_c1.csv'  , index_col=0)

In [None]:
#| hide
#| eval: false
confounders = [ 'size', 'ast_levels', 'complexity', 'n_ast_nodes']
performance = [
               'for_statement', 
               'while_statement', 
               'return_statement',
               ']', 
               ')', 
               'if_statement', 
               'comparison_operator', 
               'boolean_operator',
               'for_in_clause', 
               'if_clause', 
               'list_comprehension', 
               'lambda',
               'identifier', 
               'string', 
               ]
outcome = ['loss']

In [None]:
#| hide
#| eval: false
pd_combined_models_local = pd_combined_models_local[confounders+performance+outcome].copy() #Filtering

In [None]:
#| hide
#| eval: false
pd_combined_models_local.head(1)

Unnamed: 0,size,ast_levels,complexity,n_ast_nodes,for_statement,while_statement,return_statement,],),if_statement,...,for_in_clause,if_clause,list_comprehension,lambda,identifier,string,loss,id,type_model,size_model
0,280,13,7,154,0.0,0.0,0.0,0.457156,0.423305,0.587624,...,0.722651,0.0,0.0,0.0,0.382276,0.390246,1.79292,gpt-3 [125M],gpt-3,125M


In [None]:
#| hide
#| eval: false
pd_combined_models_local['id'] = new_columns['c1'][0] + " [" + new_columns['c1'][1] + "]"
pd_combined_models_local['type_model'] = new_columns['c1'][0]
pd_combined_models_local['size_model'] = new_columns['c1'][1]

In [None]:
#| hide
#| eval: false
pd_combined_models_local.shape

(50971, 22)

In [None]:
#| hide
#| eval: false
# Load the per-sample table of every remaining model, keep only the analysis
# columns, tag it with its model metadata, and concatenate once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame (hundreds of
# thousands of rows) on every iteration.
frames_local = [ pd_combined_models_local ]  # starts with the already-tagged first model
for m in models[1:]:
    pd_model_local = pd.read_csv( f'/workspaces/CodeSyntaxConcept/data/ds_processed_logits_local/out_astevalverticalfiltered_{m}.csv'  , index_col=0)
    pd_model_local = pd_model_local[confounders+performance+outcome].copy() #Filtering
    
    pd_model_local['id'] = new_columns[m][0] + " [" + new_columns[m][1] + "]"
    pd_model_local['type_model'] = new_columns[m][0]
    pd_model_local['size_model'] = new_columns[m][1]
    
    frames_local.append( pd_model_local )
    print(f"completed-{m}")

pd_combined_models_local = pd.concat( frames_local, ignore_index=True ) #vertical concatenation

completed-c2
completed-c3
completed-c5
completed-c6
completed-c9
completed-c10
completed-c11
completed-c14
completed-c15
completed-c16
completed-c17


In [None]:
#| hide
#| eval: false
pd_combined_models_local.shape

(611652, 22)

In [None]:
#| hide
#| eval: false
pd_combined_models_local.to_csv('/workspaces/CodeSyntaxConcept/data/ds_processed_logits_local/out_astevalverticalfiltered_local.csv')  