In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
import gc

In [2]:
import logging
# Route INFO-level (and above) records to a log file rather than the console.
# filemode='a' appends, so records from earlier runs are kept in the same file.
logging.basicConfig(
    filename="logger_data_engineering.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [3]:
import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

# Data Engineering

In [4]:
def c_eleuther( returnModel = False, model_type =  'EleutherAI/gpt-neo-125m'):
    """Load the tokenizer, and optionally the causal LM, for a checkpoint name.

    The same ``Auto*`` classes are used for the Eleuther, Salesforce and
    CodeParrot checkpoints, hence a single loader.

    Args:
        returnModel: When True the model weights are loaded as well.
        model_type: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        Tuple ``(tokenizer, model)``. ``model`` is an empty list when
        ``returnModel`` is False.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(model_type)
    logging.info("Tokenizer Loaded")

    # Placeholder handed back when the weights are not requested.
    model = []
    if returnModel:
        model = AutoModelForCausalLM.from_pretrained(model_type)
        logging.info("Model Loaded")

    logging.info(model_type)
    return tokenizer, model
    
    

def init_model_args( current_case = 'c1', returnModel = False ):
    """Resolve a short case id to a checkpoint name and load its tokenizer/model.

    Args:
        current_case: Key of the model table below (``'c1'`` ... ``'c19'``).
        returnModel: Forwarded to ``c_eleuther``; when False only the tokenizer
            is loaded and the returned model is ``c_eleuther``'s placeholder.

    Returns:
        Tuple ``(model_type, tokenizer, model)`` where ``model_type`` is the
        checkpoint identifier string selected by ``current_case``.

    Raises:
        KeyError: If ``current_case`` is not a known case id; the message lists
            the valid ids.
    """
    code_models = {
        # Basic (on Pile) GPT-3/J
        'c1': 'EleutherAI/gpt-neo-125m',
        'c2': 'EleutherAI/gpt-neo-1.3B',
        'c3': 'EleutherAI/gpt-neo-2.7B',
        'c4': 'EleutherAI/gpt-j-6b',
        # Basic (on Pile) codegen
        'c5': 'Salesforce/codegen-350M-nl',
        'c6': 'Salesforce/codegen-2B-nl',
        'c7': 'Salesforce/codegen-6B-nl',
        'c8': 'Salesforce/codegen-16B-nl',
        # multi-Language
        'c9': 'codeparrot/codeparrot-small-multi',
        'c10': 'Salesforce/codegen-350M-multi',
        'c11': 'Salesforce/codegen-2B-multi',
        'c12': 'Salesforce/codegen-6B-multi',
        'c13': 'Salesforce/codegen-16B-multi',
        # mono-Language
        'c14': 'codeparrot/codeparrot-small',
        'c15': 'codeparrot/codeparrot',
        'c16': 'Salesforce/codegen-350M-mono',
        'c17': 'Salesforce/codegen-2B-mono',
        'c18': 'Salesforce/codegen-6B-mono',
        'c19': 'Salesforce/codegen-16B-mono',
    }

    # Fail early with the list of valid ids instead of a bare KeyError('cXX').
    if current_case not in code_models:
        raise KeyError(
            f"Unknown model case {current_case!r}; expected one of {list(code_models)}"
        )

    model_type = code_models[current_case]
    tokenizer, model = c_eleuther( returnModel = returnModel, model_type = model_type )

    return model_type, tokenizer, model

## Init Parameters

In [5]:
# [WARNING] These hyperparameters change between runs; tune them before executing.
CODEMODEL = 'c15'  # case id of the model under analysis

# Every path below is derived from CODEMODEL so all artefacts of one run agree.
params = dict(
    codemodel = CODEMODEL,
    numpy_files_logits_path = f'../datax/np_files_logits/{CODEMODEL}',
    testbeds_path = '../datax/testbeds/AstEvalVerticalFiltered.json',
    outputs = f'../data/ds_raw_logits/out_astevalverticalfiltered_{CODEMODEL}.csv',
)

In [6]:
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c15.csv'

In [7]:
# Load the tokenizer (and optionally the weights) of the model under analysis
name, tokenizer, model = init_model_args(
    current_case = params['codemodel'], 
    returnModel = False # [WARNING] False loads only the tokenizer; set True to also load the model weights
    )

In [8]:
# Load the testbed JSON into a DataFrame and preview its first row
data_pd = pd.read_json( params['testbeds_path'] )
data_pd.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
0,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c1,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


# Softmax Normalization and Data Engineering

In [9]:
# Keep only the `ids` entries of the rows whose `m_name` matches the selected model case
filtered_prompts_ids = data_pd[data_pd['m_name']==params['codemodel']]['ids'].values

In [10]:
# One integer tensor of token ids per filtered prompt
tf_input_ids = [torch.tensor(  input_ids, dtype = torch.int) for input_ids in filtered_prompts_ids ]

### Testing Tokenizer

In [11]:
# Sample used to validate the min/max functions further down
sample_id = 0
data_pd[data_pd['m_name']==params['codemodel']].iloc[sample_id]

size                                                            194
ids               [318, 1255, 63, 1548, 63, 2219, 8, 396, 63, 31...
m_name                                                          c15
code              def run_python_tests(test_modules, parallelism...
ast_errors                                                       []
n_ast_errors                                                      0
ast_levels                                                       13
n_whitespaces_                                                   67
complexity                                                        7
nloc                                                             15
token_counts                                                    120
n_ast_nodes                                                     154
Name: 713594, dtype: object

In [12]:
tf_input_ids[sample_id]

tensor([  318,  1255,    63,  1548,    63,  2219,     8,   396,    63,  3112,
           12, 30042, 24366,    12,   543,    63, 10828,    29,   797,   304,
          272,   663,    63,  1213,    63,   460,    63,  1457,   480,  9602,
         1611, 27110,  2295,   401,   298,  8475,    63,  4521,    51, 28535,
           63, 15505,    63,  7569,   531,   339,   340,   543,    63, 10828,
           26,   267,   327,  1849,  3652,  7704,   314,  1611, 27110,  2295,
        17566, 10562,  7037,   370,   946, 22421, 30042, 24366,    14,   267,
          327,  3979,   781,  1255,  1611, 27110,  2295,   543, 11745,    12,
          652,  4440,   841,   367,  3063,   465,   267,   327, 18092,    14,
          267, 30042, 24366,   275,   841,   267,  2884,   275,   298,  1065,
           13,  2219,    13,  1045,    13, 10828,     2,   272,   587,    26,
          267,  2884,   275,   298,  1065,    13,  2219,     2,   272,  1414,
          275,   359,   736,    14,   515,    14,   904,     8, 

In [13]:
# Load the stored logits of one sample from disk
out= np.load( params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy' )
soft = torch.nn.Softmax( dim = 0 ) # softmax over a 1-D logit vector
first_token_distribution = soft( torch.from_numpy(out[0]) ) # softmax of the first row of `out`
# A softmax output must sum to 1 up to float rounding. round() would be far too
# lenient here (it maps any sum strictly between 0.5 and 1.5 to 1), so compare
# against an explicit tolerance instead.
assert abs( first_token_distribution.sum().item() - 1.0 ) < 1e-4, 'softmax output does not sum to 1'


In [14]:
first_token_distribution[0]

tensor(7.1191e-06)

In [15]:
tokenizer.decode([7881])

'unCP'

In [16]:
tokenizer.vocab.keys()



In [17]:
tokenizer.get_vocab()

{'Ġtopo': 11111,
 "'[": 3894,
 'high': 6260,
 'Dst': 31413,
 'Compound': 26552,
 'cation': 1063,
 'insecure': 24824,
 "_='": 7993,
 'tric': 13929,
 'TTTT': 21939,
 'During': 28734,
 'Ġbegin': 5386,
 'srt': 24336,
 'Ġjo': 5225,
 'exempt': 25254,
 'Dependency': 14403,
 'OM': 7474,
 'unexpected': 8953,
 '>`_': 18740,
 'Ġ"\\"': 30768,
 'ĊĠĠĠĠĠĠĠĠĊĠĠĠĠĠĠĠĠĠĠĠ': 27180,
 'End': 3005,
 'ronic': 20410,
 'ĠHELP': 29269,
 'tries': 6264,
 'Press': 14414,
 'installer': 17047,
 'Ġtimed': 18590,
 'approved': 21556,
 'ĠURL': 2851,
 '*,': 10381,
 'ivy': 10241,
 'Declar': 31963,
 'ne': 685,
 'acle': 12550,
 'Ġrec': 6488,
 'SVC': 17675,
 'xmlrpc': 17596,
 'atility': 18783,
 'ĠeError': 19424,
 'vation': 4255,
 'ÏĦ': 17123,
 'Ġlocks': 23340,
 'Ġbytestring': 23888,
 'Construct': 11493,
 'Ġheads': 25362,
 'yp': 2279,
 'ParetoRandomVariable': 27663,
 'Ġugly': 23761,
 'Pen': 15597,
 'Ġlose': 27332,
 'penden': 2230,
 'TemplateResponse': 31036,
 '/*\\': 26695,
 'EGG': 30124,
 'Ġresponses': 9320,
 'csrf': 10088,


## Logit Uploading and Flattening

In [18]:
first_token_distribution.topk(k=1, largest =True) #Returns the most probable token for that position

torch.return_types.topk(
values=tensor([0.1375]),
indices=tensor([511]))

In [19]:
def topk_tuple( logit_vocab_tensor, largest, tokenizer_fn):
    """Return ``(decoded_token, value)`` for the single extreme entry of a vector.

    Args:
        logit_vocab_tensor: 1-D tensor with one value per vocabulary entry.
        largest: True selects the maximum entry, False the minimum.
        tokenizer_fn: Object exposing ``decode``; it receives the index tensor.

    Returns:
        Tuple of the decoded token and the selected value as a Python float.
    """
    # k is fixed to 1: `.item()` below only works for a single selected value.
    values, indices = logit_vocab_tensor.topk( k=1, largest=largest )
    decoded = tokenizer_fn.decode(indices)
    return decoded, values.item()

def min_max_logits( logit_vocab_sample_tensor, tokenizer_fn ):
    """Collect the max and min ``(token, value)`` pair for every position of a sample.

    Args:
        logit_vocab_sample_tensor: Iterable of per-position vocabulary vectors.
        tokenizer_fn: Tokenizer forwarded to ``topk_tuple`` for decoding.

    Returns:
        Tuple ``(max_cases, min_cases)``: two lists with one
        ``(token, value)`` tuple per position.
    """
    # Single pass over the input: compute (max, min) per position, then unzip.
    extremes = [
        (
            topk_tuple( logit_vocab_tensor = position_vector, largest = True, tokenizer_fn = tokenizer_fn ),
            topk_tuple( logit_vocab_tensor = position_vector, largest = False, tokenizer_fn = tokenizer_fn ),
        )
        for position_vector in logit_vocab_sample_tensor
    ]
    max_cases = [ highest for highest, _ in extremes ]
    min_cases = [ lowest for _, lowest in extremes ]
    return max_cases, min_cases

def actual_logit(logit_vocab_sample_tensor, tokenized_prompt, tokenizer_fn):
    """Look up the value assigned to each token that actually occurs in the prompt.

    The first prompt token is skipped: row ``i`` of ``logit_vocab_sample_tensor``
    is paired with prompt token ``i + 1``.

    Args:
        logit_vocab_sample_tensor: Indexable sequence of per-position vocabulary vectors.
        tokenized_prompt: Sequence of token ids of the prompt.
        tokenizer_fn: Object exposing ``decode`` for a single integer id.

    Returns:
        List of ``(decoded_token, value)`` tuples, one per prompt token after the first.
    """
    def _token_and_value(position, token_id):
        vocab_id = int(token_id)
        token_text = tokenizer_fn.decode(vocab_id)
        # Row = position in the sequence, column = id of the observed token.
        return token_text, logit_vocab_sample_tensor[position][vocab_id].item()

    return [
        _token_and_value(position, token_id)
        for position, token_id in enumerate(tokenized_prompt[1:])
    ]

In [20]:
max_case,min_case = min_max_logits(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out[:4]],
    tokenizer_fn= tokenizer
    ) # Sentence-level min/max, restricted to the first 4 rows of `out`
print(max_case)
print(min_case)

[(' test', 0.1375076323747635), ('_', 0.597936749458313), ('command', 0.06420105695724487), ('_', 0.8308457732200623)]
[('Sorry', 4.279823162534058e-09), (" '/',", 1.0003992426121933e-11), (' hashes', 2.366234153061697e-10), (' lack', 7.656505637831934e-12)]


In [21]:
sample_id

0

In [22]:
out[0]

array([ 0.31265062,  0.9233073 ,  1.9250013 , ..., -3.3065329 ,
       -2.9923072 , -0.03829849], dtype=float32)

In [23]:
len(out[0])

32768

In [24]:
# Softmax value assigned to each token that actually follows in the sample prompt
actual_cases = actual_logit(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out] , # softmax applied to every row of `out`
    tokenized_prompt = tf_input_ids[ sample_id ],
    tokenizer_fn = tokenizer
    )

In [25]:
actual_cases

[(' run', 0.008938679471611977),
 ('_', 0.597936749458313),
 ('python', 0.006282455287873745),
 ('_', 0.8308457732200623),
 ('tests', 0.19709986448287964),
 ('(', 0.7202150821685791),
 ('test', 0.10973510891199112),
 ('_', 0.9092771410942078),
 ('modules', 0.051806215196847916),
 (',', 0.7252672910690308),
 (' paralle', 0.0006053867982700467),
 ('lism', 0.907451331615448),
 (',', 0.46318528056144714),
 (' with', 0.01322502363473177),
 ('_', 0.9971532821655273),
 ('coverage', 0.8213512301445007),
 ('=', 0.638321578502655),
 ('False', 0.7724078297615051),
 ('):', 0.6867207884788513),
 ('\n   ', 0.7567179203033447),
 (' set', 0.08770234882831573),
 ('_', 0.9746276140213013),
 ('title', 0.8571750521659851),
 ('_', 0.9264533519744873),
 ('and', 0.9881894588470459),
 ('_', 0.9999985694885254),
 ('block', 0.984995424747467),
 ('("', 0.8790395259857178),
 ('Running', 0.9553847908973694),
 (' Py', 0.394257128238678),
 ('Spark', 0.6713027954101562),
 (' tests', 0.9851412773132324),
 ('",', 0.978

# Logits Execution

In [26]:
len(filtered_prompts_ids)
#params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy'

50971

In [27]:
def batching_logits( size = None ):
    """Compute max / min / actual token values for the first ``size`` samples.

    For every sample index ``i`` the file
    ``logits_tensor[i]_batch[i]_model[CODEMODEL].npy`` is loaded from
    ``params['numpy_files_logits_path']``, each row is softmax-normalized, and
    the results of ``min_max_logits`` and ``actual_logit`` are collected.
    Relies on the notebook globals ``params``, ``CODEMODEL``, ``tokenizer``,
    ``tf_input_ids`` and ``filtered_prompts_ids``.

    Args:
        size: Number of samples to process. ``None`` (default) means every
            entry of ``filtered_prompts_ids`` as it is *when the function is
            called*, so the default cannot go stale if the prompts are
            re-filtered after this cell was run.

    Returns:
        Tuple ``(max_logit_token_prompt, min_logit_token_prompt,
        actual_logit_token_prompt)``; each is a list with one entry per sample.
    """
    # Resolve the default at call time, not at definition time.
    if size is None:
        size = len(filtered_prompts_ids)

    max_logit_token_prompt = []
    min_logit_token_prompt = []
    actual_logit_token_prompt = []

    soft = torch.nn.Softmax( dim = 0 )                          # softmax over each 1-D row

    for file in range( size ):
        out = np.load(params['numpy_files_logits_path']+'/'+ f'logits_tensor[{file}]_batch[{file}]_model[{CODEMODEL}].npy')
        next_tokens_distribution = [ soft( torch.from_numpy(token) ) for token in out]  # one normalized vector per row

        max_cases,min_cases = min_max_logits(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenizer_fn= tokenizer
            )

        actual_cases = actual_logit(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenized_prompt = tf_input_ids[ file ],
            tokenizer_fn = tokenizer
            )

        max_logit_token_prompt.append( max_cases )
        min_logit_token_prompt.append( min_cases )
        actual_logit_token_prompt.append( actual_cases )

        logging.info(file)  # progress marker: index of the sample just processed
    return max_logit_token_prompt,min_logit_token_prompt,actual_logit_token_prompt

In [28]:
# Called without `size`, so the function's default sample count is used;
# uncomment `size = 2` for a quick smoke test on two samples.
max_logit_token_prompt, min_logit_token_prompt, actual_logit_token_prompt = batching_logits(
    #size = 2
    ) #<--- WARNING: time consuming

In [None]:
# Copy the rows of the selected model so new columns can be added without touching `data_pd`
dataframe_to_save = data_pd[data_pd['m_name']==params['codemodel']].copy()


In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
509710,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c11,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


In [None]:
# Attach the per-sample result lists returned by batching_logits as new columns
dataframe_to_save['max_prob'] = max_logit_token_prompt
dataframe_to_save['min_prob'] = min_logit_token_prompt
dataframe_to_save['actual_prob'] = actual_logit_token_prompt

In [None]:
dataframe_to_save.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
509710,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c11,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[( test, 0.2930986285209656), ((, 0.4787138998...","[(���, 9.901952033927047e-14), (��, 9.24228728...","[( run, 0.006889969576150179), (_, 0.215628445..."
509711,122,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c11,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[( test, 0.29309898614883423), (_, 0.671698391...","[(���, 9.902002178277525e-14), (��, 7.44283860...","[( get, 0.08673281967639923), (_, 0.6716983914..."
509712,167,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c11,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[( test, 0.2930968999862671), (_, 0.7564151883...","[(���, 9.90181854153456e-14), (��, 5.357790919...","[( test, 0.2930968999862671), (_, 0.7564151883..."
509713,352,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 502...",c11,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[( test, 0.2930949628353119), (_, 0.7564187645...","[(���, 9.901356400358538e-14), (��, 5.35781655...","[( test, 0.2930949628353119), (_, 0.7564187645..."
509714,468,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c11,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[( test, 0.29308953881263733), (_, 0.756419122...","[(���, 9.901531905585209e-14), (��, 5.35757348...","[( test, 0.29308953881263733), (_, 0.756419122..."


In [None]:
dataframe_to_save.shape

(50971, 15)

In [None]:
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c11.csv'

In [None]:
## Saving CheckPoint
# NOTE(review): CSV appears to store the list-of-tuples columns as plain strings
# (the reloaded frame shows quoted reprs) — confirm downstream code parses them back.
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
# Reload the checkpoint; index_col=0 uses the first CSV column as the index
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
509710,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c11,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.2930986285209656), ('(', 0.478713...","[('���', 9.901952033927047e-14), ('��', 9.2422...","[(' run', 0.006889969576150179), ('_', 0.21562..."


In [None]:
dataframe_to_save.shape

(50971, 15)

## Loss Retrieval

In [None]:
def batching_loss( size = None ):
    """Read the stored per-sample loss values from disk.

    For every index ``i`` the file ``loss_batch[i]_model[CODEMODEL].npy`` is
    loaded from ``params['numpy_files_logits_path']`` and converted to a Python
    scalar. Relies on the notebook globals ``params``, ``CODEMODEL`` and
    ``dataframe_to_save``.

    Args:
        size: Number of loss files to read. ``None`` (default) means one per
            row of ``dataframe_to_save`` as it is *when the function is
            called*, so the default cannot go stale if the frame is rebuilt
            after this cell was run.

    Returns:
        List of loss values, in index order.
    """
    # Resolve the default at call time, not at definition time.
    if size is None:
        size = dataframe_to_save.shape[0]

    output_loss = []
    for current_batch in range(size):
        out = np.load(params['numpy_files_logits_path']+'/'+f'loss_batch[{current_batch}]_model[{CODEMODEL}].npy')
        output_loss.append( out.item() ) # .item() turns the single-element numpy array into a Python scalar
        logging.info(current_batch)      # progress marker: index of the file just read
    return output_loss

In [None]:
# Called without `size`, so the function's default number of loss files is read
output_loss = batching_loss()

In [None]:
output_loss

[0.5538359880447388,
 0.9980243444442749,
 0.6270461678504944,
 0.5394910573959351,
 0.5734942555427551,
 0.9254372715950012,
 0.9574809670448303,
 1.0923113822937012,
 1.018527626991272,
 1.2701603174209595,
 0.4565414488315582,
 0.8710812926292419,
 0.8113462924957275,
 0.9423788785934448,
 1.0700488090515137,
 1.1814287900924683,
 0.9184234142303467,
 1.061155915260315,
 0.6328152418136597,
 0.2760624289512634,
 0.2698154151439667,
 0.41957518458366394,
 0.8125044107437134,
 0.8152645230293274,
 1.1329861879348755,
 0.907298743724823,
 0.7357975244522095,
 0.5761137008666992,
 1.0465426445007324,
 0.5303027033805847,
 2.1901166439056396,
 0.9620245695114136,
 0.8741754293441772,
 0.7289432287216187,
 0.577857494354248,
 0.5760984420776367,
 0.5433822870254517,
 1.0447763204574585,
 1.4224224090576172,
 1.2442312240600586,
 0.6622323393821716,
 1.3007172346115112,
 0.7755067348480225,
 1.3602198362350464,
 1.5325920581817627,
 1.424408197402954,
 1.291252613067627,
 1.155175447463989

In [None]:
# Attach the loaded loss values as a new column and preview the first row
dataframe_to_save['loss'] = output_loss
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
509710,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c11,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.2930986285209656), ('(', 0.478713...","[('���', 9.901952033927047e-14), ('��', 9.2422...","[(' run', 0.006889969576150179), ('_', 0.21562...",0.553836


In [None]:
## Saving CheckPoint 2
# Writes to the same path as the first checkpoint, replacing it with the frame that now includes `loss`
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
# Reload the second checkpoint; index_col=0 uses the first CSV column as the index
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
509710,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c11,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' test', 0.2930986285209656), ('(', 0.478713...","[('���', 9.901952033927047e-14), ('��', 9.2422...","[(' run', 0.006889969576150179), ('_', 0.21562...",0.553836


# Combining all datasets (last step)