In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
import gc

In [2]:
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Send log records to a local text file opened in append mode; records at
# INFO level and above are written with a timestamp and the level name.
logging.basicConfig(
    filename="logger_data_engineering_copy.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [3]:
import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

# Data Engineering

In [4]:
def c_eleuther( returnModel = False, model_type =  'EleutherAI/gpt-neo-125m'):
    """Load the tokenizer, and optionally the weights, of a causal language model.

    The EleutherAI, Salesforce and CodeParrot checkpoints used in this notebook
    are all loaded through the same ``transformers`` Auto* classes.

    Args:
        returnModel: when truthy, the model is loaded with
            ``AutoModelForCausalLM.from_pretrained``; otherwise an empty list
            is returned in its place.
        model_type: checkpoint identifier handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Function-scope import: transformers is only imported when the loader runs.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_type)
    logging.info("Tokenizer Loaded")

    model = []  # placeholder returned when the weights are not requested
    if returnModel:
        model = AutoModelForCausalLM.from_pretrained(model_type)
        logging.info("Model Loaded")

    logging.info(model_type)
    return tokenizer, model
    
    

def init_model_args( current_case = 'c1', returnModel = False ):
    """Translate a case label into a checkpoint name and load it.

    Args:
        current_case: one of the labels 'c1' ... 'c19' listed in the table
            below. An unknown label raises ``KeyError``.
        returnModel: forwarded to ``c_eleuther``; controls whether the model
            weights are loaded in addition to the tokenizer.

    Returns:
        A ``(model_type, tokenizer, model)`` tuple, where ``model_type`` is the
        checkpoint identifier selected by ``current_case``.
    """
    checkpoint_by_case = {
        # Basic (on Pile) GPT-3/J
        'c1': 'EleutherAI/gpt-neo-125m',
        'c2': 'EleutherAI/gpt-neo-1.3B',
        'c3': 'EleutherAI/gpt-neo-2.7B',
        'c4': 'EleutherAI/gpt-j-6b',
        # Basic (on Pile) codegen
        'c5': 'Salesforce/codegen-350M-nl',
        'c6': 'Salesforce/codegen-2B-nl',
        'c7': 'Salesforce/codegen-6B-nl',
        'c8': 'Salesforce/codegen-16B-nl',
        # multi-language
        'c9': 'codeparrot/codeparrot-small-multi',
        'c10': 'Salesforce/codegen-350M-multi',
        'c11': 'Salesforce/codegen-2B-multi',
        'c12': 'Salesforce/codegen-6B-multi',
        'c13': 'Salesforce/codegen-16B-multi',
        # mono-language
        'c14': 'codeparrot/codeparrot-small',
        'c15': 'codeparrot/codeparrot',
        'c16': 'Salesforce/codegen-350M-mono',
        'c17': 'Salesforce/codegen-2B-mono',
        'c18': 'Salesforce/codegen-6B-mono',
        'c19': 'Salesforce/codegen-16B-mono',
    }

    model_type = checkpoint_by_case[current_case]
    tokenizer, model = c_eleuther(returnModel=returnModel, model_type=model_type)
    return model_type, tokenizer, model

## Init Parameters

In [5]:
# [WARNING] These hyperparameters change between runs; tune them before executing.
# NOTE(review): logits are read from '../datax/...' while outputs are written
# under '../data/...' — confirm that both directories are intended.
CODEMODEL = 'c5'
params = dict(
    codemodel=CODEMODEL,
    numpy_files_logits_path=f'../datax/np_files_logits/{CODEMODEL}',
    testbeds_path='../datax/testbeds/AstEvalVerticalFiltered.json',
    outputs=f'../data/ds_raw_logits/out_astevalverticalfiltered_{CODEMODEL}.csv',
)

In [6]:
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c5.csv'

In [7]:
# Load the tokenizer (and optionally the weights) of the model under analysis.
name, tokenizer, model = init_model_args(
    current_case = params['codemodel'], 
    returnModel = False # [WARNING!] Check this flag before running: with False, `model` is an empty list.
    )

Downloading (…)okenizer_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [8]:
# Load the testbed JSON into a DataFrame and preview its first row.
data_pd = pd.read_json( params['testbeds_path'] )
data_pd.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
0,280,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c1,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


# Softmax Normalization and Data Engineering

In [9]:
# Keep the 'ids' column of the rows whose m_name matches the selected model.
filtered_prompts_ids = data_pd[data_pd['m_name']==params['codemodel']]['ids'].values

In [10]:
# Build one integer tensor of token ids per filtered prompt.
tf_input_ids = [torch.tensor(  input_ids, dtype = torch.int) for input_ids in filtered_prompts_ids ]

### Testing Tokenizer

In [11]:
# Sample used to validate the min/max helper functions.
sample_id = 0
data_pd[data_pd['m_name']==params['codemodel']].iloc[sample_id]

size                                                            280
ids               [4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...
m_name                                                           c5
code              def run_python_tests(test_modules, parallelism...
ast_errors                                                       []
n_ast_errors                                                      0
ast_levels                                                       13
n_whitespaces_                                                   67
complexity                                                        7
nloc                                                             15
token_counts                                                    120
n_ast_nodes                                                     154
Name: 203884, dtype: object

In [12]:
tf_input_ids[sample_id]

tensor([ 4299,  1057,    62, 29412,    62, 41989,     7,  9288,    62, 18170,
           11, 10730,  1042,    11,   351,    62,  1073,  1857,    28, 25101,
         2599,   198,   220,   220,   220,   900,    62,  7839,    62,   392,
           62,  9967,  7203, 28768,  9485,  4561,   668,  5254,  1600,   366,
         9148, 11290,    62,    47,    56,  4303, 14175,    62,  4944,  2043,
           62,    51,  1546,  4694,  4943,   628,   220,   220,   220,   611,
          351,    62,  1073,  1857,    25,   198,   220,   220,   220,   220,
          220,   220,   220,  1303, 33998,  1838,   262,  9485,  4561,   668,
         5254,   781, 15492,  2233,   284,  4334, 10730,  1042,    13,   198,
          220,   220,   220,   220,   220,   220,   220,  1303,  1649,   356,
         1057,  9485,  4561,   668,  5254,   351,  5197,    11,   340,  3544,
          604,   329,   783,   355,   198,   220,   220,   220,   220,   220,
          220,   220,  1303, 46513,    13,   198,   220,   220, 

In [13]:
# Load the logits stored on disk for the selected sample.
out= np.load( params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy' )
soft = torch.nn.Softmax( dim = 0 ) # normalizes one 1-D logit vector
first_token_distribution = soft( torch.from_numpy(out[0]) ) # softmax over the first row of `out`
# Softmax output must sum to 1 up to float rounding. The previous check used
# round(), which accepts any sum strictly between 0.5 and 1.5 and therefore
# could not detect a broken normalization.
assert abs( first_token_distribution.sum().item() - 1.0 ) < 1e-4


In [14]:
first_token_distribution[0]

tensor(0.0009)

In [15]:
tokenizer.decode([7881])

' register'

In [16]:
tokenizer.vocab.keys()



In [17]:
tokenizer.get_vocab()

{'Ġhomicide': 19625,
 'Ġapart': 5475,
 'Reg': 8081,
 'Do': 5211,
 'bot': 13645,
 'Ġmerge': 20121,
 'Ġlining': 20883,
 'Ġsimplest': 24043,
 'Ġbillionaires': 34740,
 'ĠWilliamson': 34974,
 'increasing': 42647,
 'ĠJon': 5966,
 'mm': 3020,
 'ĠDefin': 29589,
 'Ġboss': 6478,
 'ĠWii': 16591,
 'lay': 10724,
 'ĠKirin': 40262,
 'ĠSomebody': 43141,
 'Ġindistinguishable': 43649,
 'ĠTechnical': 20671,
 'ĠAtari': 35884,
 '+.': 27613,
 'Ġovercl': 40397,
 'ĠLaos': 45919,
 'diff': 26069,
 'ĠTrinidad': 48269,
 'adel': 6959,
 '143': 21139,
 'õ': 177,
 'ĠEnter': 6062,
 'ĠAmbro': 43233,
 'ma': 2611,
 'Ġvalued': 17560,
 'Ġleases': 38522,
 'Ġambassadors': 41702,
 'ãģķ': 43357,
 'ĠYour': 3406,
 'ĠSin': 10884,
 'Db': 43832,
 'ĠMonica': 23240,
 'forest': 29623,
 'jew': 47483,
 'Ġiso': 47279,
 'Repe': 47541,
 'outed': 18534,
 'Ġfright': 12773,
 'ĠKatie': 24721,
 'ĠThey': 1119,
 'Ġmetabolism': 20211,
 'Ġdragons': 20308,
 'Ġtitanium': 41284,
 'Ġcontributions': 9284,
 'ĠJosh': 8518,
 'Ġ05': 8870,
 'Ġgym': 11550,
 '

## Logit Loading and Flattening

In [18]:
first_token_distribution.topk(k=1, largest =True) #Returns the most probable token for that position

torch.return_types.topk(
values=tensor([0.0654]),
indices=tensor([62]))

In [19]:
def topk_tuple( logit_vocab_tensor, largest, tokenizer_fn):
    """Return the decoded token and value of the single extreme entry of a tensor.

    Args:
        logit_vocab_tensor: tensor on which ``topk(k=1)`` is evaluated.
        largest: ``True`` selects the largest entry, ``False`` the smallest.
        tokenizer_fn: object whose ``decode`` method receives the selected
            index tensor.

    Returns:
        A ``(decoded_token, value)`` tuple, where ``value`` is a Python scalar.
    """
    # TODO: k is fixed to 1; it could be extended to return several entries.
    values, indices = logit_vocab_tensor.topk( k=1, largest=largest )
    decoded_token = tokenizer_fn.decode(indices)
    return ( decoded_token, values.item() )

def min_max_logits( logit_vocab_sample_tensor, tokenizer_fn ):
    """Collect the maximum and minimum entry for every item of a sample.

    Args:
        logit_vocab_sample_tensor: iterable whose items are each passed to
            ``topk_tuple``.
        tokenizer_fn: tokenizer forwarded to ``topk_tuple``.

    Returns:
        ``(max_cases, min_cases)``: two lists holding, in input order, the
        ``topk_tuple`` result for ``largest=True`` and ``largest=False``.
    """
    # Single pass: for each item the max lookup runs first, then the min lookup.
    extremes = [
        (
            topk_tuple( logit_vocab_tensor = row, largest = True, tokenizer_fn = tokenizer_fn ),
            topk_tuple( logit_vocab_tensor = row, largest = False, tokenizer_fn = tokenizer_fn ),
        )
        for row in logit_vocab_sample_tensor
    ]
    max_cases = [ high for high, _ in extremes ]
    min_cases = [ low for _, low in extremes ]
    return max_cases, min_cases

def actual_logit( 
                 logit_vocab_sample_tensor, 
                 tokenized_prompt, 
                 tokenizer_fn,
                 ):
    """Look up the value assigned to each token that actually follows in a prompt.

    The first prompt token is skipped: the entry at sequence position ``i`` is
    read at the vocabulary index of prompt token ``i + 1``.

    Args:
        logit_vocab_sample_tensor: indexable by sequence position, then by
            vocabulary id; the selected element must expose ``.item()``.
        tokenized_prompt: sequence of token ids of the prompt.
        tokenizer_fn: object whose ``decode`` method maps an integer id to text.

    Returns:
        A list of ``(decoded_token, value)`` tuples, one per prompt token
        after the first.
    """
    following_ids = [ int(token_id) for token_id in tokenized_prompt[1:] ]
    return [
        (
            tokenizer_fn.decode( token_id ),                          # text of the token
            logit_vocab_sample_tensor[position][token_id].item(),     # value at (position, vocabulary id)
        )
        for position, token_id in enumerate( following_ids )
    ]

In [20]:
max_case,min_case = min_max_logits(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out[:4]],
    tokenizer_fn= tokenizer
    ) # Sentence-level min/max, restricted to the first 4 rows of `out`
print(max_case)
print(min_case)

[('_', 0.06537198275327682), ('_', 0.17509573698043823), ('test', 0.0670626237988472), ('_', 0.29162847995758057)]
[('quickShipAvailable', 7.791013792229559e-14), ('channelAvailability', 1.3532363801457213e-16), ('�士', 4.875596454345654e-12), ('quickShipAvailable', 7.463312444530577e-25)]


In [21]:
sample_id

0

In [22]:
out[0]

array([16.705246 , 16.441128 , 15.44639  , ..., -5.198506 , -5.215082 ,
       -5.2114615], dtype=float32)

In [23]:
len(out[0])

51200

In [24]:
# Value assigned to each token that actually follows in the sample prompt.
actual_cases = actual_logit(
    logit_vocab_sample_tensor = [ soft( torch.from_numpy(token) ) for token in out] , # every row of `out` is normalized, not just a slice
    tokenized_prompt = tf_input_ids[ sample_id ],
    tokenizer_fn = tokenizer
    )

In [25]:
actual_cases

[(' run', 2.0048053556820378e-05),
 ('_', 0.17509573698043823),
 ('python', 0.004026083275675774),
 ('_', 0.29162847995758057),
 ('tests', 0.06515749543905258),
 ('(', 0.2737066149711609),
 ('test', 0.04412486031651497),
 ('_', 0.6183211803436279),
 ('modules', 0.0008266911027021706),
 (',', 0.33119067549705505),
 (' parallel', 0.0005007521249353886),
 ('ism', 0.13258478045463562),
 (',', 0.19979333877563477),
 (' with', 0.0006641774671152234),
 ('_', 0.9744492769241333),
 ('co', 0.0007279330748133361),
 ('verage', 0.9097177982330322),
 ('=', 0.28591224551200867),
 ('False', 0.5996348261833191),
 ('):', 0.41072025895118713),
 ('\n', 0.9089254140853882),
 (' ', 0.9474048018455505),
 (' ', 0.9351903796195984),
 (' ', 0.998816728591919),
 (' set', 0.0008627278730273247),
 ('_', 0.6558790802955627),
 ('title', 0.00029944657580927014),
 ('_', 0.01964067481458187),
 ('and', 0.08589049428701401),
 ('_', 0.9994111061096191),
 ('block', 0.00022660034301225096),
 ('("', 0.2782861888408661),
 ('R

# Logits Execution

In [26]:
len(filtered_prompts_ids)
#params['numpy_files_logits_path'] +'/' + f'logits_tensor[{sample_id}]_batch[{sample_id}]_model[{CODEMODEL}].npy'

50971

In [27]:
def batching_logits( size = None ):
    """Compute max, min and actual next-token values for the stored logit files.

    For every index in ``range(size)`` the file
    ``logits_tensor[i]_batch[i]_model[CODEMODEL].npy`` is loaded from
    ``params['numpy_files_logits_path']``, each row is normalized with a
    softmax, and the results of ``min_max_logits`` and ``actual_logit`` are
    collected. The processed index is logged after each file.

    Args:
        size: number of files to process, starting at index 0. When ``None``
            (the default) ``len(filtered_prompts_ids)`` is read at call time.
            It used to be captured once at definition time, which left a stale
            default after the filtering cell was re-run.

    Returns:
        ``(max_logit_token_prompt, min_logit_token_prompt,
        actual_logit_token_prompt)``: three lists with one entry per file.
    """
    if size is None:
        size = len(filtered_prompts_ids)

    logits_dir = params['numpy_files_logits_path']
    soft = torch.nn.Softmax( dim = 0 )  # normalizes one 1-D logit vector

    max_logit_token_prompt = []
    min_logit_token_prompt = []
    actual_logit_token_prompt = []

    for file in range( size ):
        out = np.load( logits_dir + '/' + f'logits_tensor[{file}]_batch[{file}]_model[{CODEMODEL}].npy' )
        # Normalize once per row and share the result between both helpers.
        next_tokens_distribution = [ soft( torch.from_numpy(token) ) for token in out ]

        max_cases, min_cases = min_max_logits(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenizer_fn = tokenizer
            )

        actual_cases = actual_logit(
            logit_vocab_sample_tensor = next_tokens_distribution,
            tokenized_prompt = tf_input_ids[ file ],
            tokenizer_fn = tokenizer
            )

        max_logit_token_prompt.append( max_cases )
        min_logit_token_prompt.append( min_cases )
        actual_logit_token_prompt.append( actual_cases )

        logging.info(file)
    return max_logit_token_prompt,min_logit_token_prompt,actual_logit_token_prompt

In [28]:
# Processes every prompt of the selected model; uncomment `size = 2` below for a quick trial run.
max_logit_token_prompt, min_logit_token_prompt, actual_logit_token_prompt = batching_logits(
    #size = 2
    ) # <--- WARNING: time consuming

In [None]:
# Copy the rows of the selected model so that new columns are added to the copy, not to data_pd.
dataframe_to_save = data_pd[data_pd['m_name']==params['codemodel']].copy()


In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
458739,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c10,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154


In [None]:
# Attach the per-prompt results as new columns.
# NOTE(review): assumes the result lists follow the same row order as the filtered frame — confirm.
dataframe_to_save['max_prob'] = max_logit_token_prompt
dataframe_to_save['min_prob'] = min_logit_token_prompt
dataframe_to_save['actual_prob'] = actual_logit_token_prompt

In [None]:
dataframe_to_save.head()

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
458739,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c10,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[( __, 0.13518139719963074), ((, 0.50364696979...","[( detainees, 1.8827131693932575e-12), (ヘラ, 2....","[( run, 0.0024231544230133295), (_, 0.20523256..."
458740,122,"[4299, 651, 62, 12286, 62, 29412, 62, 18558, 3...",c10,def get_default_python_executables():\n pyt...,[],0,12,34,5,10,62,109,"[( __, 0.13518139719963074), (_, 0.78407043218...","[( detainees, 1.8827131693932575e-12), ( Schum...","[( get, 0.061549633741378784), (_, 0.784070432..."
458741,167,"[4299, 1332, 62, 293, 459, 7, 944, 2599, 198, ...",c10,def test_least(self):\n df = self.spark...,[],0,21,34,1,12,141,230,"[( __, 0.13518203794956207), (_, 0.85579311847...","[( detainees, 1.8827005926480567e-12), ( Schum...","[( test, 0.10771691799163818), (_, 0.855793118..."
458742,352,"[4299, 1332, 62, 48369, 7, 944, 2599, 198, 502...",c10,def test_slice(self):\n df = self.spark...,[],0,21,69,1,34,293,453,"[( __, 0.13518159091472626), (_, 0.85579228401...","[( detainees, 1.882705146297181e-12), ( Schume...","[( test, 0.10771779716014862), (_, 0.855792284..."
458743,468,"[4299, 1332, 62, 2502, 10724, 7, 944, 2599, 19...",c10,def test_overlay(self):\n from pyspark....,[],0,21,117,2,39,301,497,"[( __, 0.13518092036247253), (_, 0.85579180717...","[( detainees, 1.8827101336271745e-12), ( Schum...","[( test, 0.10771705955266953), (_, 0.855791807..."


In [None]:
dataframe_to_save.shape

(50971, 15)

In [None]:
params['outputs']

'../data/ds_raw_logits/out_astevalverticalfiltered_c10.csv'

In [None]:
## Saving checkpoint 1: write the frame, including the new probability columns, to CSV.
# NOTE(review): the list-valued columns appear to come back as plain strings after a CSV round trip — confirm before consuming them downstream.
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
# Reload the checkpoint from disk, using the first CSV column as the index.
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob
458739,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c10,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' __', 0.13518139719963074), ('(', 0.5036469...","[(' detainees', 1.8827131693932575e-12), ('ヘラ'...","[(' run', 0.0024231544230133295), ('_', 0.2052..."


In [None]:
dataframe_to_save.shape

(50971, 15)

## Loss Retrieval

In [None]:
def batching_loss( size = None ):
    """Read the stored loss value of each batch.

    For every index in ``range(size)`` the file
    ``loss_batch[i]_model[CODEMODEL].npy`` is loaded from
    ``params['numpy_files_logits_path']`` and converted to a Python scalar.
    The processed index is logged after each file.

    Args:
        size: number of loss files to read, starting at index 0. When ``None``
            (the default) ``dataframe_to_save.shape[0]`` is read at call time.
            It used to be captured once at definition time, which left a stale
            default after the frame was rebuilt or reloaded.

    Returns:
        A list with one loss value per processed file, in index order.
    """
    if size is None:
        size = dataframe_to_save.shape[0]

    logits_dir = params['numpy_files_logits_path']
    output_loss = []
    for current_batch in range(size):
        out = np.load( logits_dir + '/' + f'loss_batch[{current_batch}]_model[{CODEMODEL}].npy' )
        output_loss.append( out.item() )  # .item() turns the single-element array into a Python scalar
        logging.info(current_batch)
    return output_loss

In [None]:
output_loss = batching_loss() # [WARNING!] Takes time

In [None]:
output_loss

[1.5280216932296753,
 1.2751010656356812,
 0.9239424467086792,
 0.7154776453971863,
 0.9453902244567871,
 1.0772773027420044,
 1.1250700950622559,
 1.26653254032135,
 1.1923243999481201,
 1.7146557569503784,
 0.7318271994590759,
 1.0532130002975464,
 1.049182653427124,
 1.2169959545135498,
 1.2471927404403687,
 1.4219638109207153,
 1.120624303817749,
 1.496616005897522,
 1.1474535465240479,
 0.5680826306343079,
 0.4323500096797943,
 0.6018445491790771,
 1.4369544982910156,
 1.0454716682434082,
 1.61322021484375,
 1.1424901485443115,
 1.136786937713623,
 1.130584716796875,
 1.6420553922653198,
 0.7731624841690063,
 2.6084516048431396,
 1.3456796407699585,
 1.274665355682373,
 0.9134281277656555,
 0.783573567867279,
 0.7675909399986267,
 0.647835910320282,
 1.923689603805542,
 1.8243354558944702,
 1.9578386545181274,
 1.809107780456543,
 2.2381503582000732,
 1.0801299810409546,
 1.9237412214279175,
 1.669828176498413,
 1.9037694931030273,
 2.028780460357666,
 1.6289767026901245,
 1.17454

In [None]:
# Add the per-sample loss as a new column and preview the first row.
dataframe_to_save['loss'] = output_loss
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
458739,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c10,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' __', 0.13518139719963074), ('(', 0.5036469...","[(' detainees', 1.8827131693932575e-12), ('ヘラ'...","[(' run', 0.0024231544230133295), ('_', 0.2052...",1.528022


In [None]:
## Saving checkpoint 2: write the frame again, now including the loss column, to the same CSV path.
dataframe_to_save.to_csv( params['outputs']  )

In [None]:
# Reload the second checkpoint from disk, using the first CSV column as the index.
dataframe_to_save = pd.read_csv( 
                      params['outputs'] , 
                      index_col=0
            )

In [None]:
dataframe_to_save.head(1)

Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes,max_prob,min_prob,actual_prob,loss
458739,225,"[4299, 1057, 62, 29412, 62, 41989, 7, 9288, 62...",c10,"def run_python_tests(test_modules, parallelism...",[],0,13,67,7,15,120,154,"[(' __', 0.13518139719963074), ('(', 0.5036469...","[(' detainees', 1.8827131693932575e-12), ('ヘラ'...","[(' run', 0.0024231544230133295), ('_', 0.2052...",1.528022


# Combining all datasets (last step)