In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
import gc


In [2]:
import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

# Loading Data From Folders

In [5]:
#Available Datasets

def params(current_case='Case2'):
    """Return the paths and identifiers for one of the available model test beds.

    Args:
        current_case: Key of the configuration to use ('Case1' ... 'Case6').
            Defaults to 'Case2', the value that used to be hard-coded. [Hyper]

    Returns:
        dict with the keys 'big_table_path', 'hf_model', 'model_name' and 'callbacks'.

    Raises:
        KeyError: if `current_case` is not one of the known cases.
    """
    # Each entry: (HF model id, concept-table CSV, model-name prefix, callbacks folder)
    code_models = {
        'Case1':('EleutherAI/gpt-neo-125M','codesearch_tesbed_EleutherAI-gpt-neo-125M_10000.csv','EleutherAI-gpt-neo-125M_10000_','callbacks-EleutherAI-gpt-neo-125M_10000_'),
        'Case2':('EleutherAI/gpt-neo-2.7B','codesearch_tesbed_EleutherAI-gpt-neo-2.7B_10000.csv','EleutherAI-gpt-neo-2.7B_10000_','callbacks-EleutherAI-gpt-neo-2.7B_10000_'),
        'Case3':('EleutherAI/gpt-neo-1.3B','codesearch_tesbed_EleutherAI-gpt-neo-1.3B_10000.csv','EleutherAI-gpt-neo-1.3B_10000_','callbacks-EleutherAI-gpt-neo-1.3B_10000_'),
        'Case4':('microsoft/CodeGPT-small-py','codesearch_tesbed_microsoft-CodeGPT-small-py_1024_10000.csv','CodeGPT-small-py_10000_','callbacks-CodeGPT-small-py_10000_'),
        'Case5':('microsoft/CodeGPT-small-py-adaptedGPT2','codesearch_tesbed_microsoft-CodeGPT-small-py-adaptedGPT2_1024_10000.csv','CodeGPT-small-py-adaptedGPT2_10000_','callbacks-CodeGPT-small-py-adaptedGPT2_10000_'),
        'Case6':('Salesforce/codegen-2B-multi','codesearch_tesbed_Salesforce-codegen-2B-multi_10000.csv','Salesforce-codegen-2B-multi_10000_','callbacks-Salesforce-codegen-2B-multi_10000_')
    }
    if current_case not in code_models:
        raise KeyError(f'Unknown case {current_case!r}; available cases: {sorted(code_models)}')

    hf_model, table_file, model_name, callbacks_dir = code_models[current_case]
    return {
            'big_table_path' : '../data/concept_tables/' + table_file,
            'hf_model' :  hf_model,
            'model_name': model_name,
            'callbacks' : '../data/' + callbacks_dir
        }

In [6]:
# Resolve the paths and model identifiers for the selected case, then show the callbacks folder.
parameters = params()
parameters['callbacks']

'../data/callbacks-EleutherAI-gpt-neo-2.7B_10000_'

In [16]:
# Read the concept table of the selected model, using the first CSV column as the index.
data_pd = pd.read_csv(parameters['big_table_path'], index_col=0)

In [17]:
data_pd.head(2)

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids
0,"def encode_images(format_dict):\n """"""b64-en...","[('def', 'def', 'function_definition'), ('enco...","[(4299, 'def', 'function_definition'), (37773,...","[4299, 37773, 62, 17566, 7, 18982, 62, 11600, ...",355
1,"def _process_execute_error(self, msg):\n ...","[('def', 'def', 'function_definition'), ('_pro...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 14681, 62, 41049, 62, 18224, 7, 9...",398


# Softmax Normalization and Data Engineering

In [50]:
#This code works for CodeGPT and Salesforce
# Load the tokenizer that belongs to the Hugging Face model id selected in `parameters`.
# NOTE(review): AutoModelForCausalLM is imported but not used in the visible cells.
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(parameters['hf_model'])

In [None]:
# Tokenize every function body of the concept table in a single batch call.
input_ids_list = tokenizer.batch_encode_plus( list(data_pd.whole_func_string.values) ) #<-- Do not return as a Tensor [cast to list]

In [55]:
# Load the logits array stored in the file tagged 9999 and normalise its first row.
out = np.load(parameters['callbacks']+'/'+parameters['model_name']  + '_logits_tensor[9999]_batch[9999].npy')
soft = torch.nn.Softmax( dim = 0 ) #Flattening normalization
next_token_distribution = soft( torch.from_numpy(out[0]) ) #Flattening normalization
# A probability distribution must sum to 1 within floating-point tolerance.
# (round(x) == 1.0 would accept any sum in [0.5, 1.5) and hide a broken normalisation.)
assert abs( next_token_distribution.sum().item() - 1.0 ) < 1e-3
# The second axis of the logits must match the tokenizer vocabulary size.
assert tokenizer.vocab_size == out.shape[1]

In [58]:
next_token_distribution.topk(k=1, largest =True)

torch.return_types.topk(
values=tensor([0.1830]),
indices=tensor([21412]))

In [68]:
def topk_tuple( vocab_tensor, largest):
    """Return the (token, value) pair of the top-1 entry of a vocabulary-sized tensor.

    Args:
        vocab_tensor: tensor of scores/probabilities; assumed to be 1-D with one
            entry per token id (TODO confirm against callers).
        largest: True selects the maximum entry, False the minimum.

    Returns:
        (token_string, value) where value is a Python float.
    """
    topk = vocab_tensor.topk( k=1 , largest=largest ) #TODO K number of elements can be extended
    token_id = int( topk.indices.item() )
    # Resolve the id through the tokenizer: the key order of get_vocab() is not
    # guaranteed to follow the token ids, and listing it per call costs O(vocab).
    return ( tokenizer.convert_ids_to_tokens( token_id ), topk.values.item() )

def min_max_logits( vocab_tensor ):
    """Return the top-1 maximum and minimum (token, value) pairs of `vocab_tensor`.

    Args:
        vocab_tensor: tensor handed to `topk_tuple` for both lookups.

    Returns:
        (max_case, min_case), each being the tuple produced by `topk_tuple`.
    """
    # Use the argument itself; reading the global `next_token_distribution` here
    # made every call return the same result regardless of its input.
    max_case = topk_tuple( vocab_tensor = vocab_tensor, largest = True) #TST Max Logit
    min_case = topk_tuple( vocab_tensor = vocab_tensor, largest = False) #TST Min Logit
    return max_case, min_case

In [69]:
# Sanity check: most and least probable token of the single distribution computed above.
max_case,min_case = min_max_logits(vocab_tensor = next_token_distribution)
print(max_case,min_case)

('ocating', 0.18295776844024658) ('agy', 2.510304958965471e-15)


In [72]:
input_ids_list[0]

Encoding(num_tokens=355, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [63]:
# NOTE(review): this cell duplicates the "Actual Token Case" cell further down and needs
# `next_token_distributions`, which is only built in the "Batch Softmax Distribution" section.
actual_logit_token_prompt = []
# Id-ordered lookup table (index == token id); the key order of get_vocab() is not
# guaranteed to follow the ids.
token_position_dict = tokenizer.convert_ids_to_tokens( list( range( len(tokenizer) ) ) )
# Iterating the BatchEncoding itself yields its keys, so walk the 'input_ids' lists explicitly.
for p,prompt in enumerate( input_ids_list['input_ids'] ):
    temp_prompt = []
    for c,code_token_position in enumerate( prompt[1:] ): #Eliminate the first token prediction since we do not use it
        # The distribution at position c scores the token observed at position c+1.
        temp_prompt.append(
                ( token_position_dict[int(code_token_position)],next_token_distributions[ p ][ c ][int(code_token_position)] )
            )
    actual_logit_token_prompt.append( temp_prompt )

('agy', tensor([2.5103e-15]))

In [54]:
#maximum case
#topk returns a namedtuple of (values, indices)
#If largest is True then search for the max case
# topk_tuple's parameter is named `vocab_tensor`; the old `vocab_token=` keyword raised a TypeError.
max_logit_token_prompt = [[ topk_tuple(vocab_tensor=vocab_token, largest=True) for vocab_token in prompt ] for prompt in next_token_distributions ]
max_logit_token_prompt[0][:5] # preview only; the full nested list floods the notebook output

50257

1

In [14]:
def np_loader(n_batches=10000):
    """Load the saved per-batch logits and loss arrays from the callbacks folder.

    File names are built from the global `parameters` ('callbacks' and 'model_name').

    Args:
        n_batches: number of consecutive batch files (indices 0 .. n_batches-1) to
            read. Defaults to 10000, the value that used to be hard-coded; pass a
            smaller number for a quick partial load.

    Returns:
        (output_logits, output_loss): two lists of NumPy arrays, one entry per batch.
    """
    # Common prefix of every file name, built once instead of once per file.
    prefix = parameters['callbacks'] + '/' + parameters['model_name']
    output_logits = [
        np.load(f'{prefix}_logits_tensor[{n}]_batch[{n}].npy') for n in range(n_batches)
    ]
    output_loss = [
        np.load(f'{prefix}_loss_batch[{current_batch}].npy') for current_batch in range(n_batches)
    ]
    return output_logits, output_loss

In [15]:
# Load every saved logits and loss array into memory.
output_logits, output_loss = np_loader() #<-----WARNING, Takes time loading!

KeyboardInterrupt: 

In [13]:
output_logits[0]

'../data/callbacks-EleutherAI-gpt-neo-2.7B_10000_/EleutherAI-gpt-neo-2.7B_10000__logits_tensor[0]_batch[0].npy'

In [12]:
#Based on the response here: https://stackoverflow.com/questions/62852940/how-to-get-immediate-next-word-probability-using-gpt2-model
# next_token_logits = output[0][:, -1, :] #last token
output_logits[0].shape #The logits contain the next-token prediction information given the previous window w_{<t}

AttributeError: 'str' object has no attribute 'shape'

In [None]:
output_logits[0][0]

tensor([ -5.6060,  -2.4399,  -6.9558,  ..., -11.8578, -14.6072,  -6.0800],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
sum( output_logits[0][0] ) # raw (non-normalized) logits do not sum to 1

tensor(-553426.4375, device='cuda:0', grad_fn=<AddBackward0>)

In [None]:
next_token_distribution.shape

torch.Size([50257])

## Batch Softmax Distribution

In [None]:
output_logits[:2] # preview only; displaying the whole list floods the notebook output

[tensor([[ -5.6059,  -2.4399,  -6.9557,  ..., -11.8578, -14.6071,  -6.0799],
         [ -6.2990,  -3.9610,  -6.1833,  ..., -10.6406,  -8.8349,  -6.3452],
         [ -4.8018,  -4.1277,  -4.1619,  ...,  -8.1974,  -7.6979,  -6.0528],
         ...,
         [-11.9002, -11.7730,  -8.0985,  ..., -15.8561, -17.3783, -11.6446],
         [ -7.4098,  -9.9632, -12.2038,  ..., -18.2707, -18.1764, -10.3717],
         [-10.0745,  -9.8232, -12.0393,  ..., -25.0120, -21.1442,  -8.6445]],
        grad_fn=<MmBackward0>),
 tensor([[ -5.6059,  -2.4399,  -6.9557,  ..., -11.8578, -14.6071,  -6.0799],
         [ -3.2613,  -3.4885,  -7.5667,  ..., -14.2281, -14.1252,  -6.1792],
         [ -4.1326,  -5.6764,  -5.4887,  ..., -12.1207, -11.4404,  -7.5004],
         ...,
         [-25.2886, -25.3481, -24.2219,  ..., -32.4984, -33.6202, -15.5898],
         [-23.1931, -22.4658, -20.3607,  ..., -23.5468, -28.6755, -14.5011],
         [-16.4041, -14.7310, -12.8584,  ..., -31.2271, -25.5518,  -5.8683]],
        grad_f

In [None]:
# Normalise the logits of every position into a probability distribution over the vocabulary.
# np_loader() returns NumPy arrays, which torch's Softmax cannot consume, so each prompt is
# converted with torch.as_tensor first (existing tensors pass through unchanged).
# Each entry is a 2-D tensor whose rows are the per-position distributions.
next_token_distributions = [ torch.softmax( torch.as_tensor( prompt_tensor ), dim=-1 ) for prompt_tensor in output_logits ]

In [None]:
sum( next_token_distributions[0][0] ) # test: a softmax-normalized row should sum to 1

tensor(1.0000, grad_fn=<AddBackward0>)

### MAX Case

[[('ers', tensor([0.0308], grad_fn=<TopkBackward0>)),
  ('_', tensor([0.2419], grad_fn=<TopkBackward0>)),
  ('name', tensor([0.0194], grad_fn=<TopkBackward0>)),
  ('_', tensor([0.4209], grad_fn=<TopkBackward0>)),
  ('node', tensor([0.2215], grad_fn=<TopkBackward0>)),
  (',', tensor([0.4757], grad_fn=<TopkBackward0>)),
  ('Ġnode', tensor([0.1696], grad_fn=<TopkBackward0>)),
  (',', tensor([0.3714], grad_fn=<TopkBackward0>)),
  ('Ċ', tensor([0.9505], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9688], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9873], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9983], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.4134], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9901], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9468], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9975], grad_fn=<TopkBackward0>)),
  ('Ġ"""', tensor([0.1968], grad_fn=<TopkBackward0>)),
  ('Ċ', tensor([0.4083], grad_fn=<TopkBackward0>)),
  ('Ġ', tensor([0.9993], grad_fn=<TopkBackward0>)

In [None]:
# Attach the per-position maximum (token, value) pairs as a new column of the table.
data_pd['max_prob_case'] = max_logit_token_prompt

In [None]:
data_pd.head(1)

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[(ers, [tensor(0.0308, grad_fn=<UnbindBackward..."


### MIN Case

In [None]:
#minimum case
# topk_tuple's parameter is named `vocab_tensor`; the old `vocab_token=` keyword raised a TypeError.
min_logit_token_prompt = [[ topk_tuple(vocab_tensor=vocab_token, largest=False) for vocab_token in prompt ] for prompt in next_token_distributions ]

In [None]:
# Attach the per-position minimum (token, value) pairs as a new column and preview the first row.
data_pd['min_prob_case'] = min_logit_token_prompt
data_pd.head(1)

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[(ers, [tensor(0.0308, grad_fn=<UnbindBackward...","[(anwhile, [tensor(1.0384e-16, grad_fn=<Unbind..."


### Actual Token Case

In [None]:
#actual token case
actual_logit_token_prompt = []
# Id-ordered lookup table (index == token id); the key order of get_vocab() is not
# guaranteed to follow the ids.
token_position_dict = tokenizer.convert_ids_to_tokens( list( range( len(tokenizer) ) ) )
# Iterating the BatchEncoding itself yields its keys, so walk the 'input_ids' lists explicitly.
for p,prompt in enumerate( input_ids_list['input_ids'] ):
    temp_prompt = []
    for c,code_token_position in enumerate( prompt[1:] ): #Eliminate the first token prediction since we do not use it
        # The distribution at position c scores the token observed at position c+1.
        temp_prompt.append(
                ( token_position_dict[int(code_token_position)],next_token_distributions[ p ][ c ][int(code_token_position)] )
            )
    actual_logit_token_prompt.append( temp_prompt )

In [None]:
# Attach the (token, value) pairs of the tokens actually present in each prompt and preview the first row.
data_pd['actual_prob_case'] = actual_logit_token_prompt
data_pd.head(1)

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[(ers, [tensor(0.0308, grad_fn=<UnbindBackward...","[(anwhile, [tensor(1.0384e-16, grad_fn=<Unbind...","[(Ġget, tensor(0.0024, grad_fn=<SelectBackward..."


In [None]:
def _to_plain_pairs( pairs ):
    """Convert [(token, value), ...] into [(token, float), ...].

    float() accepts both Python floats and single-element tensors, so the
    conversion works whether the value was stored as a tensor or already as a
    number (calling .item() on a plain float raises AttributeError).
    """
    return [ (token, float(value)) for token, value in pairs ]

# Work on a copy so the tensor-valued columns of data_pd stay available.
dataframe_to_save = data_pd.copy()
for prob_column in ('max_prob_case', 'min_prob_case', 'actual_prob_case'):
    dataframe_to_save[prob_column] = dataframe_to_save[prob_column].map( _to_plain_pairs )
dataframe_to_save.head()

Unnamed: 0,whole_func_string,ast_concepts,model_tokenizer_concepts,model_input_ids,model_total_input_ids,max_prob_case,min_prob_case,actual_prob_case
0,"def get_node(self, label):\n """"""\n ...","[('def', 'def', 'function_definition'), ('get_...","[(4299, 'def', 'function_definition'), (651, '...","[4299, 651, 62, 17440, 7, 944, 11, 6167, 2599,...",115,"[(ers, 0.03083181567490101), (_, 0.24190878868...","[(anwhile, 1.03836617568492e-16), (ousy, 3.281...","[(Ġget, 0.0024285861290991306), (_, 0.24190878..."
1,"def execute_pipeline(pipeline, environment_dic...","[('def', 'def', 'function_definition'), ('exec...","[(4299, 'def', 'function_definition'), (12260,...","[4299, 12260, 62, 79, 541, 4470, 7, 79, 541, 4...",492,"[(ers, 0.03083181567490101), (_, 0.18530689179...","[(anwhile, 1.03836617568492e-16), (icester, 3....","[(Ġexecute, 5.97450380155351e-05), (_, 0.18530..."
2,"def _decode(self, data):\n '''\n ...","[('def', 'def', 'function_definition'), ('_dec...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 12501, 1098, 7, 944, 11, 1366, 25...",583,"[(ers, 0.03083181567490101), ((, 0.01664732024...","[(anwhile, 1.03836617568492e-16), (ousy, 4.688...","[(Ġ_, 0.001834857277572155), (dec, 0.001334611..."
3,"def _repr_html_(self):\n """"""\n J...","[('def', 'def', 'function_definition'), ('_rep...","[(4299, 'def', 'function_definition'), (4808, ...","[4299, 4808, 260, 1050, 62, 6494, 41052, 944, ...",221,"[(ers, 0.03083181567490101), ((, 0.01664732024...","[(anwhile, 1.03836617568492e-16), (ousy, 4.688...","[(Ġ_, 0.001834857277572155), (re, 0.0013783742..."
4,"def build_shape(relation, nodes, ways):\n ""...","[('def', 'def', 'function_definition'), ('buil...","[(4299, 'def', 'function_definition'), (1382, ...","[4299, 1382, 62, 43358, 7, 49501, 11, 13760, 1...",454,"[(ers, 0.03083181567490101), (_, 0.28421667218...","[(anwhile, 1.03836617568492e-16), (buquerque, ...","[(Ġbuild, 0.0005856865900568664), (_, 0.284216..."


In [None]:
## Saving File
# Name the file after the model selected in `parameters`; the previous hard-coded
# 'EleutherAI-gpt-neo-125M' name was reused (and overwritten) for every model.
# For Case1 this yields exactly the old path 'output/testbed_base_EleutherAI-gpt-neo-125M.csv'.
os.makedirs( 'output', exist_ok=True ) # to_csv does not create missing folders
dataframe_to_save.to_csv( 'output/testbed_base_' + parameters['hf_model'].replace('/', '-') + '.csv' )