In [1]:
import pandas as pd
import os
import time
import numpy as np
import torch
import gc


In [2]:
import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

In [3]:
torch.__version__

'1.12.1+cu113'

In [4]:
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda', index=0)

In [5]:
#test
x = torch.rand(5, 3, 
               device=device#'cpu'
               )
print(x)

tensor([[0.3478, 0.4097, 0.3891],
        [0.7176, 0.8822, 0.0272],
        [0.8784, 0.1918, 0.7899],
        [0.4318, 0.0703, 0.1718],
        [0.5152, 0.7167, 0.1274]], device='cuda:0')


In [6]:
x.get_device()

0

In [7]:
#Deleting tensor to free memory
del x
torch.cuda.empty_cache()

In [8]:
torch.cuda.memory_allocated()

0

In [7]:
! nvidia-smi

Wed Apr 19 19:01:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   31C    P0    33W / 250W |      7MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Logits Extractor
>
> Extracting Tensor Logits from a given Neural Code Model @danaderp
>

In [9]:
def c_eleuther( returnModel = False, model_type =  'EleutherAI/gpt-neo-125m'):
    ''' Eleuther and Salesforce and Parrot uses the same importation'''
    from transformers import AutoTokenizer, AutoModelForCausalLM
    tokenizer = AutoTokenizer.from_pretrained(model_type)
    if returnModel:
        print("Uploading:",model_type)
        model = AutoModelForCausalLM.from_pretrained(model_type)
    else:
        model = []
    return (tokenizer, model)
    
    

def init_model_args( current_case = 'c1', returnModel = False ): 
    
    code_models = {
        'c1':('gpt-neo-125m',) + c_eleuther( returnModel = returnModel,  model_type = 'EleutherAI/gpt-neo-125m' ), # Basic (on Pile) GPT-3/J
        'c2':('gpt-neo-1.3B',) + c_eleuther( returnModel = returnModel,  model_type = 'EleutherAI/gpt-neo-1.3B' ),
        'c3':('gpt-neo-2.7B',) + c_eleuther( returnModel = returnModel,  model_type = 'EleutherAI/gpt-neo-2.7B' ),
        'c4':('gpt-j-6b',) + c_eleuther( returnModel = returnModel,  model_type = 'EleutherAI/gpt-j-6b' ),
        'c5':('codegen-350M-nl',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-350M-nl' ), #Basic (on Pile) codegen
        'c6':('codegen-2B-nl',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-2B-nl' ),
        'c7':('codegen-6B-nl',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-6B-nl' ),
        'c8':('codegen-16B-nl',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-16B-nl' ),
        'c9':('codeparrot-small-multi',) + c_eleuther( returnModel = returnModel,  model_type = 'codeparrot/codeparrot-small-multi' ), #multi-Language
        'c10':('codegen-350M-multi',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-350M-multi' ),
        'c11':('codegen-2B-multi',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-2B-multi' ),
        'c12':('codegen-6B-multi',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-6B-multi' ),
        'c13':('codegen-16B-multi',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-16B-multi' ),
        'c14':('codeparrot-small',) + c_eleuther( returnModel = returnModel,  model_type = 'codeparrot/codeparrot-small' ), #mono-Language
        'c15':('codeparrot',) + c_eleuther( returnModel = returnModel,  model_type = 'codeparrot/codeparrot' ),
        'c16':('codegen-350M-mono',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-350M-mono' ),
        'c17':('codegen-2B-mono',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-2B-mono' ),
        'c18':('codegen-6B-mono',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-6B-mono' ),
        'c19':('codegen-16B-mono',) + c_eleuther( returnModel = returnModel,  model_type = 'Salesforce/codegen-16B-mono' ),
    }
    
    r = code_models[current_case]
    
    data_path ='../datax/testbeds/AstEvalFilteredV1.json' #<-- HYPER AstEvalFilteredV1
    pd_data = pd.read_json( data_path ) #Data Uploading
    
    numpy_files_logits_path = '../datax/1_numpy_files_logits/' + current_case #outputpath
    return r[0], r[1], r[2], pd_data, numpy_files_logits_path

In [10]:
#Available Datasets
# Case1: codesearch_tesbed_EleutherAI-gpt-neo-125M_10000 for the model 'EleutherAI/gpt-neo-125M' 
# Case2: codesearch_tesbed_EleutherAI-gpt-neo-2.7B_10000 for the model 'EleutherAI/gpt-neo-2.7B' <-- MEMORY CONSTRAINTS

def params(): 
    
    code_models = {
        'Case3':('EleutherAI/gpt-neo-1.3B','codesearch_tesbed_EleutherAI-gpt-neo-1.3B_10000.csv','EleutherAI-gpt-neo-1.3B_10000_','callbacks-EleutherAI-gpt-neo-1.3B_10000_'),
        'Case4':('microsoft/CodeGPT-small-py','codesearch_tesbed_microsoft-CodeGPT-small-py_1024_10000.csv','CodeGPT-small-py_10000_','callbacks-CodeGPT-small-py_10000_'),
        'Case5':('microsoft/CodeGPT-small-py-adaptedGPT2','codesearch_tesbed_microsoft-CodeGPT-small-py-adaptedGPT2_1024_10000.csv','CodeGPT-small-py-adaptedGPT2_10000_','callbacks-CodeGPT-small-py-adaptedGPT2_10000_'),
        'Case6':('Salesforce/codegen-2B-multi','codesearch_tesbed_Salesforce-codegen-2B-multi_10000.csv','Salesforce-codegen-2B-multi_10000_','callbacks-Salesforce-codegen-2B-multi_10000_')
    }
    current_case = 'Case4' #<----[Hyper]
    
    #print(code_models[current_case][1])
    
    return {
            'big_table_path' : '../data/concept_tables/' + code_models[current_case][1],
            'hf_model' :  code_models[current_case][0],
            'model_name': code_models[current_case][2],
            'callbacks' : '../data/' + code_models[current_case][3],
            'wpe':1024  #<----[Hyper]
}

# Init Parameters
> Loading Models and Testbeds


In [None]:
CODEMODEL = 'c1' #Hyper

In [10]:
name, tokenizer, model, pd_data, numpy_files_logits_path = init_model_args(current_case = CODEMODEL, returnModel = True) #[WARNING!] Check the parameters before calling it. 

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/526M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/797M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/11.3G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/995 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/14.3G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/32.2G [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/247M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/797M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/14.3G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/32.2G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/457M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/797M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/14.3G [00:00<?, ?B/s]

In [None]:
#Testing name and output_path
print( name, numpy_files_logits_path )

In [13]:
#Testing data loads
pd_data.describe() 

Unnamed: 0,model_total_input_ids
count,10000.0
mean,322.23
std,220.374923
min,41.0
25%,156.0
50%,253.0
75%,436.0
max,1022.0


# Extracting Logits From a Given Model

In [16]:
prompts = pd_data[CODEMODEL + '_ids'].values

In [17]:
#Evaluating preprocessed ids
input_ids_list =  [eval(ids_vector) for ids_vector in prompts] #<-- Do not return as a Tensor

In [18]:
#Casting Integers to Tensor Integers. Make sure the tesor is created in a device
#We ignored the parameter attention_mask since we are not using masking here [https://huggingface.co/transformers/v4.10.1/glossary.html#attention-mask]

tf_input_ids = [torch.tensor(  input_ids, dtype = torch.int, device=device ) for input_ids in input_ids_list ]

In [19]:
#It should be same size
assert len(input_ids_list) == len(prompts)

### Loading Model to Memoery

In [20]:
model.to( device ) #WARNING, Verify the device before assigning to memory

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50001, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [21]:
parameters #Verification Point of Parameters

{'big_table_path': '../data/concept_tables/codesearch_tesbed_microsoft-CodeGPT-small-py_1024_10000.csv',
 'hf_model': 'microsoft/CodeGPT-small-py',
 'model_name': 'CodeGPT-small-py_10000_',
 'callbacks': '../data/callbacks-CodeGPT-small-py_10000_',
 'wpe': 1024}

### Executing Logits

In [22]:
def logit_extractor(batch, input, from_index=0):
    """
    Output is the class CausalLMOutputWithPast (https://huggingface.co/transformers/v4.10.1/main_classes/output.html?highlight=causallmoutputwithpast)"
    logits (torch.FloatTensor of shape (batch_size, sequence_length, config.vocab_size)) – Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    The expression i.type(torch.LongTensor).to(device) is for casting labels for the loss
    """
    #Output is in CausalLMOutputWithPast

    for idx, n in enumerate( range( from_index, len(input), batch) ):
        output = [ model( 
            input_ids = i, 
            labels = i.type(torch.LongTensor).to(device) 
            ) for i in input[n:n+batch] ] #Labels must be provided to compute loss
    
        output_logits = [ o.logits.detach().to('cpu').numpy() for o in output ]  #Logits Extraction
        output_loss = np.array([ o.loss.detach().to('cpu').numpy() for o in output ])  #Language modeling loss (for next-token prediction).

        #Saving Callbacks
        current_batch = idx + (from_index//batch)
        for jdx, o_logits in enumerate( output_logits ):
            np.save( numpy_files_logits_path+ '/'+ f'logits_tensor[{jdx+n}]_batch[{current_batch}]_model[{CODEMODEL}].npy', o_logits) #Saving LOGITS
        np.save( numpy_files_logits_path+ '/'+f'loss_batch[{current_batch}]_model[{CODEMODEL}].npy', output_loss) #Saving LOSS
        
        print(f"Batch [{current_batch}] Completed")
        
        #Memory Released
        for out in output:
            del out.logits
            torch.cuda.empty_cache()
            del out.loss
            torch.cuda.empty_cache()
        for out in output_logits:
            del out
            torch.cuda.empty_cache()
        for out in output_loss:
            del out
            torch.cuda.empty_cache()
        del output
        del output_logits
        del output_loss
    
    pass

In [23]:
## ACTUAL EXPERIMENT
## TIME AND MEMORY CONSUMING
logit_extractor(
    batch = 1, 
    input = tf_input_ids, 
    from_index=0
)

Batch [0] Completed
Batch [1] Completed
Batch [2] Completed
Batch [3] Completed
Batch [4] Completed
Batch [5] Completed
Batch [6] Completed
Batch [7] Completed
Batch [8] Completed
Batch [9] Completed
Batch [10] Completed
Batch [11] Completed
Batch [12] Completed
Batch [13] Completed
Batch [14] Completed
Batch [15] Completed
Batch [16] Completed
Batch [17] Completed
Batch [18] Completed
Batch [19] Completed
Batch [20] Completed
Batch [21] Completed
Batch [22] Completed
Batch [23] Completed
Batch [24] Completed
Batch [25] Completed
Batch [26] Completed
Batch [27] Completed
Batch [28] Completed
Batch [29] Completed
Batch [30] Completed
Batch [31] Completed
Batch [32] Completed
Batch [33] Completed
Batch [34] Completed
Batch [35] Completed
Batch [36] Completed
Batch [37] Completed
Batch [38] Completed
Batch [39] Completed
Batch [40] Completed
Batch [41] Completed
Batch [42] Completed
Batch [43] Completed
Batch [44] Completed
Batch [45] Completed
Batch [46] Completed
Batch [47] Completed
Ba

OSError: [Errno 122] Disk quota exceeded

In [35]:
#logit_extractor(batch =2, input= input_ids_list[:2]) #<---- [WARNING TIME AND MEMORY CONSUMING]

In [36]:
#output_logits = np.load('../data/callbacks/logits_tensor[0]_batch[0].npy')

In [37]:
#assert output_logits.shape[0] == len(input_ids_list[0])

In [38]:
#output_loss = np.load('../data/callbacks/loss_batch[0].npy')

In [39]:
#output_loss