In [60]:
import ast
import csv
import functools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_theme()

pd.options.display.float_format = '{:.2f}'.format

In [61]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

In [62]:
import warnings
from matplotlib import colors
import os

In [63]:
def param_default():
    """Return the notebook's default configuration as a freshly built dict.

    Keys:
        galeras_path: directory holding the Galeras code-rationales JSON files.
        dataset: base name (without ``.json``) of the dataset file to load.
        dataset_disk_path: on-disk location of the codeparrot validation set.
        model_name: path of the model checkpoint (also used for the tokenizer).
        cache_dir: cache directory handed to the dataset and model loaders.
        sampling_results: directory where generated samples are checkpointed.
        num_samples: number of dataset rows kept for sampling.
    """
    params = {}
    #params['dataset'] = 'codeparrot/codeparrot-clean-valid' #Deprecated
    params['galeras_path'] = '/workspaces/code-rationales/semeru-datasets/semeru/galeras/code_rationales'
    # Alternative dataset names (swap in as needed):
    #   'code_completion_docstring_random_cut_3.8k_30_150_tokens'
    #   'code_completion_docstring_signature_3.8k_30_150_tokens'
    #   'code_completion_docstring_5k_30_150_tokens'
    params['dataset'] = 'code_completion_random_cut_5k_30_512_tokens'
    params['dataset_disk_path'] = '/workspaces/code-rationales/semeru-datasets/codeparrot-clean-valid'
    params['model_name'] = '/workspaces/code-rationales/data/codeparrot-small/checkpoints/checkpoint-29000'
    params['cache_dir'] = '/workspaces/code-rationales/datax/df_cache_dir'
    params['sampling_results'] = '/workspaces/code-rationales/data/sampling/gpt'
    params['num_samples'] = 100
    return params

In [64]:
# Use the first CUDA GPU when PyTorch reports one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Data Loading and Testing

In [65]:
# Save dataset in nfs
#raw_datasets = load_dataset(param_default()['dataset'], cache_dir=param_default()['cache_dir'])
#raw_datasets.save_to_disk(param_default()['dataset_disk_path'])

In [66]:
# Reload with the `json` script
#test_dataset = load_from_disk(param_default()['dataset_disk_path'])
#test_dataset

In [67]:
# Load the configured Galeras JSON file through the generic `json` dataset loader.
test_dataset = load_dataset(
    'json',
    data_files=f"{param_default()['galeras_path']}/{param_default()['dataset']}.json",
    cache_dir=param_default()['cache_dir'],
)

Found cached dataset json (/workspaces/code-rationales/datax/df_cache_dir/json/default-633751337636c627/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [68]:
# Keep only the 'train' entry of the loaded dataset and display its first record.
# NOTE: this rebinds `test_dataset`, so the cell is not idempotent — re-run the
# load cell above before re-running this one.
test_dataset = test_dataset['train']
test_dataset[0]

{'path': 'tests/driver_tests.py',
 'signature': 'def test_positive_integer_or_none()',
 'n_ast_errors': 0,
 'n_ast_nodes': 25,
 'n_words': 5,
 'url': 'https://github.com/EpistasisLab/tpot.git',
 'prompt': "Generate Pyhton code that Assert that the TPOT CLI interface's positive_integer_or_none parsing throws an exception when n < 0. and code starts with def test_positive_integer_or_none():\n    \n    assert_ra:",
 'complexity': 1,
 'id': 181600,
 'file_name': 'driver_tests.py',
 'language': 'Python',
 'repo': 'tpot',
 'commit_message': 'Revert "Deployed 7ccda9a with MkDocs version: 1.3.0"\n\nThis reverts commit bd9629c40e01241766197119b581a99409b07068.',
 'token_counts': 13,
 'random_cut': 'def test_positive_integer_or_none():\n    \n    assert_ra',
 'fun_name': 'test_positive_integer_or_none',
 'n_whitespaces': 11,
 'code': "def test_positive_integer_or_none():\n    \n    assert_raises(Exception, positive_integer_or_none, '-1')\n\n",
 'nloc': 2,
 'docstring': "Assert that the TPOT CLI 

In [69]:
# Restrict the dataset to its first `num_samples` rows (value taken from param_default()).
test_dataset = test_dataset.select(range(param_default()['num_samples']))

In [70]:
###### IMPORTANT: MODIFY the column selection to match the configured dataset
# NOTE(review): param_default()['dataset'] currently names
# code_completion_random_cut_5k_30_512_tokens, yet the "others" variant below is
# the active line — confirm which selection is intended.
#df_sampled_code = test_dataset.to_pandas()[['code','prompt']] #code_completion_random_cut_5k_30_512_tokens
df_sampled_code = test_dataset.to_pandas()[['docstring','code','prompt']] # others

In [71]:
df_sampled_code.describe()

Unnamed: 0,docstring,code,prompt
count,100,100,100
unique,100,100,100
top,Assert that the TPOT CLI interface's positive_...,def test_positive_integer_or_none():\n \n ...,Generate Pyhton code that Assert that the TPOT...
freq,1,1,1


## Model Loading and Testing

In [72]:
# Load the causal language model from the checkpoint path configured in param_default().
model = AutoModelForCausalLM.from_pretrained(
            param_default()['model_name'],
            cache_dir=param_default()['cache_dir'])

In [73]:
# Move the model to the selected device and switch it to evaluation mode.
# model.eval() is the last expression, so the cell output is the module's repr.
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [74]:
model.device

device(type='cuda', index=0)

## Tokenizer Loading and Testing

In [75]:
# Load the tokenizer from the same checkpoint path used for the model.
tokenizer = AutoTokenizer.from_pretrained(param_default()['model_name'])

## Samples Encoding and Filtering

In [76]:
###### IMPORTANT: MODIFY the ground-truth definition to match the configured dataset
# Active variant: ground truth is the docstring, a newline, then the code.
#df_sampled_code['ground_truth'] = df_sampled_code['code'] #code_completion_random_cut_5k_30_512_tokens
df_sampled_code['ground_truth'] = df_sampled_code['docstring'] + '\n' + df_sampled_code['code'] #others

In [77]:
# 'size': number of token ids in each ground-truth text, tokenised one row at a time.
df_sampled_code['size'] = [
    len(tokenizer(text)['input_ids'])
    for text in df_sampled_code['ground_truth']
]
# 'input_ids': token ids of every prompt, produced by one batched tokenizer call.
df_sampled_code['input_ids'] = tokenizer(
    df_sampled_code['prompt'].tolist()
)['input_ids']

In [78]:
# 'docstring' and 'code' are no longer needed once 'ground_truth' exists; drop both at once.
df_sampled_code = df_sampled_code.drop(columns=['docstring', 'code'])

In [79]:
df_sampled_code.head(5)

Unnamed: 0,prompt,ground_truth,size,input_ids
0,Generate Pyhton code that Assert that the TPOT...,Assert that the TPOT CLI interface's positive_...,55,"[6864, 1611, 517, 265, 1233, 626, 10716, 626, ..."
1,Generate Pyhton code that \n Return the maj...,\n Return the major-minor version of the cu...,63,"[6864, 1611, 517, 265, 1233, 626, 5591, 1432, ..."
2,Generate Pyhton code that \n Encode a bytes...,\n Encode a bytestring to a base64 string f...,59,"[6864, 1611, 517, 265, 1233, 626, 5591, 19244,..."
3,Generate Pyhton code that Represents the numbe...,Represents the number as a triple tuple.\n\n ...,52,"[6864, 1611, 517, 265, 1233, 626, 21591, 314, ..."
4,Generate Pyhton code that \n Return the...,\n Return the context variables require...,47,"[6864, 1611, 517, 265, 1233, 626, 4960, 1432, ..."


## Model Sampling Generation

In [80]:
# Largest and mean ground-truth token counts.
print(df_sampled_code['size'].max())
print(df_sampled_code['size'].mean())
# 85th percentile of the ground-truth token counts (displayed as the cell output)
df_sampled_code['size'].quantile(0.85)

146
83.33


112.44999999999997

In [81]:
# Number of sequences generated per prompt.
SAMPLES = 30 #<---- Hardcoded
# Generation length limit: the largest ground-truth token count in the sample.
MAX_GEN_TOK = df_sampled_code['size'].max()
#MAX_GEN_TOK = 112

In [82]:
MAX_GEN_TOK

146

In [24]:
# Smoke test: sample two continuations of a tiny prompt to check that generation works.
inputs = tokenizer(["def hello_world():"], return_tensors="pt")
# Move the encoded prompt to the selected device (the return value is not used).
inputs.to(device)
outputs = model.generate(**inputs, do_sample=True, max_length=MAX_GEN_TOK, top_k=0, num_return_sequences=2, pad_token_id=tokenizer.eos_token_id)
# Uncomment to decode only the tokens that follow the prompt in the first sample:
#tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):])

In [25]:
outputs

tensor([[  318, 16509,    63,  7617,   837,   272,   372,  3839,   342,   421,
           199,   318, 17242,    63,  7617,     8,   589,    12,  4353,    63,
           815,    29,   403,     9,  1035,   488,    26,   272,   372,   413,
           339,   340, 10061,   436,  2688,     8,  8258,    12,   283,  2325,
          3097,   735,   267, 10061,    14,  2325,  3097,   342,   272,   372,
         17242,   342,   421,   199,   318, 13318,  5199,   837,   272,   408,
           272,   372,   282,  1059,   370,  1413,  4993,   272,   408,   272,
           862,    26,   267,   372, 17242,    63,  7617,   342,   272,   871,
           334,  8647,   547,    12, 13483,   304,   267,   327, 13244,  2366,
          1928,  7252,   267,   746,   421,   199,   318, 16509,    63,  7617,
             8,   589,    12,  4353,    63,   815,    29,   403,     9,  1035,
          2155,    26,   272,   340,  4353,    63,   815,   365,   488,    26,
           267,  2967,    67,  7301,   275,  4884,  

In [26]:
torch.tensor(outputs.tolist()).to(model.device)

tensor([[  318, 16509,    63,  7617,   837,   272,   372,  3839,   342,   421,
           199,   318, 17242,    63,  7617,     8,   589,    12,  4353,    63,
           815,    29,   403,     9,  1035,   488,    26,   272,   372,   413,
           339,   340, 10061,   436,  2688,     8,  8258,    12,   283,  2325,
          3097,   735,   267, 10061,    14,  2325,  3097,   342,   272,   372,
         17242,   342,   421,   199,   318, 13318,  5199,   837,   272,   408,
           272,   372,   282,  1059,   370,  1413,  4993,   272,   408,   272,
           862,    26,   267,   372, 17242,    63,  7617,   342,   272,   871,
           334,  8647,   547,    12, 13483,   304,   267,   327, 13244,  2366,
          1928,  7252,   267,   746,   421,   199,   318, 16509,    63,  7617,
             8,   589,    12,  4353,    63,   815,    29,   403,     9,  1035,
          2155,    26,   272,   340,  4353,    63,   815,   365,   488,    26,
           267,  2967,    67,  7301,   275,  4884,  

In [27]:
def df_sampled_generation(
        df_sampled_code, 
        model,
        number_samples = 1,
        max_gen_tok = 100, 
        top_k = 0,
        tokenizer = None
    ):
    """Sample `number_samples` generations for every prompt in `df_sampled_code`.

    Args:
        df_sampled_code: DataFrame with a 'prompt' column of prompt strings.
        model: object exposing `.device` and a `.generate(...)` method
            (the notebook passes the loaded causal language model).
        number_samples: sequences to sample per prompt; also the number of
            generation columns appended to the result.
        max_gen_tok: forwarded to `generate` as `max_length`.
            NOTE(review): in Hugging Face `generate`, `max_length` limits the
            whole sequence (prompt + new tokens), not just the new tokens —
            confirm this is the intended budget.
        top_k: forwarded to `generate` as `top_k`.
        tokenizer: callable used to encode each prompt; must expose
            `eos_token_id`. Defaults to the notebook-level `tokenizer`
            variable, which keeps existing calls working unchanged.

    Returns:
        A DataFrame made of `df_sampled_code.reset_index()` (so the former
        index appears as an 'index' column) followed by one column per sample,
        labelled 0 .. number_samples-1. Each cell of those columns holds the
        full returned sequence as a list of token ids (the prompt tokens are
        not stripped).

    Raises:
        NameError: if `tokenizer` is not given and no notebook-level
            `tokenizer` is defined.
    """
    if tokenizer is None:
        # Fall back to the module-level tokenizer the original code relied on.
        try:
            tokenizer = globals()['tokenizer']
        except KeyError:
            raise NameError("name 'tokenizer' is not defined") from None

    generated_by_sample = {sample_idx: [] for sample_idx in range(number_samples)}
    for prompt in df_sampled_code['prompt']:
        encoded_prompt = tokenizer([prompt], return_tensors="pt")
        encoded_prompt.to(model.device)
        outputs = model.generate(**encoded_prompt, do_sample=True,
                                 max_length=max_gen_tok,
                                 top_k=top_k, 
                                 num_return_sequences=number_samples, 
                                 # The end-of-sequence id doubles as the padding id.
                                 pad_token_id=tokenizer.eos_token_id)
        # Row i of `outputs` is the i-th sampled sequence for this prompt.
        for sample_idx, sequence in enumerate(outputs):
            generated_by_sample[sample_idx].append(sequence.tolist())

    df_generated = pd.DataFrame.from_dict(generated_by_sample)  # DataFrame from generation
    # Reset the index first so rows line up positionally when concatenating.
    return pd.concat([df_sampled_code.reset_index(), df_generated], axis=1)
        
    

In [28]:
# WARNING: time consuming — samples SAMPLES sequences for every prompt in df_sampled_code.
df_generated_input = df_sampled_generation(
    df_sampled_code=df_sampled_code, 
    model=model, 
    number_samples=SAMPLES, 
    max_gen_tok=MAX_GEN_TOK)

In [29]:
df_generated_input.head(5)

Unnamed: 0,index,prompt,ground_truth,size,input_ids,0,1,2,3,4,...,20,21,22,23,24,25,26,27,28,29
0,0,Generate Pyhton code that Check if the grid cl...,Check if the grid client is up.\n\n Check f...,42,"[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...",...,"[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3..."
1,1,Generate Pyhton code that Ensure that powershe...,\n Ensure that powershell processes inl...,142,"[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...",...,"[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7..."
2,2,Generate Pyhton code that Encode a bytestring ...,\n Encode a bytestring to a base64 string f...,59,"[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...",...,"[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ..."
3,3,Generate Pyhton code that Add the arguments fo...,Add the arguments for the protocol to the clie...,97,"[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...",...,"[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2..."
4,4,Generate Pyhton code that Locking should inclu...,Locking should include hashes for *all* platf...,60,"[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...",...,"[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ..."


In [30]:
df_generated_input[0]

0     [6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...
1     [6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...
2     [6864, 1611, 517, 265, 1233, 626, 19244, 282, ...
3     [6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...
4     [6864, 1611, 517, 265, 1233, 626, 15536, 316, ...
                            ...                        
95    [6864, 1611, 517, 265, 1233, 626, 1432, 314, 1...
96    [6864, 1611, 517, 265, 1233, 626, 1432, 314, 6...
97    [6864, 1611, 517, 265, 1233, 626, 2816, 1175, ...
98    [6864, 1611, 517, 265, 1233, 626, 6516, 316, 2...
99    [6864, 1611, 517, 265, 1233, 626, 1432, 3775, ...
Name: 0, Length: 100, dtype: object

### Statistics and Checkpoint

In [31]:
# (mean, std) of the generated sequence lengths, one pair per sample column.
# The number of columns follows SAMPLES rather than a literal 30, and each
# column's length array is built once and reused for both statistics.
np_len_method = [
    (lengths.mean(), lengths.std())
    for lengths in (
        np.array([len(gen_method) for gen_method in df_generated_input[j]])
        for j in range(SAMPLES)
    )
]

In [32]:
np_len_method

[(157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0),
 (157.0, 0.0)]

In [33]:
#Checkpoint of Generation
def checkpoint_generation( df , name = 'output' ):
    """Save `df` as `<sampling_results>/<name>.csv`.

    The target directory comes from param_default()['sampling_results'] and is
    created when missing, so a finished (and expensive) generation run is not
    lost to a missing folder.

    Args:
        df: DataFrame to persist (written with its index).
        name: file name without the `.csv` extension.

    Returns:
        None.
    """
    out_dir = param_default()['sampling_results']
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_dir + '/' + name + '.csv')

In [34]:
checkpoint_generation( df = df_generated_input, name=param_default()['dataset'])

In [35]:
# Reload the checkpoint written above, using the first CSV column as the index.
# NOTE: after this round-trip the token-id lists are strings and the sample
# columns are addressed by string labels (e.g. '1'), as the decoding cell below does.
df_generated_input = pd.read_csv( param_default()['sampling_results'] + '/' + param_default()['dataset'] +'.csv' , index_col=0)

In [36]:
df_generated_input.head()

Unnamed: 0,index,prompt,ground_truth,size,input_ids,0,1,2,3,4,...,20,21,22,23,24,25,26,27,28,29
0,0,Generate Pyhton code that Check if the grid cl...,Check if the grid client is up.\n\n Check f...,42,"[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...",...,"[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3...","[6864, 1611, 517, 265, 1233, 626, 2670, 340, 3..."
1,1,Generate Pyhton code that Ensure that powershe...,\n Ensure that powershell processes inl...,142,"[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...",...,"[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7...","[6864, 1611, 517, 265, 1233, 626, 7523, 626, 7..."
2,2,Generate Pyhton code that Encode a bytestring ...,\n Encode a bytestring to a base64 string f...,59,"[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...",...,"[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ...","[6864, 1611, 517, 265, 1233, 626, 19244, 282, ..."
3,3,Generate Pyhton code that Add the arguments fo...,Add the arguments for the protocol to the clie...,97,"[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...",...,"[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2...","[6864, 1611, 517, 265, 1233, 626, 2654, 314, 2..."
4,4,Generate Pyhton code that Locking should inclu...,Locking should include hashes for *all* platf...,60,"[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...",...,"[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ...","[6864, 1611, 517, 265, 1233, 626, 15536, 316, ..."


In [37]:
df_generated_input.describe()

Unnamed: 0,index,size
count,100.0,100.0
mean,49.5,83.97
std,29.01,36.43
min,0.0,29.0
25%,24.75,50.0
50%,49.5,82.5
75%,74.25,115.0
max,99.0,157.0


In [38]:
# Decoding test: compare the prompt of row 1 with sample '1' generated for it.
# Cells reloaded from the CSV hold each token-id list as text; ast.literal_eval
# parses literals only, whereas eval would execute whatever code the file contains.
decoded_input = tokenizer.decode(ast.literal_eval(df_generated_input['input_ids'][1]))
decoded_output = tokenizer.decode(ast.literal_eval(df_generated_input['1'][1]))
print(decoded_input)
print('-'*100)
print(decoded_output)

Generate Pyhton code that Ensure that powershell processes inline script in args with powershell
        core :
----------------------------------------------------------------------------------------------------
Generate Pyhton code that Ensure that powershell processes inline script in args with powershell
        core : Python object you need to start the Python window
        args : PyHTonCmd arguments to be run here, e.g., powershell.py
        """  
        core.powershell()   
        return 0    
    # implement pyhton
    # add error hook.
    # add trace hook.  This is useful if we need from urglue tracebacks.
    # no sanity checking this could go back to pyhon.processStream() whenever it works.
    def main(self, argv, debug):
       
        Parse arguments and ensure it feasible.
        """  
        if argv[1:]:
            argv = argv[1:]
        gpu =


In [39]:
## Memory deallocation: ask PyTorch to release its cached CUDA memory
torch.cuda.empty_cache()