## Canonical evaluation notebook

__Accuracy__ is the proportion of correct predictions among the total number of cases processed. It can be computed with: __Accuracy = (TP + TN) / (TP + TN + FP + FN)__ Where: TP: True positive TN: True negative FP: False positive FN: False negative

In [37]:
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
import os


### Calculating Accuracy

__Calculating the accuracy need to be reviwed fromthe logits__ this step is in stand by

In [38]:
CASE = 'c17'

In [39]:
#Available Datasets

def params(): 
    ## Models are describe in the following spreadsheet: https://docs.google.com/spreadsheets/d/1lJG0M3-WCR-oV5OSMlKsMIN6ERAFrfrVn7iX3Dz6_nY/edit#gid=883975699
    code_models = {
        'c1':('EleutherAI/gpt-neo-125M','EleutherAI-gpt-neo-125M'),
        'c2':('EleutherAI/gpt-neo-1.3B','EleutherAI-gpt-neo-1.3B'),    
        'c3':('EleutherAI/gpt-neo-2.7B','EleutherAI-gpt-neo-2.7B'),
        'c4':(),
        'c5':('Salesforce/codegen-350M-nl','Salesforce-codegen-350M-nl'),
        'c6':('Salesforce/codegen-2B-nl', 'Salesforce-codegen-2B-nl'),
        'c7':(),
        'c8':(),
        'c9':('codeparrot/codeparrot-small-multi','codeparrot-codeparrot-small-multi'),
        'c10':('Salesforce/codegen-350M-multi','Salesforce-codegen-350M-multi'),
        'c11':('Salesforce/codegen-2B-multi','Salesforce-codegen-2B-multi'),
        'c14':('codeparrot/codeparrot-small','codeparrot-codeparrot-small'),
        'c15':('codeparrot/codeparrot','codeparrot-codeparrot'),
        'c16':('Salesforce/codegen-350M-mono','Salesforce-codegen-350M-mono'),
        'c17':('Salesforce/codegen-2B-mono','Salesforce-codegen-2B-mono')
    }
    current_case = CASE
        
    return {
            'hf_model' :  code_models[current_case][0],
            'model_name': code_models[current_case][1],
            'input_vector_df': '../data/ds_raw_logits/'+'out_astevalverticalfiltered_{}.csv'.format(CASE),
            'cache_dir':'../data/hugginface/cache/models' ## <- use this path to save model checkpoints and avoid to use container disk!
    }

In [40]:
#pwd
parameters = params()

## Loading data <- skip this secction if you alreade have a subset and jump to the model load section

Loading datasets from previous raw outputs

In [None]:
path = parameters['input_vector_df']

In [None]:
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df['code'][0]

### Sampling dataset

According to the population size $N=50971$ we can use a sample size of $382$ with a confidence level $z=95%$ and margin error $e=5%$

In [None]:
SAMPLE_SIZE = 382

In [None]:
indexes = np.random.randint(low=0.0, high=len(df), size=SAMPLE_SIZE)
df_subset = df.iloc[indexes]

In [None]:
df['size'].describe()

In [None]:
df_subset=df_subset.reset_index(drop=True)
df_subset.head()

### Create prompt: SMethod signature only

In [None]:
df_subset['prompts'] = df_subset['code'].apply(lambda x: x.split(":")[0])

## Model load <- Run this script from this point if you already have a subset

In [41]:
#This code works for CodeGPT, Salesforce and GPTNeoX
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

if 'gpt' not in parameters['hf_model']:
    tokenizer = AutoTokenizer.from_pretrained(parameters['hf_model'], cache_dir=parameters['cache_dir'])
    model = AutoModelForCausalLM.from_pretrained(parameters['hf_model'],device_map="auto", torch_dtype=torch.float16, cache_dir = parameters['cache_dir'])
    print("model loaded as CausalLM")

Downloading (…)okenizer_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

model loaded as CausalLM


In [42]:
#This code works for GPT-Neo
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
if 'gpt' in parameters['hf_model'] or '':
    model = GPTNeoForCausalLM.from_pretrained( parameters['hf_model'], ignore_mismatched_sizes=True,cache_dir = parameters['cache_dir'] )
    tokenizer = GPT2Tokenizer.from_pretrained( parameters['hf_model'],cache_dir = parameters['cache_dir'] )
    print("model loaded as GPT-Neo")

In [43]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [44]:
model.to( device )

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 2560)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): CodeGenBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=2560, out_features=7680, bias=False)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=2560, out_features=10240, bias=True)
          (fc_out): Linear(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): CodeGenBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dro

### Load sub dataset from checkpoint

In [45]:
SAMPLE_SIZE = 382

In [46]:
path = "../datax/pandas/evaluation/subset_canonical_evaluation_prompt_" + str(SAMPLE_SIZE) + ".json"

In [47]:
df_subset = pd.read_json(path)

Create the tensor vector from the given prompt for each data point

In [48]:
input_ids_list = [torch.tensor(  [tokenizer.encode(input_ids)], dtype = torch.int, device=device ) for input_ids in df_subset.prompts.values]

In [49]:
#encoding_ids_list = tokenizer.batch_encode_plus( list( df_subset.prompts.values ) )


In [50]:
input_ids_list

[tensor([[ 4299,  4808, 39014,    62, 14421,    62,  1370,    62,  8929,  2971,
              7,   944,     8]], device='cuda:0', dtype=torch.int32),
 tensor([[ 4299,  1332,    62, 28665,    62,  7645, 16354,    62,    83, 29291,
             62, 35636,   364,    62, 17143,  2357,  3419]], device='cuda:0',
        dtype=torch.int32),
 tensor([[ 4299,  1332,    62,  4774,    62, 12102,   341,    62,   259,    62,
          24442,    62, 14171,     7,   944,     8]], device='cuda:0',
        dtype=torch.int32),
 tensor([[ 4299,  1332,    62, 17752,    62,   354,   945,   316,     7,   944,
              8]], device='cuda:0', dtype=torch.int32),
 tensor([[ 4299,  1332,    62,  1069,  4516,    62, 13116,    62,  2664,    62,
          28961,    62, 10379, 20212,     7,   944,     8]], device='cuda:0',
        dtype=torch.int32),
 tensor([[ 4299,  1332,    62,  8094,  1525,    62, 48101,    62,  2539,  3419]],
        device='cuda:0', dtype=torch.int32),
 tensor([[4299, 1332,   62, 7783,   

In [51]:
input_ids_list[0]

tensor([[ 4299,  4808, 39014,    62, 14421,    62,  1370,    62,  8929,  2971,
             7,   944,     8]], device='cuda:0', dtype=torch.int32)

In [52]:
#model( input_ids = input_ids_list[0], labels = input_ids_list[0].to(torch.long).to(device) )

In [53]:
df_subset['size'].describe()

count     382.000000
mean      301.971204
std       224.503796
min        65.000000
25%       132.000000
50%       233.500000
75%       412.750000
max      1016.000000
Name: size, dtype: float64

In [54]:
df_subset.head()

Unnamed: 0.1,Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,...,outcome_c10,outcome_vector_c10,outcome_c11,outcome_vector_c11,outcome_c14,outcome_vector_c14,outcome_c15,outcome_vector_c15,outcome_c16,outcome_vector_c16
0,4710,295,"[4299, 4808, 39014, 62, 14421, 62, 1370, 62, 8...",c1,def _apply_current_line_highlight(self):\n ...,[],0,11,32,3,...,def _apply_current_line_highlight(self) -> Non...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ...",def _apply_current_line_highlight(self) -> Non...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ...",def _apply_current_line_highlight(self) -> Non...,"[[318, 485, 4998, 63, 1818, 63, 604, 63, 14337...",def _apply_current_line_highlight(self) -> Non...,"[[318, 485, 4998, 63, 1818, 63, 604, 63, 14337...",def _apply_current_line_highlight(self) -> boo...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ..."
1,37857,237,"[4299, 1332, 62, 28665, 62, 7645, 16354, 62, 8...",c1,def test_column_transformer_tuple_transformers...,[],0,11,32,1,...,def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ...",def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ...",def test_column_transformer_tuple_transformers...,"[[318, 511, 63, 2301, 63, 19754, 63, 2960, 63,...",def test_column_transformer_tuple_transformers...,"[[318, 511, 63, 2301, 63, 19754, 63, 2960, 63,...",def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ..."
2,18219,248,"[4299, 1332, 62, 4774, 62, 12102, 341, 62, 259...",c1,def test_host_validation_in_debug_mode(self):\...,[],0,11,37,2,...,def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25...",def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25...",def test_host_validation_in_debug_mode(self) -...,"[[318, 511, 63, 1102, 63, 6136, 63, 262, 63, 1...",def test_host_validation_in_debug_mode(self) -...,"[[318, 511, 63, 1102, 63, 6136, 63, 262, 63, 1...",def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25..."
3,12285,89,"[4299, 1332, 62, 17752, 62, 354, 945, 316, 7, ...",c1,def test_json_charset(self):\n response...,[],0,10,10,1,...,def test_json_charset(self) -> None:\n ...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,...",def test_json_charset(self) -> None:\n self...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,...",def test_json_charset(self) -> None:\n asse...,"[[318, 511, 63, 1001, 63, 6043, 8, 277, 9, 103...",def test_json_charset(self) -> None:\n ...,"[[318, 511, 63, 1001, 63, 6043, 8, 277, 9, 103...",def test_json_charset(self) -> None:\n ...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,..."
4,13331,188,"[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 26...",c1,def test_exception_report_uses_meta_filtering(...,[],0,10,17,1,...,def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2...",def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2...",def test_exception_report_uses_meta_filtering(...,"[[318, 511, 63, 1971, 63, 3070, 63, 5589, 63, ...",def test_exception_report_uses_meta_filtering(...,"[[318, 511, 63, 1971, 63, 3070, 63, 5589, 63, ...",def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2..."


In [55]:
max_new_tokens = int(df_subset['size'].mean())

In [56]:
generated_list=[]
generated_vector=[]
for i in range(SAMPLE_SIZE):
    generated = model.generate(input_ids_list[i],do_sample=True, pad_token_id=tokenizer.eos_token_id, max_new_tokens=max_new_tokens) # <- max_new_tokens is the mean of the ground truth size
    generated_vector.append(generated)
    generated_list.append(tokenizer.batch_decode(generated, skip_special_tokens=True))

In [57]:

df_subset['outcome_'+CASE] = [tensor[0] for tensor in generated_list]
df_subset['outcome_vector_'+CASE] = [tensor.cpu().numpy() for tensor in generated_vector]

In [58]:
df_subset.head()

Unnamed: 0.1,Unnamed: 0,size,ids,m_name,code,ast_errors,n_ast_errors,ast_levels,n_whitespaces_,complexity,...,outcome_c11,outcome_vector_c11,outcome_c14,outcome_vector_c14,outcome_c15,outcome_vector_c15,outcome_c16,outcome_vector_c16,outcome_c17,outcome_vector_c17
0,4710,295,"[4299, 4808, 39014, 62, 14421, 62, 1370, 62, 8...",c1,def _apply_current_line_highlight(self):\n ...,[],0,11,32,3,...,def _apply_current_line_highlight(self) -> Non...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ...",def _apply_current_line_highlight(self) -> Non...,"[[318, 485, 4998, 63, 1818, 63, 604, 63, 14337...",def _apply_current_line_highlight(self) -> Non...,"[[318, 485, 4998, 63, 1818, 63, 604, 63, 14337...",def _apply_current_line_highlight(self) -> boo...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ...",def _apply_current_line_highlight(self) -> Non...,"[[4299, 4808, 39014, 62, 14421, 62, 1370, 62, ..."
1,37857,237,"[4299, 1332, 62, 28665, 62, 7645, 16354, 62, 8...",c1,def test_column_transformer_tuple_transformers...,[],0,11,32,1,...,def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ...",def test_column_transformer_tuple_transformers...,"[[318, 511, 63, 2301, 63, 19754, 63, 2960, 63,...",def test_column_transformer_tuple_transformers...,"[[318, 511, 63, 2301, 63, 19754, 63, 2960, 63,...",def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ...",def test_column_transformer_tuple_transformers...,"[[4299, 1332, 62, 28665, 62, 7645, 16354, 62, ..."
2,18219,248,"[4299, 1332, 62, 4774, 62, 12102, 341, 62, 259...",c1,def test_host_validation_in_debug_mode(self):\...,[],0,11,37,2,...,def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25...",def test_host_validation_in_debug_mode(self) -...,"[[318, 511, 63, 1102, 63, 6136, 63, 262, 63, 1...",def test_host_validation_in_debug_mode(self) -...,"[[318, 511, 63, 1102, 63, 6136, 63, 262, 63, 1...",def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25...",def test_host_validation_in_debug_mode(self) -...,"[[4299, 1332, 62, 4774, 62, 12102, 341, 62, 25..."
3,12285,89,"[4299, 1332, 62, 17752, 62, 354, 945, 316, 7, ...",c1,def test_json_charset(self):\n response...,[],0,10,10,1,...,def test_json_charset(self) -> None:\n self...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,...",def test_json_charset(self) -> None:\n asse...,"[[318, 511, 63, 1001, 63, 6043, 8, 277, 9, 103...",def test_json_charset(self) -> None:\n ...,"[[318, 511, 63, 1001, 63, 6043, 8, 277, 9, 103...",def test_json_charset(self) -> None:\n ...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,...",def test_json_charset(self) -> None:\n ...,"[[4299, 1332, 62, 17752, 62, 354, 945, 316, 7,..."
4,13331,188,"[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 26...",c1,def test_exception_report_uses_meta_filtering(...,[],0,10,17,1,...,def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2...",def test_exception_report_uses_meta_filtering(...,"[[318, 511, 63, 1971, 63, 3070, 63, 5589, 63, ...",def test_exception_report_uses_meta_filtering(...,"[[318, 511, 63, 1971, 63, 3070, 63, 5589, 63, ...",def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2...",def test_exception_report_uses_meta_filtering(...,"[[4299, 1332, 62, 1069, 4516, 62, 13116, 62, 2..."


### Checkpoint save subset

In [59]:
df_subset.to_json(path)

## BLUE and CodeBLUE score evaluation

In [60]:
import sys
import logging

In [61]:
logging.basicConfig(
    filename="../datax/logs/canonical_evaluation_log.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [62]:
## Params for codebleu: alpha, beta, gamma, theta
params='0.25,0.25,0.25,0.25'
lang= 'python'

In [63]:
# This line is needed to load the local CodeBLEU library. Do not use it to export this notebook!!
sys.path.append('/workspaces/CodeSyntaxConcept/scripts')

In [64]:
## based on microsoft script for calculating codeBLEU in codeSearchNet
import CodeBLEU.bleu as bleu
import CodeBLEU.weighted_ngram_match as weighted_ngram_match
import CodeBLEU.syntax_match as syntax_match
import CodeBLEU.dataflow_match as dataflow_match

In [65]:
def calculate_bleu_codeBleu(lang,params,df, gt_col, pred_col, keywords):
    alpha,beta,gamma,theta = [float(x) for x in params.split(',')]
    # preprocess inputs
    pre_references = [df[gt_col].to_list()]
    hypothesis = df[pred_col].to_list()
    for i in range(len(pre_references)):
        assert len(hypothesis) == len(pre_references[i])

    references = []
    for i in range(len(hypothesis)):
        ref_for_instance = []
        for j in range(len(pre_references)):
            ref_for_instance.append(pre_references[j][i])
        references.append(ref_for_instance)
    assert len(references) == len(pre_references)*len(hypothesis)


    # calculate ngram match (BLEU)
    tokenized_hyps = [x.split() for x in hypothesis]
    tokenized_refs = [[x.split() for x in reference] for reference in references]

    ngram_match_score = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)
    
    # calculate weighted ngram match
    keywords = [x.strip() for x in open(keywords, 'r', encoding='utf-8').readlines()]
    def make_weights(reference_tokens, key_word_list):
        return {token:1 if token in key_word_list else 0.2 \
                for token in reference_tokens}
    tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]\
                for reference_tokens in reference] for reference in tokenized_refs]

    weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights,tokenized_hyps)

    # calculate syntax match
    syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis,lang)

    # calculate dataflow match
    dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis,lang)

    logging.info('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'.\
                        format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score))

    code_bleu_score = alpha*ngram_match_score\
                    + beta*weighted_ngram_match_score\
                    + gamma*syntax_match_score\
                    + theta*dataflow_match_score

    logging.info('CodeBLEU score: '+ str(code_bleu_score))
    return ngram_match_score, code_bleu_score

In [66]:
lang='python'
keywords = '/workspaces/CodeSyntaxConcept/scripts/CodeBLEU/keywords/'+lang+'.txt'

In [67]:
CASE

'c17'

In [68]:
bleuScore, codebleuScore = calculate_bleu_codeBleu(lang,params,df_subset,'code','outcome_'+CASE,keywords)

In [69]:
print('Blue: {} CodeBLEU: {}'.format(bleuScore,codebleuScore) )
logging.info(f"Case: '{CASE}' Blue: '{bleuScore}' CodeBLEU: '{codebleuScore}'")

Blue: 0.01647760506203721 CodeBLEU: 0.1817452442204766


In [70]:
torch.cuda.empty_cache()

## Clean-up model cache. 

This section is optionl, however, consider to delete any previous model from cache if you wont use it later

In [71]:
import shutil

In [72]:
try:
    folder_name= parameters['cache_dir']
    shutil.rmtree(folder_name)
    print(f"The folder '{folder_name}' has been successfully deleted.")
except FileNotFoundError:
    print(f"The folder '{folder_name}' does not exist.")
except OSError as e:
    print(f"Error occurred while deleting the folder '{folder_name}': {e}")

The folder '../data/hugginface/cache/models' has been successfully deleted.
