In [21]:

import logging
from pathlib import Path

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from accelerate import Accelerator
import tensorflow as tf



In [None]:
from datasets import load_dataset

In [None]:
from torch.utils.data.dataloader import DataLoader

In [None]:
from torch.utils.data import IterableDataset

# 0.0 Model evaluation for model generation

This part of the notebook loads a JSON testbed, generates the predictions, aligns the output of each generation against the ground truth, and then measures the distance. No bootstrapping is applied!

In [None]:
class ConstantLengthDataset(IterableDataset):
    """Iterable dataset that packs text examples into fixed-length token tensors.

    Texts read from ``dataset[...][field]`` are buffered until roughly
    ``seq_length * chars_per_token * num_of_sequences`` characters are
    collected, tokenized in one call, joined with ``tokenizer.bos_token_id``
    as separator, and sliced into chunks of exactly ``seq_length`` tokens.
    A trailing chunk shorter than ``seq_length`` is dropped.

    Args:
        tokenizer: callable returning a mapping with an ``"input_ids"`` entry
            (one list of token ids per input text); must expose ``bos_token_id``.
        dataset: iterable of mapping-like examples.
        field: key of the text to read from every example.
        seq_length: number of tokens in every yielded tensor.
        num_of_sequences: scales the character budget of one buffer.
        chars_per_token: scales the character budget of one buffer.
    """

    def __init__(self, tokenizer, dataset, field, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget collected before each tokenizer call.
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.field=field

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.input_characters:
                try:
                    buffer.append(next(iterator)[self.field])
                except StopIteration:
                    more_examples = False
                    break
                buffer_len += len(buffer[-1])
            if not buffer:
                # Nothing was collected (dataset exhausted exactly at a buffer
                # boundary, or a non-positive character budget): stop instead
                # of sending an empty batch to the tokenizer.
                break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [None]:
def create_dataloader(args,tokenizer):
    """Build the evaluation dataset and its DataLoader from the configured test bed.

    Args:
        args: configuration mapping; reads ``test_bed_name``, ``data_path``,
            ``field``, ``seq_length`` and ``batch_size``.
        tokenizer: tokenizer handed to ``ConstantLengthDataset``.

    Returns:
        ``(valid_dataset, eval_dataloader)``: the packed dataset and the
        DataLoader wrapping it. Both are returned because the driver cell
        unpacks two values from this call.
    """
    data_files  = {"test":args['test_bed_name']}
    valid_data = load_dataset(args['data_path'], data_files=data_files, split="test")
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, args['field'], seq_length=args['seq_length'])
    eval_dataloader = DataLoader(valid_dataset, batch_size=args['batch_size'])
    return valid_dataset, eval_dataloader

In [None]:
def evaluate(args,model,eval_dataloader):
    """Compute the mean language-modelling loss and its perplexity.

    Each batch is fed to ``model`` as both input and labels. The scalar batch
    loss is repeated ``args['batch_size']`` times and gathered through the
    notebook-level ``accelerator`` before averaging.

    Args:
        args: configuration mapping; reads ``batch_size`` and ``max_eval_steps``
            (a value <= 0 disables the step limit).
        model: causal LM whose forward output exposes ``.loss``.
        eval_dataloader: iterable of token-id batches.

    Returns:
        ``(loss, perplexity)`` as Python floats.
    """
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args['batch_size'])
        losses.append(accelerator.gather(loss))

        if args['max_eval_steps'] > 0 and step >= args['max_eval_steps']:
            break
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to `inf` on overflow instead of raising, so no
    # OverflowError handling is needed and the result is always a tensor.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()




In [None]:
def param_default():
    """Return the default configuration for evaluating the codeparrot checkpoint.

    Returns:
        dict with the checkpoint location (``Path``), dataset location and
        test-bed file name (``str``), output folders, the seed, and the
        evaluation settings (``seq_length``, ``batch_size``, ``field``,
        ``max_eval_steps``; -1 means no step limit).
    """
    model_name = 'codeparrot-small' #<-- Scope
    test_bed_name='code_completion_random_cut_5k_30_512_tokens.json'
    semeru_datases_path= '/workspaces/code-rationales/'
    # Dataset folder passed to `load_dataset`; the test bed file is selected
    # through `test_bed_name`.
    data_path= semeru_datases_path+'semeru-datasets/semeru/galeras/code_rationales'
    return {
        'out_processed' : '/datasets/out_processed/',
        'checkpoint_file': Path(semeru_datases_path+'data/'+model_name+'/checkpoints/checkpoint-29000'), #Model
        'output_results' : 'results/' ,
        'seed': 1,
        'data_path': data_path,
        'test_bed_name':test_bed_name,
        'seq_length': 64,
        'batch_size': 2,
        'field': "random_cut",
        'max_eval_steps':-1
    }

In [None]:
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
device

In [None]:
torch.cuda.memory_allocated()

In [None]:
# Driver cell: loads the checkpoint, builds the evaluation dataloader and
# logs the evaluation loss / perplexity of the model on the test bed.

# Setup Accelerator
accelerator = Accelerator()
params = param_default()
# Seed the RNGs from the configuration for reproducibility
set_seed(params['seed'])

# Logging (INFO level, timestamped records)
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)

# Load model and tokenizer from the same checkpoint directory
checkpoint = params['checkpoint_file']
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = model.to( device ) #WARNING, Verify the device before assigning to memory

# Load dataset and dataloader; this unpacking expects `create_dataloader`
# to return a (dataset, dataloader) pair.
valid_dataset, eval_dataloader = create_dataloader(params,tokenizer)

# Prepare everything with our `accelerator`.
model, valid_dataset, eval_dataloader = accelerator.prepare(model, valid_dataset, eval_dataloader)

# Evaluate the loaded checkpoint. Nothing is saved in this cell, despite the
# wording of the log message below.
logger.info("Evaluating and saving model after training")
eval_loss, perplexity = evaluate(params, model, eval_dataloader)
logger.info(f"loss/eval: {eval_loss}, perplexity: {perplexity}")

## Naive test for code completion

Testing a basic code-generation example with the codeparrot model, using the given checkpoint from the compatibilization process.

In [None]:
#device ="cpu"

In [None]:
# Smoke test: ask the checkpoint to continue a single hand-written prompt.
prompt="def test_frequency_condition_alone(self):\n        prev_hour = timezone.now() - timedelta(hours=1)"
params = param_default()

#torch.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(params['checkpoint_file'])
tokenizer = AutoTokenizer.from_pretrained(params['checkpoint_file'])
model = model.to( device ) #WARNING, Verify the device before assigning to memory
# The prompt tensor must live on the same device as the model, otherwise
# `generate` fails when `device` is a GPU.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Sampling is stochastic (do_sample=True); max_length caps prompt + completion tokens.
outputs = model.generate(input_ids, do_sample=True, max_length=128)

tokenizer.batch_decode(outputs, skip_special_tokens=True)


# 1.0 Evaluation for codeparrot using the sampling testbeds

## 1.1 Outcome generation & Levenshtein evaluation

In [None]:
## This iterator is NOT working for batches > 1!!
class ConstantTokenLengthDataset(IterableDataset):
    """Iterable dataset yielding one padded/truncated prompt tensor per example.

    At most ``num_of_sequences`` examples are read from ``dataset``. Each text
    is first cut to ``int(num_of_tokens * 3.6)`` characters, then all prompts
    are tokenized in a single call with ``max_length=num_of_tokens``,
    ``padding=True`` and ``truncation=True``. The character-truncated prompts
    of the most recent iteration are exposed in ``self.prompts`` in the same
    order as the yielded tensors.

    Args:
        tokenizer: tokenizer exposing ``model_max_length``; called with
            ``return_tensors="pt"``.
        dataset: iterable of mapping-like examples.
        field: key of the text to read from every example.
        num_of_tokens: token length requested from the tokenizer (capped at
            ``tokenizer.model_max_length``).
        num_of_sequences: maximum number of examples to use.
    """

    def __init__(self, tokenizer, dataset, field, num_of_tokens=64, num_of_sequences=1024):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.num_of_tokens = min(num_of_tokens, tokenizer.model_max_length)
        self.field=field
        # Character budget per prompt, derived from the token budget.
        self.input_char = int(self.num_of_tokens*3.6)
        self.num_of_sequences=num_of_sequences
        self.prompts=[]

    def __iter__(self):
        # Rebuild the prompt list on every pass so repeated iteration does not
        # accumulate duplicates and `prompts[step]` stays aligned with the
        # yielded tensors.
        prompts = []
        for i, buffer in enumerate(self.dataset):
            # Stop before reading more than `num_of_sequences` examples.
            if i >= self.num_of_sequences:
                break
            text = buffer[self.field]
            size = min(len(text), self.input_char)
            prompts.append(text[:size])
        self.prompts = prompts
        if not prompts:
            # Nothing to tokenize; avoid calling the tokenizer with an empty batch.
            return
        tokenized_inputs = self.tokenizer(prompts, max_length= self.num_of_tokens, padding=True, truncation=True, return_tensors="pt")["input_ids"]
        for tokenized_input in tokenized_inputs:
            # Rows are already tensors; copy them instead of re-wrapping with torch.tensor().
            yield tokenized_input.clone().detach()

In [None]:
def create_dataloader(args,tokenizer):
    """Load the configured test bed and wrap it for one-prompt-at-a-time generation.

    Args:
        args: configuration mapping; reads ``test_bed_name``, ``data_path``,
            ``field`` and ``seq_length``.
        tokenizer: tokenizer handed to ``ConstantTokenLengthDataset``.

    Returns:
        ``(valid_dataset, eval_dataloader)`` where the DataLoader uses a batch
        size of 1.
    """
    test_split = load_dataset(
        args['data_path'],
        data_files={"test": args['test_bed_name']},
        split="test",
    )
    valid_dataset = ConstantTokenLengthDataset(
        tokenizer,
        test_split,
        args['field'],
        num_of_tokens=args['seq_length'],
    )
    return valid_dataset, DataLoader(valid_dataset, batch_size=1)

In [None]:
def generate_outcomes(args,model,eval_dataloader,valid_data):
    """Sample one generation per dataloader batch and pair it with its prompt.

    Relies on the notebook-level ``tokenizer`` for the pad token id and for
    decoding.

    Args:
        args: configuration mapping; reads ``max_eval_steps`` (a value <= 0
            disables the step limit).
        model: model exposing ``eval()`` and ``generate()``.
        eval_dataloader: iterable of input-id batches.
        valid_data: object whose ``prompts[step]`` is the text of batch ``step``.

    Returns:
        list of ``{"prompt": str, "outcome": list_of_decoded_strings}`` dicts.
    """
    model.eval()
    collected = []
    for step, inputs in enumerate(eval_dataloader):
        with torch.no_grad():
            generated = model.generate(inputs, do_sample=True, max_length=128,  pad_token_id=tokenizer.eos_token_id)
            decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        collected.append({"prompt": valid_data.prompts[step], "outcome": decoded})
        limit_reached = args['max_eval_steps'] > 0 and step >= args['max_eval_steps']
        if limit_reached:
            break
    return collected


In [None]:
# Driver cell: loads the checkpoint, builds the prompt dataloader and samples
# one generation per prompt of the test bed.

# Setup Accelerator
accelerator = Accelerator()
params = param_default()
# Seed the RNGs from the configuration for reproducibility
set_seed(params['seed'])

# Logging (INFO level, timestamped records)
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)

# Load model and tokenizer from the same checkpoint directory
checkpoint = params['checkpoint_file']
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Reuse the EOS token as padding token; the dataset tokenizes with padding=True.
tokenizer.pad_token = tokenizer.eos_token
model = model.to( device ) #WARNING, Verify the device before assigning to memory

# Load dataset and dataloader
valid_dataset, eval_dataloader = create_dataloader(params,tokenizer)

# Prepare everything with our `accelerator`.
model, valid_dataset, eval_dataloader = accelerator.prepare(model, valid_dataset, eval_dataloader)

# Generate the outcomes. Nothing is evaluated or saved in this cell, despite
# the wording of the log message below.
logger.info("Evaluating and saving model after training")
outcomes = generate_outcomes(params, model, eval_dataloader,valid_dataset)
logger.info(f"outomces: {len(outcomes)}")

In [None]:
outcomes

In [None]:
import pandas as pd
import textdistance

In [None]:
levenshtein_similarity = textdistance.levenshtein


In [None]:
# Normalized Levenshtein similarity between each (stripped) prompt and the
# first decoded generation stored for it, one value per entry of `outcomes`.
lev_calc = [levenshtein_similarity.normalized_similarity(x["prompt"].strip(), x["outcome"][0].strip() ) for x in outcomes]


In [None]:
df = pd.DataFrame(outcomes)
df = df.assign(lev_sim=lev_calc)

In [None]:
df.describe()

## 1.2 Evaluation model from samples

In [None]:
import pandas as pd
import textdistance
import ast

In [None]:
path = "/workspaces/code-rationales/data/sampling/gpt/code_completion_docstring_5k_30_150_tokens.csv"

In [None]:
testbeds = ['docstring','docst_randcut','docst_sign','randcut']

In [None]:
df = pd.read_csv(path, index_col=0)

In [None]:
df.head()

In [None]:
df.shape

In [14]:
def param_default():
    """Return the default configuration for scoring the pre-sampled test beds.

    All locations are derived from ``/workspaces/code-rationales`` and are
    returned as plain strings.

    Returns:
        dict with the checkpoint directory, the sampled test-bed CSV, the log
        file, output folders, the seed and the evaluation settings
        (``max_eval_steps`` of -1 means no step limit).
    """
    test_bed_name = 'code_completion_docstring_5k_30_150_tokens.csv'
    root = Path('/workspaces/code-rationales')
    data_dir = root / 'data'
    checkpoint_dir = data_dir / 'codeparrot-small/checkpoints/checkpoint-29000'
    test_bed_file = data_dir / 'sampling/gpt' / test_bed_name
    log_file = root / 'datax' / 'logs/logs.log'

    config = {}
    config['out_processed'] = '/datasets/out_processed/'
    config['checkpoint_file'] = str(checkpoint_dir)  # model checkpoint
    config['output_results'] = 'results/'
    config['seed'] = 1
    config['data_path'] = str(test_bed_file)
    config['test_bed_name'] = test_bed_name
    config['seq_length'] = 64
    config['batch_size'] = 2
    config['field'] = "random_cut"
    config['max_eval_steps'] = -1
    config['log_path'] = str(log_file)
    return config

In [None]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
device = "cpu"

In [None]:
torch.device(device)

In [None]:
# Setup cell for the sample-based evaluation: seeds the RNGs, routes logging
# to the configured log file and loads the checkpoint.
accelerator = Accelerator()
params = param_default()
# Parse configuration
set_seed(params['seed'])
# Logging
# force=True replaces handlers installed by earlier basicConfig calls in this
# kernel; without it this call is a no-op once the root logger is configured
# and nothing is written to `log_path`.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, filename=params['log_path'],
    force=True,
)
logger = logging.getLogger(__name__)


# Load model and tokenizer
checkpoint = params['checkpoint_file']
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#tokenizer.pad_token = tokenizer.eos_token
model = model.to( device ) #WARNING, Verify the device before assigning to memory

# Prepare everything with our `accelerator`.
model = accelerator.prepare(model)



In [None]:
levenshtein_similarity = textdistance.levenshtein


In [None]:
def convert_df_string_to_array(df,column):
    """Parse a column of Python-literal strings (e.g. ``"[1, 2]"``) in place.

    String cells are parsed with ``ast.literal_eval``; cells that are not
    strings are left as they are, so calling this twice on the same column is
    harmless. The conversion is best-effort: if the column is missing or any
    cell cannot be parsed, a warning is logged and the column is left
    untouched.

    Args:
        df: DataFrame to update; ``df[column]`` is overwritten on success.
        column: label of the column to convert.

    Returns:
        The same ``df`` object.
    """
    def _parse(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    try:
        df[column] = df[column].apply(_parse)
    except (KeyError, ValueError, SyntaxError, TypeError):
        # Only conversion failures are tolerated; anything else (including
        # KeyboardInterrupt) propagates instead of being swallowed.
        logging.getLogger(__name__).warning("Not column convertion from string to array", exc_info=True)
    return df

In [None]:
def compute_levenshtein_for_samples(tokenizer, df, samples= 30):
    """Decode every sampled generation and score it against the ground truth.

    For each sample index ``i`` in ``range(samples)`` the column ``str(i)`` of
    ``df`` is converted from strings to token-id lists (in place, through
    ``convert_df_string_to_array``), decoded with ``tokenizer.batch_decode``
    and compared with the stripped ``ground_truth`` using the normalized
    Levenshtein similarity.

    Args:
        tokenizer: tokenizer used to decode the token ids.
        df: DataFrame with a ``ground_truth`` column and one column per sample
            named ``"0"`` .. ``str(samples - 1)``. Its sample columns are
            modified in place by the conversion step.
        samples: number of sample columns to process.

    Returns:
        DataFrame indexed like ``df`` with ``ground_truth`` plus, per sample,
        ``outcome_<i>`` (decoded text) and ``lev_<i>`` (similarity).
    """
    outcomes_df = pd.DataFrame()
    outcomes_df['ground_truth'] = df['ground_truth'].apply(lambda x: x.strip())
    # Positional copy of the references: looking rows up with
    # `outcomes_df['ground_truth'][step]` is label-based and breaks (or pairs
    # the wrong rows) when the index read from the CSV is not 0..n-1.
    references = outcomes_df['ground_truth'].tolist()
    for i in range(0,samples):
        df = convert_df_string_to_array(df, str(i))
        outcomes = tokenizer.batch_decode(df[str(i)].to_list(), skip_special_tokens=True)
        outcomes_df['outcome_'+str(i)] = outcomes
        col_name = 'lev_'+str(i)
        outcomes_df[col_name] = [
            levenshtein_similarity.normalized_similarity(outcome.strip(), reference)
            for outcome, reference in zip(outcomes, references)
        ]
        logger.info("computed levenshtein for outome " + str(i))

    return outcomes_df
            

In [None]:
def compute_levenshtenin_mean_pivot(df,  df_name, samples=30):
    """Collect the mean of every ``lev_<i>`` column into a single column.

    Args:
        df: DataFrame holding the columns ``lev_0`` .. ``lev_<samples - 1>``.
        df_name: name of the single column in the result.
        samples: number of ``lev_<i>`` columns to summarise.

    Returns:
        DataFrame with one row per sample index and one column ``df_name``
        containing the mean similarity of that sample.
    """
    means_by_sample = pd.DataFrame()
    for sample_idx in range(samples):
        sample_mean = df[f'lev_{sample_idx}'].mean()
        means_by_sample.loc[sample_idx, df_name] = sample_mean
    return means_by_sample

## Only docstring

In [None]:
# Compute levenshtein for each outcome and the average
path = "/workspaces/code-rationales/data/sampling/gpt/code_completion_docstring_5k_30_150_tokens.csv"
df = pd.read_csv(path, index_col=0)
logger.info("Calculating levenshtein simmilarity")
docstring_df = compute_levenshtein_for_samples(tokenizer, df, 30)
# Log the number of evaluated rows (len(df.shape) is always 2 for a DataFrame).
logger.info(f"outomces: {len(docstring_df)}")

In [None]:
mean_df = compute_levenshtenin_mean_pivot(docstring_df,'docstring')


### Docstring & random cut

In [None]:
path = "/workspaces/code-rationales/data/sampling/gpt/code_completion_docstring_random_cut_3.8k_30_150_tokens.csv"

In [None]:
random_cut_df = pd.read_csv(path, index_col=0)

In [None]:
random_cut_df.head()

In [None]:
# Compute levenshtein for each outcome and the average
logger.info("Computing docstring and random cut")
lev_rand_df = compute_levenshtein_for_samples(tokenizer, random_cut_df, 30)
# Log the number of evaluated rows (len(df.shape) is always 2 for a DataFrame).
logger.info(f"outcomes: {len(lev_rand_df)}")

In [None]:
lev_rand_df

In [None]:
mean_df = pd.concat([mean_df,compute_levenshtenin_mean_pivot(lev_rand_df,'docst_randcut')],axis=1)


In [None]:
mean_df

In [None]:
# Compute levenshtein for each outcome and the average
path = "/workspaces/code-rationales/data/sampling/gpt/code_completion_docstring_signature_3.8k_30_150_tokens.csv"
df = pd.read_csv(path, index_col=0)
logger.info("Evaluating adocstring and signature")
lev_signature_df = compute_levenshtein_for_samples(tokenizer, df, 30)
# Log the number of evaluated rows (len(df.shape) is always 2 for a DataFrame).
logger.info(f"outcomes: {len(lev_signature_df)}")

In [None]:
mean_df = pd.concat([mean_df,compute_levenshtenin_mean_pivot(lev_signature_df,'docst_sign')],axis=1)

In [None]:
# Compute levenshtein for each outcome and the average
path = "/workspaces/code-rationales/data/sampling/gpt/code_completion_random_cut_5k_30_512_tokens.csv"
df = pd.read_csv(path, index_col=0)
logger.info("Evaluating and saving model after training")
lev_rand_code_df = compute_levenshtein_for_samples(tokenizer, df, 30)
# Log the number of evaluated rows (len(df.shape) is always 2 for a DataFrame).
logger.info(f"outomces: {len(lev_rand_code_df)}")

In [None]:
mean_df = pd.concat([mean_df,compute_levenshtenin_mean_pivot(lev_rand_code_df,'rand_cut')],axis=1)

In [None]:
mean_df.head()

In [None]:
mean_df.shape

In [None]:
# Box plot of the per-sample mean Levenshtein similarity, one box per test bed
# column of `mean_df`.
# NOTE(review): `plt` and `sns` are not imported in any cell visible in this
# notebook — confirm matplotlib.pyplot / seaborn are imported before running.
plt.figure(figsize=(15, 6)) 
sns.boxplot(data=mean_df, showfliers=False,palette="Set2").set_title("Codeparrot - AVG Levenshtein - 30 samples")

In [None]:
save_path= "/workspaces/code-rationales/datax/evaluation/"
logger.info("Saving levenshtain calculation")

In [None]:
    # Save the dataframe to a Parquet file
docstring_df.to_parquet(save_path+'code_completion_docstring_5k_30_150_tokens.parquet')

In [None]:
lev_rand_code_df.to_parquet(save_path+'code_completion_random_cut_5k_30_512_tokens.parquet')

In [None]:
lev_signature_df.to_parquet(save_path+'code_completion_docstring_signature_3.8k_30_150_tokens.parquet')

In [None]:
lev_rand_df.to_parquet(save_path+'code_completion_docstring_random_cut_3.8k_30_150_tokens.parquet')

In [None]:
mean_df.mean()

## 2.0 Calculating BLEU and CodeBLEU

For each sample, calculate BLEU and CodeBLEU, then calculate the average for each sample, and finally the average of those averages.

In [28]:
## Params for codebleu: alpha, beta, gamma, theta
# Comma-separated weights of the four CodeBLEU components, parsed by
# `calculate_bleu_codeBleu`.
# NOTE: this rebinds `params`, which earlier cells use for the configuration
# dict returned by `param_default()`.
params='0.25,0.25,0.25,0.25'
lang= 'python'

In [2]:
#sys.path.clear()

In [3]:
import sys
sys.path.append('/workspaces/code-rationales/scripts')

sys.path

['/workspaces/code-rationales/nbs',
 '/usr/lib/python38.zip',
 '/usr/lib/python3.8',
 '/usr/lib/python3.8/lib-dynload',
 '',
 '/usr/local/lib/python3.8/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/workspaces/code-rationales/scripts']

In [35]:
## based on microsoft script for calculating codeBLEU in codeSearchNet
import CodeBLEU.bleu as bleu
import CodeBLEU.weighted_ngram_match as weighted_ngram_match
import CodeBLEU.syntax_match as syntax_match
import CodeBLEU.dataflow_match as dataflow_match
import sys
import json
import pandas as pd

### Load sampled testbeds 

In [5]:
path= "/workspaces/code-rationales/datax/evaluation/"


In [6]:
docstring_df =pd.read_parquet(path+'code_completion_docstring_5k_30_150_tokens.parquet')

In [7]:
lev_rand_code_df = pd.read_parquet(path+'code_completion_random_cut_5k_30_512_tokens.parquet')

In [8]:
lev_signature_df=pd.read_parquet(path+'code_completion_docstring_signature_3.8k_30_150_tokens.parquet')

In [9]:
lev_rand_df= pd.read_parquet(path+'code_completion_docstring_random_cut_3.8k_30_150_tokens.parquet')

In [39]:
def calculate_bleu_codeBleu(lang,params,df, gt_col, pred_col, keywords):
    """Compute corpus-level BLEU and CodeBLEU for one prediction column.

    Every row contributes one hypothesis (``df[pred_col]``) and a single
    reference (``df[gt_col]``). Texts are tokenized by whitespace for the
    n-gram based scores; the raw strings are passed to the syntax and
    dataflow matchers.

    Args:
        lang: language name forwarded to the syntax and dataflow matchers.
        params: comma-separated string with the four CodeBLEU weights
            ``alpha,beta,gamma,theta``.
        df: DataFrame holding the references and predictions.
        gt_col: name of the reference column.
        pred_col: name of the prediction column.
        keywords: path to a UTF-8 text file with one keyword per line.

    Returns:
        ``(ngram_match_score, code_bleu_score)`` where the second value is the
        weighted sum of n-gram, weighted n-gram, syntax and dataflow scores.
    """
    alpha,beta,gamma,theta = [float(x) for x in params.split(',')]

    # preprocess inputs: one single-element reference list per hypothesis
    ground_truths = df[gt_col].to_list()
    hypothesis = df[pred_col].to_list()
    assert len(hypothesis) == len(ground_truths)
    references = [[ground_truth] for ground_truth in ground_truths]

    # calculate ngram match (BLEU)
    tokenized_hyps = [x.split() for x in hypothesis]
    tokenized_refs = [[x.split() for x in reference] for reference in references]

    ngram_match_score = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)

    # calculate weighted ngram match
    # Read the keyword list with a context manager so the file handle is
    # closed, and keep it as a set for constant-time membership tests.
    with open(keywords, 'r', encoding='utf-8') as keyword_file:
        keyword_set = {line.strip() for line in keyword_file}

    def make_weights(reference_tokens):
        # Keywords weigh 1, every other token 0.2.
        return {token: 1 if token in keyword_set else 0.2 for token in reference_tokens}

    tokenized_refs_with_weights = [
        [[reference_tokens, make_weights(reference_tokens)] for reference_tokens in reference]
        for reference in tokenized_refs
    ]

    weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights,tokenized_hyps)

    # calculate syntax match
    syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis,lang)

    # calculate dataflow match
    dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis,lang)

    logger.info('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'.\
                        format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score))

    code_bleu_score = alpha*ngram_match_score\
                    + beta*weighted_ngram_match_score\
                    + gamma*syntax_match_score\
                    + theta*dataflow_match_score

    logger.info('CodeBLEU score: '+ str(code_bleu_score))
    return ngram_match_score, code_bleu_score

In [11]:
lang='python'
keywords = '/workspaces/code-rationales/scripts/CodeBLEU/keywords/'+lang+'.txt'

### Evaluate docstring

In [12]:
docstring_df

Unnamed: 0,ground_truth,outcome_0,lev_0,outcome_1,lev_1,outcome_2,lev_2,outcome_3,lev_3,outcome_4,...,outcome_25,lev_25,outcome_26,lev_26,outcome_27,lev_27,outcome_28,lev_28,outcome_29,lev_29
0,Check if the grid client is up.\n\n Check f...,Generate Pyhton code that Check if the grid cl...,0.224626,Generate Pyhton code that Check if the grid cl...,0.192362,Generate Pyhton code that Check if the grid cl...,0.243636,Generate Pyhton code that Check if the grid cl...,0.227273,Generate Pyhton code that Check if the grid cl...,...,Generate Pyhton code that Check if the grid cl...,0.222586,Generate Pyhton code that Check if the grid cl...,0.217252,Generate Pyhton code that Check if the grid cl...,0.245931,Generate Pyhton code that Check if the grid cl...,0.241071,Generate Pyhton code that Check if the grid cl...,0.198839
1,Ensure that powershell processes inline script...,Generate Pyhton code that Ensure that powershe...,0.322621,Generate Pyhton code that Ensure that powershe...,0.326347,Generate Pyhton code that Ensure that powershe...,0.325524,Generate Pyhton code that Ensure that powershe...,0.327132,Generate Pyhton code that Ensure that powershe...,...,Generate Pyhton code that Ensure that powershe...,0.345768,Generate Pyhton code that Ensure that powershe...,0.326271,Generate Pyhton code that Ensure that powershe...,0.364109,Generate Pyhton code that Ensure that powershe...,0.331646,Generate Pyhton code that Ensure that powershe...,0.306836
2,Encode a bytestring to a base64 string for use...,Generate Pyhton code that Encode a bytestring ...,0.257911,Generate Pyhton code that Encode a bytestring ...,0.255556,Generate Pyhton code that Encode a bytestring ...,0.258224,Generate Pyhton code that Encode a bytestring ...,0.275510,Generate Pyhton code that Encode a bytestring ...,...,Generate Pyhton code that Encode a bytestring ...,0.227596,Generate Pyhton code that Encode a bytestring ...,0.253756,Generate Pyhton code that Encode a bytestring ...,0.224818,Generate Pyhton code that Encode a bytestring ...,0.229412,Generate Pyhton code that Encode a bytestring ...,0.260450
3,Add the arguments for the protocol to the clie...,Generate Pyhton code that Add the arguments fo...,0.328746,Generate Pyhton code that Add the arguments fo...,0.326996,Generate Pyhton code that Add the arguments fo...,0.343606,Generate Pyhton code that Add the arguments fo...,0.333333,Generate Pyhton code that Add the arguments fo...,...,Generate Pyhton code that Add the arguments fo...,0.385366,Generate Pyhton code that Add the arguments fo...,0.316712,Generate Pyhton code that Add the arguments fo...,0.334405,Generate Pyhton code that Add the arguments fo...,0.325581,Generate Pyhton code that Add the arguments fo...,0.343548
4,Locking should include hashes for *all* platfo...,Generate Pyhton code that Locking should inclu...,0.248555,Generate Pyhton code that Locking should inclu...,0.254360,Generate Pyhton code that Locking should inclu...,0.257576,Generate Pyhton code that Locking should inclu...,0.273921,Generate Pyhton code that Locking should inclu...,...,Generate Pyhton code that Locking should inclu...,0.269108,Generate Pyhton code that Locking should inclu...,0.262270,Generate Pyhton code that Locking should inclu...,0.262519,Generate Pyhton code that Locking should inclu...,0.275081,Generate Pyhton code that Locking should inclu...,0.227213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Return the wind_speed for backward compatibili...,Generate Pyhton code that Return the wind_spee...,0.265306,Generate Pyhton code that Return the wind_spee...,0.240057,Generate Pyhton code that Return the wind_spee...,0.260597,Generate Pyhton code that Return the wind_spee...,0.267227,Generate Pyhton code that Return the wind_spee...,...,Generate Pyhton code that Return the wind_spee...,0.289982,Generate Pyhton code that Return the wind_spee...,0.252239,Generate Pyhton code that Return the wind_spee...,0.304432,Generate Pyhton code that Return the wind_spee...,0.234586,Generate Pyhton code that Return the wind_spee...,0.250377
96,Return the visibility for backward compatibili...,Generate Pyhton code that Return the visibilit...,0.235905,Generate Pyhton code that Return the visibilit...,0.246032,Generate Pyhton code that Return the visibilit...,0.223979,Generate Pyhton code that Return the visibilit...,0.268750,Generate Pyhton code that Return the visibilit...,...,Generate Pyhton code that Return the visibilit...,0.225585,Generate Pyhton code that Return the visibilit...,0.193838,Generate Pyhton code that Return the visibilit...,0.217277,Generate Pyhton code that Return the visibilit...,0.219645,Generate Pyhton code that Return the visibilit...,0.195274
97,Whitelister.clean should remove disallowed tag...,Generate Pyhton code that Whitelister.clean sh...,0.272346,Generate Pyhton code that Whitelister.clean sh...,0.297721,Generate Pyhton code that Whitelister.clean sh...,0.299248,Generate Pyhton code that Whitelister.clean sh...,0.273598,Generate Pyhton code that Whitelister.clean sh...,...,Generate Pyhton code that Whitelister.clean sh...,0.308397,Generate Pyhton code that Whitelister.clean sh...,0.307937,Generate Pyhton code that Whitelister.clean sh...,0.288824,Generate Pyhton code that Whitelister.clean sh...,0.286885,Generate Pyhton code that Whitelister.clean sh...,0.298197
98,Building a Job twice from should return differ...,Generate Pyhton code that Building a Job twice...,0.312782,Generate Pyhton code that Building a Job twice...,0.305866,Generate Pyhton code that Building a Job twice...,0.289474,Generate Pyhton code that Building a Job twice...,0.340764,Generate Pyhton code that Building a Job twice...,...,Generate Pyhton code that Building a Job twice...,0.302290,Generate Pyhton code that Building a Job twice...,0.342282,Generate Pyhton code that Building a Job twice...,0.326748,Generate Pyhton code that Building a Job twice...,0.306864,Generate Pyhton code that Building a Job twice...,0.286331


In [27]:
import logging
from pathlib import Path
# Route log records of the BLEU / CodeBLEU section to the configured log file.
params_dict = param_default()
# force=True replaces handlers that may already be installed on the root
# logger; without it basicConfig is a no-op once logging is configured and
# `log_path` would be ignored.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, filename=params_dict['log_path'],
    force=True,
)
logger = logging.getLogger(__name__)

In [42]:
# Score every sampled generation of the docstring test bed: one row per
# sample index, with the BLEU and CodeBLEU of that sample's predictions.
docst_codebleu_df = pd.DataFrame()
samples=30
for i in range(0,samples):
    bleuScore, codebleuScore = calculate_bleu_codeBleu(lang,params,docstring_df,'ground_truth','outcome_'+str(i),keywords)
    docst_codebleu_df.loc[i,'docst_bleu'] = bleuScore
    docst_codebleu_df.loc[i,'docst_codebleu'] = codebleuScore

In [44]:
docst_codebleu_df.mean()

docst_bleu        0.178065
docst_codebleu    0.355168
dtype: float64

In [45]:
# Same scoring for the random-cut frame (`lev_rand_code_df`); results are
# added as two more columns of `docst_codebleu_df`.
for i in range(0,samples):
    bleuScore, codebleuScore = calculate_bleu_codeBleu(lang,params,lev_rand_code_df,'ground_truth','outcome_'+str(i),keywords)
    docst_codebleu_df.loc[i,'rancut_bleu'] = bleuScore
    docst_codebleu_df.loc[i,'rancut_codebleu'] = codebleuScore

In [46]:
# Same scoring for the docstring + signature frame (`lev_signature_df`);
# results are added as two more columns of `docst_codebleu_df`.
for i in range(0,samples):
    bleuScore, codebleuScore = calculate_bleu_codeBleu(lang,params,lev_signature_df,'ground_truth','outcome_'+str(i),keywords)
    docst_codebleu_df.loc[i,'doc_sig_bleu'] = bleuScore
    docst_codebleu_df.loc[i,'doc_sig_codebleu'] = codebleuScore

In [47]:
# Same scoring for the docstring + random-cut frame (`lev_rand_df`); results
# are added as two more columns of `docst_codebleu_df`.
for i in range(0,samples):
    bleuScore, codebleuScore = calculate_bleu_codeBleu(lang,params,lev_rand_df,'ground_truth','outcome_'+str(i),keywords)
    docst_codebleu_df.loc[i,'doc_ran_bleu'] = bleuScore
    docst_codebleu_df.loc[i,'doc_ran_codebleu'] = codebleuScore

In [48]:
docst_codebleu_df.mean()

docst_bleu          0.178065
docst_codebleu      0.355168
rancut_bleu         0.318113
rancut_codebleu     0.531193
doc_sig_bleu        0.230144
doc_sig_codebleu    0.429848
doc_ran_bleu        0.358623
doc_ran_codebleu    0.573638
dtype: float64