# Evaluation of Seq2Seq Models

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, LogitsProcessorList, MinLengthLogitsProcessor, TopKLogitsWarper, TemperatureLogitsWarper, BeamSearchScorer
import torch
import datasets 
import pickle 

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

### Setup & Helper Functions

In [2]:
# Shared tokenizer loaded from the "microsoft/codebert-base" checkpoint. The helpers in this
# notebook use it both to encode source code / docstrings and to decode generated token ids.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_function(set):
    """Encode a record's code as model input and its docstring as labels.

    Args:
        set: mapping with "code" and "docstring" entries (the parameter name shadows the
            builtin ``set``; it is kept so existing keyword callers keep working).

    Returns:
        The tokenizer output for the code, padded/truncated to 512 tokens as PyTorch
        tensors, with an extra "labels" entry holding the docstring's input ids.
    """
    # Source and target are encoded with exactly the same settings.
    encode_kwargs = dict(max_length=512, padding="max_length", truncation=True, return_tensors="pt")

    encoded = tokenizer(set["code"], **encode_kwargs)
    with tokenizer.as_target_tokenizer():
        target = tokenizer(set["docstring"], **encode_kwargs)

    encoded["labels"] = target["input_ids"]
    return encoded

In [3]:
# Metric objects used by eval_compute and testWrapper.generate_per_string.
# As the recorded output below shows, loading them triggers NLTK resource checks
# (wordnet, punkt, omw-1.4).
bleu = datasets.load_metric('sacrebleu')
rouge = datasets.load_metric('rouge')
meteor = datasets.load_metric('meteor')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
import numpy as np
class testWrapper():
    """Bundle a seq2seq model with the generation and scoring routines used in this notebook.

    The model is moved to CUDA on construction. Each public method takes a mapping with a
    "code" entry (and, for generate_per_string, a "docstring" entry), adds result fields to
    it and returns it, which is the shape ``Dataset.map`` callbacks in this notebook use.
    """

    # Beam count shared by the beam scorer, the decoder start ids and the encoder-output
    # expansion in test_gen; these three must agree.
    _NUM_BEAMS = 4

    def __init__(self, model):
        """Move ``model`` to CUDA and prepare the objects used by ``test_gen``.

        Args:
            model: seq2seq model exposing ``generate``, ``beam_sample``, ``get_encoder`` and
                a ``config`` with ``max_length``, ``eos_token_id`` and
                ``decoder_start_token_id``.
        """
        self.model = model.cuda()

        # Kept as an attribute for code that reads it; test_gen builds a fresh scorer per
        # call because a scorer accumulates per-run state.
        self.beam_scorer = self._new_beam_scorer()

        # Forbid EOS until at least 5 tokens have been produced.
        self.logits_processor = LogitsProcessorList(
            [MinLengthLogitsProcessor(5, eos_token_id=self.model.config.eos_token_id)]
        )

        # Sampling is restricted to the top 50 tokens and sharpened with temperature 0.7.
        self.logits_warper = LogitsProcessorList(
            [
                TopKLogitsWarper(50),
                TemperatureLogitsWarper(0.7),
            ]
        )

        # One decoder start token per beam, for a single input example.
        start_ids = torch.ones((self._NUM_BEAMS, 1), device=self.model.device, dtype=torch.long)
        self.input_ids = start_ids * self.model.config.decoder_start_token_id

    def _new_beam_scorer(self):
        """Create a beam scorer for ONE example expanded to ``_NUM_BEAMS`` beams."""
        # batch_size is 1 (not the beam count): beam_sample requires the decoder input to
        # have batch_size * num_beams rows, and self.input_ids has exactly _NUM_BEAMS rows.
        return BeamSearchScorer(
            batch_size=1,
            max_length=self.model.config.max_length,
            num_beams=self._NUM_BEAMS,
            device=self.model.device,
        )

    def _generate_strings(self, code):
        """Tokenize ``code`` (a string or list of strings), generate, and decode to text."""
        encoded = tokenizer(code, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        device = self.model.device
        generated = self.model.generate(
            encoded.input_ids.to(device),
            attention_mask=encoded.attention_mask.to(device),
        )
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    def generate_string(self, batch):
        """Add a "pred_string" entry with the decoded generations for ``batch["code"]``.

        Args:
            batch: mapping whose "code" entry is accepted by the tokenizer.

        Returns:
            The same ``batch`` with "pred_string" set to the list of decoded strings.
        """
        batch["pred_string"] = self._generate_strings(batch["code"])
        return batch

    def generate_per_string(self, batch):
        """Generate for one example and attach its individual ROUGE-2, BLEU and METEOR scores.

        Args:
            batch: mapping with "code" and "docstring" entries. The reference is wrapped in
                a one-element list, so this is written for a single example at a time.

        Returns:
            The same ``batch`` with "pred_string" plus "rouge2_precision", "rouge2_recall",
            "rouge2_fmeasure" (rounded to 4 places), "bleu_score" and "meteor_score".
        """
        predictions = self._generate_strings(batch["code"])
        # Stored as the list returned by batch_decode (not unwrapped), as before.
        batch["pred_string"] = predictions

        references = [batch["docstring"]]

        rouge_output = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge2"])["rouge2"].mid
        # sacrebleu takes a list of reference lists per prediction.
        bleu_output = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
        meteor_output = meteor.compute(predictions=predictions, references=references)

        batch["rouge2_precision"] = round(rouge_output.precision, 4)
        batch["rouge2_recall"] = round(rouge_output.recall, 4)
        batch["rouge2_fmeasure"] = round(rouge_output.fmeasure, 4)
        batch["bleu_score"] = bleu_output["score"]
        batch["meteor_score"] = meteor_output["meteor"]

        return batch

    def test_gen(self, batch):
        """Decode ``batch['code']`` with beam sampling and store the text in "pred_string".

        Uses the min-length processor and top-k / temperature warpers built in ``__init__``.
        The decoder start ids cover a single example, so ``batch['code']`` is expected to
        be one code string.

        Returns:
            The same ``batch`` with 'pred_string' set to the list of decoded strings.
        """
        # The encoder runs on the model's device, so the ids must live there too.
        encoder_input_ids = tokenizer(
            batch['code'], padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        ).input_ids.to(self.model.device)

        # NOTE(review): the encoder is run on max_length-padded ids without an attention
        # mask — confirm whether padding positions should be masked here.
        model_kwargs = {
            "encoder_outputs": self.model.get_encoder()(
                encoder_input_ids.repeat_interleave(self._NUM_BEAMS, dim=0), return_dict=True
            )
        }
        outputs = self.model.beam_sample(
            self.input_ids,
            self._new_beam_scorer(),  # fresh scorer: no hypotheses carried over between calls
            logits_processor=self.logits_processor,
            logits_warper=self.logits_warper,
            **model_kwargs
        )
        batch['pred_string'] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return batch

In [5]:
def eval_compute(results):
    """Score a set of predictions against reference docstrings.

    Args:
        results: mapping/dataset with a "pred_string" column (predictions) and a
            "docstring" column (references).

    Returns:
        dict with ROUGE-2 precision/recall/F-measure (the ``mid`` aggregate, rounded to
        4 places), the sacreBLEU "score" and the METEOR "meteor" value.
    """
    preds = results["pred_string"]
    refs = results["docstring"]

    rouge2_mid = rouge.compute(
        predictions=preds,
        references=refs,
        rouge_types=["rouge2"],
    )["rouge2"].mid

    # sacrebleu expects a list of references for each prediction.
    wrapped_refs = [[single_ref] for single_ref in refs]
    bleu_result = bleu.compute(predictions=preds, references=wrapped_refs)

    meteor_result = meteor.compute(predictions=preds, references=refs)

    scores = {}
    scores["rouge2_precision"] = round(rouge2_mid.precision, 4)
    scores["rouge2_recall"] = round(rouge2_mid.recall, 4)
    scores["rouge2_fmeasure"] = round(rouge2_mid.fmeasure, 4)
    scores["bleu_score"] = bleu_result["score"]
    scores["meteor_score"] = meteor_result["meteor"]
    return scores

In [6]:
def modelSetup(path):
    """Load a seq2seq checkpoint and point its generation config at the notebook tokenizer.

    Args:
        path: location passed to ``AutoModelForSeq2SeqLM.from_pretrained``.

    Returns:
        The loaded model, whose config now uses the tokenizer's CLS / SEP / PAD ids as
        decoder-start / EOS / pad ids, copies ``vocab_size`` from the encoder config and
        sets ``num_beams`` to 4.
    """
    loaded = AutoModelForSeq2SeqLM.from_pretrained(path)
    cfg = loaded.config

    # Special-token ids come from the shared tokenizer.
    cfg.decoder_start_token_id = tokenizer.cls_token_id
    cfg.eos_token_id = tokenizer.sep_token_id
    cfg.pad_token_id = tokenizer.pad_token_id

    cfg.vocab_size = cfg.encoder.vocab_size
    cfg.num_beams = 4

    return loaded

In [None]:
def ttest(delta, N):
    """Compute the paired-difference t statistic and its two-sided p-value.

    Args:
        delta: per-sample differences (array-like of numbers).
        N: number of samples; presumably ``len(delta)`` — it is used as given.

    Returns:
        (t, p): the t statistic (mean difference divided by its standard error) and the
        two-sided p-value from a Student t distribution with ``N - 1`` degrees of freedom.
    """
    # Local import so the function works without touching the notebook's import cell.
    from scipy import stats

    # Accept plain sequences as well as arrays (``** 2`` is not defined for lists).
    delta = np.asarray(delta, dtype=float)
    deg_free = N - 1

    sum_d = np.sum(delta)
    sum_d_sq = np.sum(delta ** 2)

    # Standard error of the mean difference: sqrt(sample variance / N).
    std_err = np.sqrt((sum_d_sq - (sum_d ** 2) / N) / (deg_free * N))
    t = (sum_d / N) / std_err

    # Two-sided tail probability; the original returned an undefined name here.
    p = 2 * stats.t.sf(np.abs(t), deg_free)
    return t, p


# Test Generation

## No Augmentation 

In [14]:
# Load the test split from a local JSON-lines file. The loader returns a dict of splits;
# the single file is exposed under the "train" key, hence the lookup.
# NOTE(review): absolute machine-specific path — the notebook will not run elsewhere as is.
test_set = datasets.load_dataset('json', data_files="D:\\PROJECT\\data\\CodeSearchNet\\py_clean\\test.jsonl")["train"]

Using custom data configuration default-3f4bdb1ea0486ba6
Reusing dataset json (C:\Users\Parry\.cache\huggingface\datasets\json\default-3f4bdb1ea0486ba6\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 1/1 [00:00<00:00, 200.01it/s]


In [15]:
# pandas view of the test set, used for inspecting individual code/docstring rows.
frame_test = test_set.to_pandas()

In [16]:
frame_test

Unnamed: 0,code,docstring
0,"def sina_xml_to_url_list(xml_data):\n """"""st...",str->list\n Convert XML to URL List.\n F...
1,"def dailymotion_download(url, output_dir='.', ...",Downloads Dailymotion videos by URL.
2,"def sina_download(url, output_dir='.', merge=T...",Downloads Sina videos by URL.
3,"def sprint(text, *colors):\n """"""Format text...",Format text with color or other effects into A...
4,"def print_log(text, *colors):\n """"""Print a ...",Print a log message to standard error.
...,...,...
14913,"def from_grayscale(im, channels_on=(True, True...",Return a canvas from a grayscale image.\n\n ...
14914,"def get_uuid(length=32, version=1):\n """"""\n...",Returns a unique ID of a given length.\n Us...
14915,"def get_unique_key_from_get(get_dict):\n """"...",Build a unique key from get data
14916,"def get_domain(url):\n """""" Returns domain n...",Returns domain name portion of a URL


In [22]:
frame_test.loc[6100]

code         def save_token(self, token_file=None):\n      ...
docstring    Obtain the user's long-lived API token and sav...
Name: 6100, dtype: object

In [19]:
import pickle

# Load previously saved results for the small model. Despite the name `frame`, the value
# displayed below is a dict of aggregate scores, not a DataFrame.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
with open("D:\\PROJECT\\out\\original\\small\\results.pkl", 'rb') as f:
    frame = pickle.load(f)

In [27]:
frame

{'rouge2_precision': 0.99,
 'rouge2_recall': 0.5708,
 'rouge2_fmeasure': 0.6429,
 'bleu_score': 1.1199619518003372,
 'meteor_score': 0.5586790863025023}

In [8]:
# Checkpoint directories of the models trained without augmentation.
# Raw strings keep the Windows backslashes literal; the previous plain strings only worked
# because sequences such as "\P" and "\o" are not recognised escapes.
small_path = r"D:\PROJECT\out\original\small\model_out"
medium_path = r"D:\PROJECT\out\original\medium\model_out"

In [9]:
# Load both non-augmented checkpoints with the shared generation config from modelSetup.
small_model = modelSetup(small_path)
medium_model = modelSetup(medium_path)

In [10]:
# Wrap each model in a testWrapper; construction moves the model to CUDA.
small_tester = testWrapper(small_model)
medium_tester = testWrapper(medium_model)



In [45]:
# Generate predictions for the whole test set with the small model, 8 examples per call;
# adds a "pred_string" column.
small_res = test_set.map(small_tester.generate_string, batched=True, batch_size=8)

100%|██████████| 1865/1865 [17:43<00:00,  1.75ba/s]


In [47]:
small_res

Dataset({
    features: ['code', 'docstring', 'pred_string'],
    num_rows: 14918
})

In [11]:
# Same batched generation pass for the medium model; adds a "pred_string" column.
medium_res = test_set.map(medium_tester.generate_string, batched=True, batch_size=8)

100%|██████████| 1865/1865 [35:38<00:00,  1.15s/ba]


In [12]:
# Aggregate ROUGE-2 / BLEU / METEOR over all medium-model predictions.
# NOTE(review): the same assignment appears again in a later cell.
medium_per_scores = eval_compute(medium_res)

In [13]:
medium_per_scores

{'rouge2_precision': 0.9949,
 'rouge2_recall': 0.5737,
 'rouge2_fmeasure': 0.646,
 'bleu_score': 1.1251101518020312,
 'meteor_score': 0.5604892748778642}

In [50]:
# NOTE(review): later cells in this notebook still call eval_compute(small_res); in
# top-to-bottom order this delete runs first, so those calls would raise NameError.
del small_res

In [48]:
# Aggregate ROUGE-2 / BLEU / METEOR over all small-model predictions.
small_per_scores = eval_compute(small_res)

In [49]:
small_per_scores

{'rouge2_precision': 0.99,
 'rouge2_recall': 0.571,
 'rouge2_fmeasure': 0.643,
 'bleu_score': 1.1199619518003372,
 'meteor_score': 0.5586790863025023}

In [17]:
# Recomputes the medium-model aggregate scores (same call as in an earlier cell) before
# they are pickled below.
medium_per_scores = eval_compute(medium_res)

In [18]:
# Persist the medium-model aggregate scores so they can be reloaded without regenerating.
with open("D:\\PROJECT\\out\\original\\medium\\per_scores.pkl", 'wb') as f:
    pickle.dump(medium_per_scores, f)

In [40]:
# Read back the saved medium-model scores as a sanity check.
with open("D:\\PROJECT\\out\\original\\medium\\per_scores.pkl", 'rb') as f:
    print(pickle.load(f))

{'rouge2_precision': 0.9949, 'rouge2_recall': 0.5739, 'rouge2_fmeasure': 0.6462, 'bleu_score': 1.1251101518020312, 'meteor_score': 0.5604892748778642}


In [41]:
# Attempt to read the small-model scores from the matching path.
# NOTE(review): the recorded run raised FileNotFoundError, and no visible cell writes this
# file (only the medium per_scores.pkl is dumped) — confirm the scores were meant to be saved.
with open("D:\\PROJECT\\out\\original\\small\\per_scores.pkl", 'rb') as f:
    print(pickle.load(f))

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\PROJECT\\out\\original\\small\\per_scores.pkl'

In [19]:
# Per-example pass for the medium model: generates one prediction at a time and attaches
# that example's own ROUGE-2 / BLEU / METEOR scores as extra columns.
medium_per_res = test_set.map(medium_tester.generate_per_string, batched=False)

100%|██████████| 14918/14918 [1:38:23<00:00,  2.53ex/s]


In [25]:
# Per-example generation and scoring for the small model (one example per call).
small_per_res = test_set.map(small_tester.generate_per_string, batched=False)

100%|██████████| 14918/14918 [1:09:31<00:00,  3.58ex/s]


In [26]:
# Save the small-model per-example results (predictions plus per-example scores).
with open("D:\\PROJECT\\out\\original\\small\\per_res.pkl", 'wb') as f:
    pickle.dump(small_per_res, f)

In [20]:
# Save the medium-model per-example results (predictions plus per-example scores).
with open("D:\\PROJECT\\out\\original\\medium\\per_res.pkl", 'wb') as f:
    pickle.dump(medium_per_res, f)

In [21]:

# NOTE(review): the recorded output of this cell is a plain string containing these two
# statements, which suggests they were wrapped in a string literal (i.e. disabled) when the
# cell last ran — confirm whether they are meant to execute.
small_per_scores = eval_compute(small_res)
medium_per_scores = eval_compute(medium_res)


'\nsmall_per_scores = eval_compute(small_res)\nmedium_per_scores = eval_compute(medium_res)\n'

In [22]:
#medium_scores

In [23]:
#small_scores

In [78]:
# Ad-hoc comparison of the non-augmented and augmented medium models on one snippet.
# NOTE(review): medium_aug_model is only defined in the "Augmentation" section further
# down, so this cell fails when the notebook is run top to bottom.
input_str = "list = ['x', 'y', 'z']"

# Encode the snippet exactly as the testWrapper methods do (padded/truncated to 512).
inputs = tokenizer([input_str], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.cuda()
attention_mask = inputs.attention_mask.cuda()
# medium_model is already on CUDA via its testWrapper; the augmented model is moved here.
outputs_orig = medium_model.generate(input_ids, attention_mask=attention_mask)
outputs = medium_aug_model.cuda().generate(input_ids, attention_mask=attention_mask)
output_str = tokenizer.batch_decode(outputs_orig, skip_special_tokens=True)
output_aug_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# Printed order: non-augmented prediction first, augmented prediction second.
print(output_str, output_aug_str)

['Listed results to xy.'] ["AList of points in the list of points in a list'\n"]


## Augmentation

In [25]:
# Load the saved per-example results of the augmented medium model as a DataFrame.
# This rebinds `frame`, which an earlier cell used for a different pickled object.
with open("D:\\PROJECT\\out\\aug\\medium\\per_sentence_res.pkl", 'rb') as f:
    frame = pickle.load(f).to_pandas()

In [34]:
# Spot-check one prediction among the rows whose per-example BLEU is below 1.0
# (position 1790 within that filtered subset, not the original row index).
print(frame[frame.bleu_score < 1.0].iloc[1790].pred_string)

['Parses sitetree_menu tag parameters.\n\n      ']


In [54]:
# Checkpoint directories of the models trained with augmentation.
# Raw strings keep every backslash literal; the previous literals mixed escaped "\\" with
# bare "\s" / "\m", which only worked because those are not recognised escapes.
small_aug_path = r"D:\PROJECT\out\aug\small\model_out"
medium_aug_path = r"D:\PROJECT\out\aug\medium\model_out"

In [55]:
# Load both augmented checkpoints with the shared generation config from modelSetup.
small_aug_model = modelSetup(small_aug_path)
medium_aug_model = modelSetup(medium_aug_path)

In [10]:
# Wrap the augmented models; construction moves each model to CUDA.
small_aug_tester = testWrapper(small_aug_model)
medium_aug_tester = testWrapper(medium_aug_model)



In [12]:
# Batched generation over the test set for both augmented models (8 examples per call);
# each result gains a "pred_string" column.
medium_aug_res = test_set.map(medium_aug_tester.generate_string, batched=True, batch_size=8)
small_aug_res = test_set.map(small_aug_tester.generate_string, batched=True, batch_size=8)

100%|██████████| 1865/1865 [37:15<00:00,  1.20s/ba]
100%|██████████| 1865/1865 [18:04<00:00,  1.72ba/s]


In [13]:
# Aggregate ROUGE-2 / BLEU / METEOR for the augmented models.
medium_aug_scores = eval_compute(medium_aug_res)
small_aug_scores = eval_compute(small_aug_res) 

In [16]:
medium_aug_scores

{'rouge2_precision': 0.9386,
 'rouge2_recall': 0.5736,
 'rouge2_fmeasure': 0.607,
 'bleu_score': 1.4974138333701776,
 'meteor_score': 0.5430670378070557}

In [15]:
small_aug_scores

{'rouge2_precision': 0.9335,
 'rouge2_recall': 0.5708,
 'rouge2_fmeasure': 0.6035,
 'bleu_score': 1.489514462281551,
 'meteor_score': 0.5415090320766118}

In [17]:
# Drop the aggregate score dicts from the kernel namespace.
# NOTE(review): no visible cell saves these scores first — confirm they are persisted elsewhere.
del medium_aug_scores
del small_aug_scores

In [18]:
# Per-example generation and scoring for the augmented models; each result is pickled
# right after its pass finishes so the first run is saved before the second one starts.
medium_aug_per_res = test_set.map(medium_aug_tester.generate_per_string, batched=False)
with open("D:\\PROJECT\\out\\aug\\medium\\per_sentence_res.pkl", 'wb') as f:
    pickle.dump(medium_aug_per_res, f)
small_aug_per_res = test_set.map(small_aug_tester.generate_per_string, batched=False)
with open("D:\\PROJECT\\out\\aug\\small\\per_sentence_res.pkl", 'wb') as f:
    pickle.dump(small_aug_per_res, f)

100%|██████████| 14918/14918 [1:51:55<00:00,  2.22ex/s]
100%|██████████| 14918/14918 [1:11:48<00:00,  3.46ex/s]


In [30]:
# Reference docstrings of the test set as a pandas Series.
docstrings = test_set.to_pandas()['docstring']

In [31]:
# NOTE(review): this reads the 'docstring' column, which generate_per_string does not
# modify, so it presumably holds the same reference docstrings as `docstrings` rather than
# model output — confirm that 'pred_string' was not the intended column.
aug_doc = medium_aug_per_res.to_pandas()['docstring']

In [34]:
docstrings

0        str->list\n    Convert XML to URL List.\n    F...
1                     Downloads Dailymotion videos by URL.
2                            Downloads Sina videos by URL.
3        Format text with color or other effects into A...
4                   Print a log message to standard error.
                               ...                        
14913    Return a canvas from a grayscale image.\n\n   ...
14914    Returns a unique ID of a given length.\n    Us...
14915                     Build a unique key from get data
14916                 Returns domain name portion of a URL
14917               Returns a dictionary from a URL params
Name: docstring, Length: 14918, dtype: object

In [35]:
# Mean character length of each docstring Series.
orig_lengths = docstrings.map(len)
aug_lengths = aug_doc.map(len)
avg_orig = np.mean(orig_lengths)
avg_aug = np.mean(aug_lengths)

In [38]:
avg_aug

282.0935111945301