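# Evaluation of Seq2Seq Models

This notebook generates docstrings for the CodeSearchNet Python test split with fine-tuned seq2seq models and scores the predictions against the reference docstrings using ROUGE-2, BLEU (sacreBLEU) and METEOR.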

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, LogitsProcessorList, MinLengthLogitsProcessor, TopKLogitsWarper, TemperatureLogitsWarper, BeamSearchScorer
import torch
import datasets 

import pandas as pd

## Setup & Helper Functions

In [2]:
# Single tokenizer shared by the whole notebook: it encodes the source code,
# encodes the reference docstrings, and decodes generated token ids.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_function(set):
    """Encode a code/docstring example into model inputs with labels.

    Args:
        set: Mapping with a ``"code"`` entry (model input text) and a
            ``"docstring"`` entry (target text).

    Returns:
        The tokenizer output for ``set["code"]`` with an extra ``"labels"``
        entry holding the token ids of ``set["docstring"]``. Both sides are
        padded/truncated to 512 tokens and returned as PyTorch tensors.
    """
    # Source and target are tokenized with exactly the same settings.
    shared_kwargs = dict(max_length=512, padding="max_length", truncation=True, return_tensors="pt")

    encoded = tokenizer(set["code"], **shared_kwargs)
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(set["docstring"], **shared_kwargs)["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [3]:
# Metric objects used by every scoring helper below.
# NOTE(review): the NLTK downloads shown in this cell's output are presumably
# triggered by the METEOR metric — confirm.
bleu = datasets.load_metric('sacrebleu')
rouge = datasets.load_metric('rouge')
meteor = datasets.load_metric('meteor')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Parry\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [67]:
import numpy as np
class testWrapper():
    """Wrap a seq2seq model with generation and per-example scoring helpers.

    The methods rely on the notebook-level ``tokenizer`` and on the
    ``rouge`` / ``bleu`` / ``meteor`` metric objects being defined before
    they are called.

    Args:
        model: Model exposing ``generate``, ``beam_sample``, ``get_encoder``
            and a ``config`` carrying ``max_length``, ``eos_token_id`` and
            ``decoder_start_token_id``.
        num_beams: Beam width used by ``test_gen``. Defaults to 4, the value
            that used to be hard-coded.
    """

    def __init__(self, model, num_beams=4):
        # Use the GPU when one is present; otherwise stay on the CPU instead
        # of failing inside ``model.cuda()``.
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(target_device)
        self.num_beams = num_beams

        # Kept as an attribute for existing callers; ``test_gen`` builds a
        # fresh scorer per call (see ``_make_beam_scorer``).
        self.beam_scorer = self._make_beam_scorer()

        # Forbid EOS until at least 5 tokens have been produced.
        self.logits_processor = LogitsProcessorList(
            [MinLengthLogitsProcessor(5, eos_token_id=self.model.config.eos_token_id)]
        )

        # Sampling constraints applied on top of beam search in ``test_gen``.
        self.logits_warper = LogitsProcessorList(
            [
                TopKLogitsWarper(50),
                TemperatureLogitsWarper(0.7),
            ]
        )

        # One decoder start token per beam: shape (num_beams, 1), i.e. a
        # single source example expanded to ``num_beams`` hypotheses.
        start_ids = torch.ones((num_beams, 1), device=self.model.device, dtype=torch.long)
        self.input_ids = start_ids * self.model.config.decoder_start_token_id

    def _make_beam_scorer(self):
        """Return a new beam scorer for one source example and ``num_beams`` beams.

        ``batch_size`` is 1 because ``self.input_ids`` has ``num_beams`` rows,
        which is ``batch_size * num_beams`` for a single example. A scorer
        keeps the hypotheses of the run it was used in, so each generation
        needs its own instance.
        """
        return BeamSearchScorer(
            batch_size=1,
            max_length=self.model.config.max_length,
            num_beams=self.num_beams,
            device=self.model.device,
        )

    def _encode(self, code):
        """Tokenize ``code`` and return ``(input_ids, attention_mask)`` on the model's device."""
        encoded = tokenizer(code, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        device = self.model.device
        return encoded.input_ids.to(device), encoded.attention_mask.to(device)

    def generate_string(self, batch):
        """Generate a docstring for one example and attach per-example scores.

        Args:
            batch: Mapping with ``"code"`` and ``"docstring"`` entries.
                # NOTE(review): assumes a single example (``batched=False``),
                # because the reference is wrapped in a one-element list.

        Returns:
            The same ``batch`` with ``"pred_string"`` (list of decoded
            outputs), ``"rouge2_precision"``, ``"rouge2_recall"``,
            ``"rouge2_fmeasure"``, ``"bleu_score"`` and ``"meteor_score"``
            added.
        """
        input_ids, attention_mask = self._encode(batch["code"])
        outputs = self.model.generate(input_ids, attention_mask=attention_mask)
        output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch["pred_string"] = output_str

        predictions = output_str
        references = [batch["docstring"]]

        rouge_output = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge2"])["rouge2"].mid
        # sacreBLEU expects a list of reference lists, one per prediction.
        bleu_output = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
        meteor_output = meteor.compute(predictions=predictions, references=references)

        batch["rouge2_precision"] = round(rouge_output.precision, 4)
        batch["rouge2_recall"] = round(rouge_output.recall, 4)
        batch["rouge2_fmeasure"] = round(rouge_output.fmeasure, 4)
        batch["bleu_score"] = bleu_output["score"]
        batch["meteor_score"] = meteor_output["meteor"]

        return batch

    def test_gen(self, batch):
        """Generate with beam-search multinomial sampling for one example.

        Args:
            batch: Mapping with a ``"code"`` entry.
                # NOTE(review): assumes one example per call; the decoder
                # start ids only cover ``num_beams`` rows.

        Returns:
            The same ``batch`` with ``"pred_string"`` set to the decoded
            outputs.
        """
        encoder_input_ids, encoder_attention_mask = self._encode(batch['code'])

        # Every beam needs its own copy of the encoder input and of the mask
        # that hides the padding up to 512 tokens.
        expanded_ids = encoder_input_ids.repeat_interleave(self.num_beams, dim=0)
        expanded_mask = encoder_attention_mask.repeat_interleave(self.num_beams, dim=0)

        # Inference only: skip autograd bookkeeping for the encoder pass and
        # the decoding loop.
        with torch.no_grad():
            model_kwargs = {
                "encoder_outputs": self.model.get_encoder()(
                    expanded_ids, attention_mask=expanded_mask, return_dict=True
                ),
                "attention_mask": expanded_mask,
            }
            outputs = self.model.beam_sample(
                self.input_ids,
                self._make_beam_scorer(),
                logits_processor=self.logits_processor,
                logits_warper=self.logits_warper,
                **model_kwargs
            )

        batch['pred_string'] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return batch

In [5]:
def eval_compute(results):
    """Score a set of predictions against their reference docstrings.

    Args:
        results: Mapping with ``"pred_string"`` (predictions) and
            ``"docstring"`` (references).

    Returns:
        dict with ROUGE-2 precision/recall/F-measure (mid aggregate, rounded
        to 4 decimals), the sacreBLEU score and the METEOR score.
    """
    hypotheses = results["pred_string"]
    targets = results["docstring"]

    rouge2_mid = rouge.compute(
        predictions=hypotheses, references=targets, rouge_types=["rouge2"]
    )["rouge2"].mid
    # sacreBLEU takes one list of references per prediction.
    bleu_result = bleu.compute(predictions=hypotheses, references=[[target] for target in targets])
    meteor_result = meteor.compute(predictions=hypotheses, references=targets)

    scores = {}
    scores["rouge2_precision"] = round(rouge2_mid.precision, 4)
    scores["rouge2_recall"] = round(rouge2_mid.recall, 4)
    scores["rouge2_fmeasure"] = round(rouge2_mid.fmeasure, 4)
    scores["bleu_score"] = bleu_result["score"]
    scores["meteor_score"] = meteor_result["meteor"]
    return scores

In [6]:
def modelSetup(path):
    """Load a seq2seq checkpoint and align its generation config with the tokenizer.

    Args:
        path: Location passed to ``AutoModelForSeq2SeqLM.from_pretrained``.

    Returns:
        The loaded model, with its config updated in place: special-token
        ids taken from the notebook-level ``tokenizer``, ``vocab_size``
        copied from the encoder config, and ``num_beams`` set to 4.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(path)
    cfg = model.config

    # Special-token ids come from the tokenizer: CLS starts decoding, SEP ends it.
    cfg.decoder_start_token_id = tokenizer.cls_token_id
    cfg.eos_token_id = tokenizer.sep_token_id
    cfg.pad_token_id = tokenizer.pad_token_id

    cfg.vocab_size = cfg.encoder.vocab_size
    cfg.num_beams = 4

    return model

## No Augmentation 

In [7]:
# Load the test examples from a local JSON-lines file. The ["train"] key is
# the split name under which the loaded file is exposed here, even though the
# file holds test data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_set = datasets.load_dataset('json', data_files="D:\\PROJECT\\data\\CodeSearchNet\\py_clean\\test.jsonl")["train"]

Using custom data configuration default-3f4bdb1ea0486ba6
Reusing dataset json (C:\Users\Parry\.cache\huggingface\datasets\json\default-3f4bdb1ea0486ba6\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 1/1 [00:00<00:00, 250.18it/s]


In [19]:
# pandas copy of the test split, used for inspection and for the row-wise apply below.
frame_test = test_set.to_pandas()

In [20]:
# Preview the test split (columns: code, docstring).
frame_test

Unnamed: 0,code,docstring
0,"def sina_xml_to_url_list(xml_data):\n """"""st...",str->list\n Convert XML to URL List.\n F...
1,"def dailymotion_download(url, output_dir='.', ...",Downloads Dailymotion videos by URL.
2,"def sina_download(url, output_dir='.', merge=T...",Downloads Sina videos by URL.
3,"def sprint(text, *colors):\n """"""Format text...",Format text with color or other effects into A...
4,"def print_log(text, *colors):\n """"""Print a ...",Print a log message to standard error.
...,...,...
14913,"def from_grayscale(im, channels_on=(True, True...",Return a canvas from a grayscale image.\n\n ...
14914,"def get_uuid(length=32, version=1):\n """"""\n...",Returns a unique ID of a given length.\n Us...
14915,"def get_unique_key_from_get(get_dict):\n """"...",Build a unique key from get data
14916,"def get_domain(url):\n """""" Returns domain n...",Returns domain name portion of a URL


In [17]:
import pickle

# Reload previously saved scores for the small model (a dict of metric values,
# as the next cell's output shows).
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# results you produced yourself.
with open("D:\\PROJECT\\out\\original\\small\\results.pkl", 'rb') as f:
    frame = pickle.load(f)

In [18]:
# Show the reloaded scores.
frame

{'rouge2_precision': 0.0444,
 'rouge2_recall': 0.0092,
 'rouge2_fmeasure': 0.0132,
 'bleu_score': 0.0018771960314820039,
 'meteor_score': 0.0681162073537217}

In [8]:
# Checkpoint directories of the models trained without augmentation.
# Raw strings keep the Windows backslashes literal; without the r-prefix,
# sequences such as "\P", "\o", "\s" and "\m" are invalid escapes that Python
# only tolerates with a warning.
small_path = r"D:\PROJECT\out\original\small\model_out"
medium_path = r"D:\PROJECT\out\original\medium\model_out"

In [9]:
# Load both checkpoints and apply the shared generation config (see modelSetup).
small_model = modelSetup(small_path)
medium_model = modelSetup(medium_path)

In [64]:
# One wrapper per model; constructing a wrapper also moves its model onto the
# compute device.
small_tester = testWrapper(small_model)
medium_tester = testWrapper(medium_model)

In [65]:
# Row-wise pass over the DataFrame: each row is handed to generate_string.
medium_per_res = frame_test.apply(medium_tester.generate_string, axis=1)

KeyboardInterrupt: 

In [68]:
# Generate and score one example at a time (batched=False) with the small model.
# The recorded run took about 43 minutes for 14,918 examples.
small_per_res = test_set.map(small_tester.generate_string, batched=False)

100%|██████████| 14918/14918 [43:07<00:00,  5.77ex/s]


In [70]:
# Same per-example pass with the medium model. The recorded run was interrupted
# at 4% (estimated ~7.7 h remaining), so `medium_per_res` may be undefined.
medium_per_res = test_set.map(medium_tester.generate_string, batched=False)

  4%|▎         | 558/14918 [17:58<7:42:35,  1.93s/ex] 


KeyboardInterrupt: 

In [13]:
# Disabled aggregate scoring, kept inside a bare string literal so it never runs.
# NOTE(review): `small_res` / `medium_res` are not defined in the visible cells —
# confirm the intended inputs before re-enabling.
"""
small_per_scores = eval_compute(small_res)
medium_per_scores = eval_compute(medium_res)
"""

In [36]:
#medium_scores

{'rouge2_precision': 0.9949,
 'rouge2_recall': 0.5738,
 'rouge2_fmeasure': 0.6461,
 'bleu_score': 1.1251101518020312,
 'meteor_score': 0.5604892748778642}

In [24]:
#small_scores

{'rouge2_precision': 0.0444,
 'rouge2_recall': 0.0092,
 'rouge2_fmeasure': 0.0132,
 'bleu_score': 0.0018771960314820039,
 'meteor_score': 0.0681162073537217}

In [62]:
# Smoke test: generate a docstring for one hand-written snippet with the medium model.
inputs = tokenizer(['import tensorflow as tf mnist = tf.keras.datasets.mnist (x_train, y_train),(x_test, y_test) = mnist.load_data() x_train, x_test = x_train / 255.0, x_test / 255.0'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
# Follow the model's own device instead of assuming CUDA, so the cell also
# runs when the model lives on the CPU.
input_ids = inputs.input_ids.to(medium_model.device)
attention_mask = inputs.attention_mask.to(medium_model.device)
outputs = medium_model.generate(input_ids, attention_mask=attention_mask)
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output_str)

['The test function to be used for training.\n\n    Args:\n ']


## Augmentation

In [14]:
# TODO: placeholders — both checkpoint paths for the augmented models are still empty.
small_path = medium_path = ""