# Evaluation Notebook
### Purpose of this notebook is to evaluate models for the decipher task

In [34]:
# Imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
from datasets import load_from_disk
import torch
from tqdm.notebook import tqdm

In [10]:
# Check CUDA working
print(torch.cuda.is_available())
device = torch.device('cuda')

True


In [2]:
# Load model from fine tuning checkpoint
last_checkpoint = '/home/as6734/langgen_class_project/results/caesar/checkpoint-14000'
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Load dataset
dataset = load_from_disk('/home/as6734/langgen_class_project/data/caesar')

  table = cls._concat_blocks(blocks, axis=0)


In [11]:
finetuned_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

### Qualitative Examples

In [13]:
# Qualitative example outside of test dataset
input_text = "Use a Caesar cipher with shift 25 to decipher the following text: gdkkn lx mzld hr ztrshm"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(input_ids.to("cuda"))
print(tokenizer.decode(outputs[0]))
print('True output: hello my name is austin')

<pad> helen mo nao is austin</s>
True output: hello my name is austin


In [25]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][0]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"))
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][0]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 17 to decipher the following text: argre j evt tfig reu leb tfdglkvi tfig fw kyv lezkvu jkrkvj jrzu nvuevjurp kyvp yru rxivvu kf afze wfitvj ze jlgvitfdglkvi jrcvj</s>'




Model Output: '<pad>japan s new corp and unk computer corp of the united states said wednes'
True Output: 'japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'


In [27]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][1]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"))
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][1]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 1 to decipher the following text: uif tsj mbolbo hpwfsonfou po xfeoftebz boopvodfe uif dmptvsf pg hpwfsonfou tdippmt xjui jnnfejbuf fggfdu bt b njmjubsz dbnqbjho bhbjot</s>'
Model Output: '<pad>the sri lankan government on wednesday announced the closure of government'
True Output: 'the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country</s>'


In [28]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][2]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"))
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][2]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -17 to decipher the following text: yxurln jaanbcnm oren jwcrwdlunja yaxcnbcnab cqdabmjh jocna cqnh bxdpqc cx mrbadyc uxjmrwp xo j oanwlq jwcjalcrl anbnjalq jwm bdy</s>'
Model Output: '<pad>police arrested five antigovernment protesters thursday after they sought to disrupt running of'
True Output: 'police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said</s>'


### Metrics

In [31]:
cer = load("cer")
bleu = load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [30]:
# Character Error Rate example
predictions = ['police arrested five antigovernment protesters thursday after they sought to disrupt running of']
references = ['police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said']
cer_score = cer.compute(predictions=predictions, references=references)
cer_score

0.5344827586206896

In [33]:
# BLEU example
predictions = ['police arrested five antigovernment protesters thursday after they sought to disrupt running of']
references = ['police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said']
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.21789620965758838, 'precisions': [0.8461538461538461, 0.6666666666666666, 0.5454545454545454, 0.4], 'brevity_penalty': 0.36787944117144233, 'length_ratio': 0.5, 'translation_length': 13, 'reference_length': 26}


In [35]:
# CER over test dataset
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    input_string = tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"))
    predictions.append(tokenizer.decode(outputs[0]))
    references.append(tokenizer.decode(dataset['test'][i]['labels']))
cer_score = cer.compute(predictions=predictions, references=references)
cer_score

  0%|          | 0/1951 [00:00<?, ?it/s]

0.6369809534348986

In [36]:
# BLEU over test dataset
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.20935406875386808, 'precisions': [0.8194805194805195, 0.6153282800541946, 0.5498856376685859, 0.49245952065621396], 'brevity_penalty': 0.3443969524159974, 'length_ratio': 0.4840363937138131, 'translation_length': 29260, 'reference_length': 60450}


## Zero Shot Evaluation

In [37]:
base_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
base_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")
base_model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [38]:
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    input_string = base_tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = base_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = base_model.generate(input_ids.to("cuda"))
    predictions.append(base_tokenizer.decode(outputs[0]))
    references.append(base_tokenizer.decode(dataset['test'][i]['labels']))
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.9260700628327616
{'bleu': 0.011973073492356814, 'precisions': [0.4341629867523083, 0.2885725278953444, 0.22718218036943136, 0.1404376993291543], 'brevity_penalty': 0.04761723909245882, 'length_ratio': 0.24724565756823821, 'translation_length': 14946, 'reference_length': 60450}


### In Context Learning

In [None]:
#TODO