# Evaluation Notebook
### Purpose of this notebook is to evaluate models for the decipher task

In [1]:
# Imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
from datasets import load_from_disk
import torch
from tqdm.notebook import tqdm
import re
import random

In [2]:
# Check CUDA working
print(torch.cuda.is_available())
device = torch.device('cuda')

True


In [3]:
# Load model from fine tuning checkpoint
last_checkpoint = '/home/as6734/langgen_class_project/results/caesar/checkpoint-14000'
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load dataset
dataset = load_from_disk('/home/as6734/langgen_class_project/data/caesar')

  table = cls._concat_blocks(blocks, axis=0)


In [5]:
finetuned_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

### Qualitative Examples

In [6]:
# Qualitative example outside of test dataset
input_text = "Use a Caesar cipher with shift 25 to decipher the following text: gdkkn lx mzld hr ztrshm"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(tokenizer.decode(outputs[0]))
print('True output: hello my name is austin')

<pad> helen mo nao is austin</s>
True output: hello my name is austin


In [7]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][0]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][0]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 17 to decipher the following text: argre j evt tfig reu leb tfdglkvi tfig fw kyv lezkvu jkrkvj jrzu nvuevjurp kyvp yru rxivvu kf afze wfitvj ze jlgvitfdglkvi jrcvj</s>'




Model Output: '<pad> japan s new corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'
True Output: 'japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'


In [8]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][1]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][1]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 1 to decipher the following text: uif tsj mbolbo hpwfsonfou po xfeoftebz boopvodfe uif dmptvsf pg hpwfsonfou tdippmt xjui jnnfejbuf fggfdu bt b njmjubsz dbnqbjho bhbjot</s>'
Model Output: '<pad> the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil tiger rebels continued</s>'
True Output: 'the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country</s>'


In [9]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][2]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][2]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -17 to decipher the following text: yxurln jaanbcnm oren jwcrwdlunja yaxcnbcnab cqdabmjh jocna cqnh bxdpqc cx mrbadyc uxjmrwp xo j oanwlq jwcjalcrl anbnjalq jwm bdy</s>'
Model Output: '<pad> police arrested five antigovernment protesters thursday after they sought to disrupt running of a french anticrime research and support center in the capital london police said</s>'
True Output: 'police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said</s>'


In [10]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][3]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][3]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 17 to decipher the following text: wrtkfip fiuvij wfi drelwrtklivu xffuj ifjv gvitvek ze jvgkvdsvi kyv tfddvitv uvgrikdvek jrzu yviv kylijurp</s>'
Model Output: '<pad> factory orders for manufactured goods rose percent in september the commerce department said here thursday</s>'
True Output: 'factory orders for manufactured goods rose percent in september the commerce department said here thursday</s>'


In [11]:
# Qualitative example within test dataset
input_string = tokenizer.decode(dataset['test'][4]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][4]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 10 to decipher the following text: dro lkxu yp tkzkx kzzokvon dy psxkxmskv wkbuodc dy bowksx mkvw pbsnki pyvvygsxq dro ec nomscsyx dy ybnob nksgk lkxu vdn </s>'
Model Output: '<pad> the bank of japan appealed to financial markets to remain calm friday following the us decision to order daiwa bank ltd to sell its assets in a bid to boost liquidity</s>'
True Output: 'the bank of japan appealed to financial markets to remain calm friday following the us decision to order daiwa bank ltd to close its us operations</s>'


### Metrics

In [12]:
cer = load("cer")
bleu = load("bleu")

In [13]:
# Character Error Rate example
predictions = ['police arrested five antigovernment protesters thursday after they sought to disrupt running of a french anticrime research and support center in the capital london police said']
references = ['police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said']
cer_score = cer.compute(predictions=predictions, references=references)
cer_score

0.28735632183908044

In [14]:
# BLEU example
predictions = ['police arrested five antigovernment protesters thursday after they sought to disrupt running of a french anticrime research and support center in the capital london police said']
references = ['police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said']
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.34756561191481233, 'precisions': [0.6538461538461539, 0.44, 0.2916666666666667, 0.17391304347826086], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 26, 'reference_length': 26}


In [17]:
# CER over test dataset
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    input_string = tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()
    ref = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(dataset['test'][i]['labels'])).strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
cer_score

  0%|          | 0/1951 [00:00<?, ?it/s]

0.3006733773432776

In [18]:
# BLEU over test data
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.5635670573146343, 'precisions': [0.704759646731077, 0.6107309454821939, 0.5501198424927238, 0.5002790365426256], 'brevity_penalty': 0.9606284763178234, 'length_ratio': 0.9613835808988337, 'translation_length': 50613, 'reference_length': 52646}


## Zero Shot Evaluation

In [19]:
base_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
base_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")
base_model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [20]:
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    input_string = base_tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = base_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = base_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", base_tokenizer.decode(outputs[0])).strip()
    ref = re.sub("[\<\[].*?[\>\]]", "", base_tokenizer.decode(dataset['test'][i]['labels'])).strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.9279454878314194
{'bleu': 0.00028700751106246983, 'precisions': [0.018694304533126694, 0.01641489235161175, 0.014958962139263966, 0.014094846571722214], 'brevity_penalty': 0.01799558505241833, 'length_ratio': 0.19929732442762826, 'translation_length': 10324, 'reference_length': 51802}


### Evaluate Long Training Fine Tuned Version

In [21]:
# Load model from fine tuning checkpoint
last_checkpoint = '/home/as6734/langgen_class_project/results/caesar_long/checkpoint-71000'
long_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
long_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
long_model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [22]:
# Qualitative example within test dataset
input_string = long_tokenizer.decode(dataset['test'][2]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = long_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = long_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{long_tokenizer.decode(outputs[0])}'")
print(f"True Output: '{long_tokenizer.decode(dataset['test'][2]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -17 to decipher the following text: yxurln jaanbcnm oren jwcrwdlunja yaxcnbcnab cqdabmjh jocna cqnh bxdpqc cx mrbadyc uxjmrwp xo j oanwlq jwcjalcrl anbnjalq jwm bdy</s>'
Model Output: '<pad> police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel in the arctic a police spokesman said</s>'
True Output: 'police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said</s>'


In [23]:
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    input_string = long_tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = long_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = long_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", long_tokenizer.decode(outputs[0])).strip()
    ref = re.sub("[\<\[].*?[\>\]]", "", long_tokenizer.decode(dataset['test'][i]['labels'])).strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.2357923049788349
{'bleu': 0.6950594722407475, 'precisions': [0.7982921856390173, 0.747893332212579, 0.7206694120739507, 0.6972323879231473], 'brevity_penalty': 0.9391677498239391, 'length_ratio': 0.9409451810204004, 'translation_length': 49537, 'reference_length': 52646}


### Caesar Ciphers with Out of Training Shifts

In [24]:
# Qualitative example outside of test dataset w/ small training model
input_text = "Use a Caesar cipher with shift 30 to decipher the following text: lipps qc reqi mw eywxmr"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(tokenizer.decode(outputs[0]))
print('True output: hello my name is austin')

<pad> hollywood cc nasa is okay</s>
True output: hello my name is austin


In [26]:
# Qualitative example outside of test dataset w/ long training model
input_text = "Use a Caesar cipher with shift 30 to decipher the following text: lipps qc reqi mw eywxmr"
input_ids = long_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = long_model.generate(input_ids.to("cuda"), max_length=128)
print(long_tokenizer.decode(outputs[0]))
print('True output: hello my name is austin')

<pad> helen mc name is augustin</s>
True output: hello my name is austin


In [27]:
# Qualitative example outside of test dataset w/ base model
input_text = "Use a Caesar cipher with shift 30 to decipher the following text: lipps qc reqi mw eywxmr"
input_ids = base_tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = base_model.generate(input_ids.to("cuda"), max_length=128)
print(base_tokenizer.decode(outputs[0]))
print('True output: hello my name is austin')

<pad> mw eywxmr</s>
True output: hello my name is austin


In [28]:
# Helper functions
# Enciphering/deciphering helpers
char_to_num = {
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    'e': 4,
    'f': 5,
    'g': 6,
    'h': 7,
    'i': 8,
    'j': 9,
    'k': 10,
    'l': 11,
    'm': 12,
    'n': 13,
    'o': 14,
    'p': 15,
    'q': 16,
    'r': 17,
    's': 18,
    't': 19,
    'u': 20,
    'v': 21,
    'w': 22,
    'x': 23,
    'y': 24,
    'z': 25,
}


# Remove all non alphabet text except spaces
def format_text(text):
    plaintext = re.sub(r'[^A-Za-z ]+', '', text)
    return plaintext.lower()


# NOTE: shift can be negative (left) or positive (right)
# If encode=True, encipher text, otherwise decipher
def caesar_cipher(original, shift, encode):
    if encode:
        myshift = shift
    else:
        myshift = shift * -1
    newtext = ''
    for i in original:
        if i == ' ':  # Preserve spaces
            newtext += ' '
        else:
            newnum = (char_to_num[i] + myshift) % 26
            newchar = list(char_to_num.keys())[list(char_to_num.values()).index(newnum)]
            newtext += newchar
    return newtext

In [31]:
# metrics for outside of test dataset w/ short training model
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    shift = random.choice(list(set([x for x in range(-100, 101)]) - set([x for x in range(-25, 26)])))
    prefix = f"Use a Caesar cipher with shift {shift} to decipher the following text: "
    plaintext = tokenizer.decode(dataset['test'][i]['labels'])
    plaintext = re.sub("[\<\[].*?[\>\]]", "", plaintext).strip()
    if len(plaintext) + len(prefix) > 128:
        over = len(plaintext) + len(prefix) - 128
        plaintext = plaintext[:-over]
    input_string = prefix + caesar_cipher(plaintext, shift, True)
    
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()
    ref = plaintext.strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.14365991763898422
{'bleu': 0.6710583413749363, 'precisions': [0.7983968909400049, 0.7062516769519721, 0.6325528474758968, 0.5685464762033715], 'brevity_penalty': 1.0, 'length_ratio': 1.0230604840713682, 'translation_length': 20585, 'reference_length': 20121}


In [32]:
# metrics for outside of test dataset w/ long training model
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    shift = random.choice(list(set([x for x in range(-100, 101)]) - set([x for x in range(-25, 26)])))
    prefix = f"Use a Caesar cipher with shift {shift} to decipher the following text: "
    plaintext = long_tokenizer.decode(dataset['test'][i]['labels'])
    plaintext = re.sub("[\<\[].*?[\>\]]", "", plaintext).strip()
    if len(plaintext) + len(prefix) > 128:
        over = len(plaintext) + len(prefix) - 128
        plaintext = plaintext[:-over]
    input_string = prefix + caesar_cipher(plaintext, shift, True)
    
    input_ids = long_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = long_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", long_tokenizer.decode(outputs[0])).strip()
    ref = plaintext.strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.5232511258846236
{'bleu': 0.5544523032577391, 'precisions': [0.6168406871000337, 0.5768565248738284, 0.5367384901565648, 0.4948267917731328], 'brevity_penalty': 1.0, 'length_ratio': 1.4747665408305186, 'translation_length': 29690, 'reference_length': 20132}


In [33]:
# metrics for outside of test dataset w/ base model
predictions = []
references = []
for i in tqdm(range(len(dataset['test']))):
    shift = random.choice(list(set([x for x in range(-100, 101)]) - set([x for x in range(-25, 26)])))
    prefix = f"Use a Caesar cipher with shift {shift} to decipher the following text: "
    plaintext = base_tokenizer.decode(dataset['test'][i]['labels'])
    plaintext = re.sub("[\<\[].*?[\>\]]", "", plaintext).strip()
    if len(plaintext) + len(prefix) > 128:
        over = len(plaintext) + len(prefix) - 128
        plaintext = plaintext[:-over]
    input_string = prefix + caesar_cipher(plaintext, shift, True)
    
    input_ids = base_tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = base_model.generate(input_ids.to("cuda"), max_length=128)
    pred = re.sub("[\<\[].*?[\>\]]", "", base_tokenizer.decode(outputs[0])).strip()
    ref = plaintext.strip()
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

  0%|          | 0/1951 [00:00<?, ?it/s]

CER Score: 0.9598249661025903
{'bleu': 0.00287753338434403, 'precisions': [0.034015966678236725, 0.034420289855072464, 0.030470914127423823, 0.02831923501287238], 'brevity_penalty': 0.0907620477004479, 'length_ratio': 0.2941596896058811, 'translation_length': 5762, 'reference_length': 19588}


### In Context Learning

In [None]:
#TODO

In [34]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3803957
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189651
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1951
    })
})

In [37]:
inputs = []
refs = []
for i in tqdm(range(len(dataset['test']))):
    input_string = tokenizer.decode(dataset['test'][i]['input_ids'])
    input_string = re.sub("[\<\[].*?[\>\]]", "", input_string).strip()
    inputs.append(len(input_string))
    ref_string = tokenizer.decode(dataset['test'][i]['labels'])
    ref_string = re.sub("[\<\[].*?[\>\]]", "", ref_string).strip()    
    refs.append(len(ref_string))

  0%|          | 0/1951 [00:00<?, ?it/s]

In [38]:
sum(inputs) / len(inputs)

183.46386468477704

In [39]:
sum(refs) / len(refs)

162.73910814966683

In [40]:
prefix = f"Use a Caesar cipher with shift 2 to decipher the following text: "
len(prefix)

65

In [43]:
input_string = tokenizer.decode(dataset['test'][0]['input_ids'])
input_string = re.sub("[\<\[].*?[\>\]]", "", input_string).strip()
input_string

'Use a Caesar cipher with shift 17 to decipher the following text: argre j evt tfig reu leb tfdglkvi tfig fw kyv lezkvu jkrkvj jrzu nvuevjurp kyvp yru rxivvu kf afze wfitvj ze jlgvitfdglkvi jrcvj'

In [44]:
len('Use a Caesar cipher with shift 17 to decipher the following text:')

65

In [46]:
import numpy as np
np.mean(inputs)

183.46386468477704

In [47]:
inputs = []
refs = []
count = 0
for i in tqdm(range(len(dataset['train']))):
    input_string = tokenizer.decode(dataset['train'][i]['input_ids'])
    input_string = re.sub("[\<\[].*?[\>\]]", "", input_string).strip()
    inputs.append(len(input_string))
    ref_string = tokenizer.decode(dataset['train'][i]['labels'])
    ref_string = re.sub("[\<\[].*?[\>\]]", "", ref_string).strip()    
    refs.append(len(ref_string))
    count += 1
    if count >= 10000:
        break
print(np.mean(inputs))
print(np.mean(refs))

  0%|          | 0/3803957 [00:00<?, ?it/s]

191.5129
172.9865


In [48]:
inputs = []
refs = []
count = 0
for i in tqdm(range(len(dataset['validation']))):
    input_string = tokenizer.decode(dataset['validation'][i]['input_ids'])
    input_string = re.sub("[\<\[].*?[\>\]]", "", input_string).strip()
    inputs.append(len(input_string))
    ref_string = tokenizer.decode(dataset['validation'][i]['labels'])
    ref_string = re.sub("[\<\[].*?[\>\]]", "", ref_string).strip()    
    refs.append(len(ref_string))
    count += 1
    if count >= 10000:
        break
print(np.mean(inputs))
print(np.mean(refs))

  0%|          | 0/189651 [00:00<?, ?it/s]

191.4036
172.3924
