# Data Prep/Training/Evaluation Notebook for Reduced Caesar Cipher Shifts Seen in Training
### Purpose: download, process, and write out instruction-tuning examples for Caesar ciphers,
### then train a model on a training set that covers only part of the full range of cipher shifts

In [1]:
# Imports
import random
import re

from datasets import DatasetDict, load_dataset, load_from_disk
from transformers import T5Tokenizer

## Prep Data

In [2]:
# Enciphering/deciphering helpers
# Lookup table from each lowercase ASCII letter to its 0-based alphabet position ('a' -> 0, ..., 'z' -> 25)
char_to_num = {chr(ord('a') + position): position for position in range(26)}


def format_text(text):
    """Drop every character that is not an ASCII letter or a space, then lowercase what remains."""
    return re.sub(r'[^A-Za-z ]+', '', text).lower()


# NOTE: shift can be negative (left) or positive (right)
# If encode=True, encipher text, otherwise decipher
def caesar_cipher(original, shift, encode):
    """Apply a Caesar cipher to lowercase text.

    Args:
        original: Text consisting only of lowercase ASCII letters and spaces.
        shift: Number of alphabet positions to rotate by. Negative values shift left,
            positive values shift right; any magnitude wraps modulo 26.
        encode: If True, encipher ``original`` with ``shift``; otherwise apply the
            opposite shift to decipher it.

    Returns:
        The transformed string, with spaces left in place.

    Raises:
        KeyError: If ``original`` contains a character other than a-z or a space.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    offset = (shift if encode else -shift) % 26
    # Build the substitution table once per call rather than rebuilding and
    # linearly searching key/value lists for every single character.
    substitution = dict(zip(alphabet, alphabet[offset:] + alphabet[:offset]))
    substitution[' '] = ' '  # Preserve spaces
    # join avoids repeated string concatenation inside a loop
    return ''.join(substitution[char] for char in original)

In [3]:
# Download gigaword dataset from huggingface
# The trailing bare expression displays the loaded DatasetDict (its splits, features and row counts).
DATA_NAME = "gigaword"
gigaword = load_dataset(DATA_NAME)
gigaword

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 3803957
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 189651
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 1951
    })
})

In [4]:
# Reduce the length of GIGAWORD dataset to make training faster.
# The leading rows are taken with `select` instead of `filter`: `filter` evaluates its predicate on
# every row of every split (including all ~3.8M training rows) just to keep the first few thousand.
MAX_ROWS_PER_SPLIT = 38040  # Cap applied to every split
MAX_VALIDATION_ROWS = 2000  # Tighter cap applied to the validation split only


def _first_rows(split, limit):
    """Return the first `limit` rows of `split` (or the whole split if it is shorter)."""
    return split.select(range(min(limit, len(split))))


small_dataset = DatasetDict({
    split_name: _first_rows(split, MAX_ROWS_PER_SPLIT)
    for split_name, split in gigaword.items()
})
small_dataset['validation'] = _first_rows(small_dataset['validation'], MAX_VALIDATION_ROWS)
small_dataset

Filter:   0%|          | 0/3803957 [00:00<?, ? examples/s]

Filter:   0%|          | 0/189651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1951 [00:00<?, ? examples/s]

Filter:   0%|          | 0/38040 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 38040
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 1951
    })
})

In [5]:
# Get the base model tokenizer
# It is used by the preprocessing function below to tokenize both the instruction inputs and the plaintext labels.
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
## Define the preprocessing function
is_test = False  # Global var to control if enciphering for test dataset or not
def preprocess_function(examples):
    """Turn a batch of documents into tokenized Caesar-decipher instruction examples.

    Every document is cleaned with ``format_text`` and enciphered with a randomly drawn shift.
    The model input is an instruction naming that shift followed by the ciphertext, and the
    target is the cleaned plaintext. The shift range is chosen by the module-level ``is_test``
    flag: -25..25 when it is truthy, 0..13 otherwise.

    Returns the tokenizer output for the inputs, with the tokenized targets stored under "labels".
    """
    # NOTE: non-test data shift is only positive to ensure that negative wraparounds don't show examples
    # of effectively other positive shifts (e.g. shift of -2 = shift of 24)
    low, high = (-25, 25) if is_test else (0, 13)

    inputs, targets = [], []
    for doc in examples["document"]:
        shift = random.randint(low, high)
        plaintext = format_text(doc)
        instruction = f"Use a Caesar cipher with shift {shift} to decipher the following text: "
        inputs.append(instruction + caesar_cipher(plaintext, shift, True))
        targets.append(plaintext)

    # Tokenize
    # NOTE(review): inputs and targets are each truncated to 128 tokens independently, so a truncated
    # ciphertext can be paired with a target that continues past it - confirm this is intended.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    model_inputs["labels"] = tokenizer(text_target=targets, max_length=128, truncation=True)["input_ids"]
    return model_inputs

In [28]:
# Training/validation dataset encipherment
# `map` is called on the whole DatasetDict, so preprocess_function runs (in batches) over every split;
# the raw "document" and "summary" columns are dropped from the result.
tokenized_dataset = small_dataset.map(preprocess_function, batched=True, remove_columns=["document", "summary"])

Map:   0%|          | 0/38040 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

In [29]:
# Test dataset encipherment (different than above because we want to test additional cipher shifts unseen during training)
is_test = True
try:
    tokenized_dataset['test'] = small_dataset['test'].map(preprocess_function, batched=True, remove_columns=["document", "summary"])
finally:
    # Restore the flag so that re-running the training/validation encipherment cell
    # does not silently draw shifts from the test range.
    is_test = False

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

In [30]:
# Save dataset to disk
# Writes every split of the tokenized DatasetDict; the training and evaluation sections reload it from this same path.
tokenized_dataset.save_to_disk(dataset_dict_path='/home/as6734/langgen_class_project/data/half_caesar')

Saving the dataset (0/1 shards):   0%|          | 0/38040 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1951 [00:00<?, ? examples/s]

## Train Model

In [31]:
# Imports
from datasets import load_from_disk
import torch
from evaluate import load
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [34]:
# Check CUDA working
# Sanity check only: the bare expression displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [32]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

# model_max_length=128 is the same limit that was used as max_length when the dataset was tokenized
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=128)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
# Load saved dataset
# NOTE: this rebinds `small_dataset` to the already-tokenized dataset written by the data-prep section
# (same path as the save_to_disk call), not the raw-text subset that the name held earlier.
small_dataset = load_from_disk('/home/as6734/langgen_class_project/data/half_caesar')

In [35]:
# Define evaluation metric wrapper function using character error rate (CER)
cer = load("cer")
def compute_metrics(eval_preds):
    """Decode predicted and reference token ids to text and score them with CER.

    ``eval_preds`` is unpacked as a (predictions, labels) pair of token-id arrays.
    Returns a dict with the single key 'cer'.
    """
    def to_text(token_ids):
        # -100 entries are swapped for the pad token id so the array can be decoded
        decodable = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(decodable, skip_special_tokens=True)

    preds, labels = eval_preds
    hypotheses = to_text(preds)
    targets = to_text(labels)
    return {'cer': cer.compute(predictions=hypotheses, references=targets)}

In [36]:
# Global Parameters
L_RATE = 3e-4  # Learning rate
BATCH_SIZE = 8  # Per-device training batch size
PER_DEVICE_EVAL_BATCH = 8  # Per-device evaluation batch size
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3  # Passed as save_total_limit
NUM_EPOCHS = 3

# Set up training arguments
# Checkpoints are written under output_dir; the evaluation section later loads one of them from there.
# predict_with_generate=True is paired with compute_metrics, which decodes predicted token ids to text.
training_args = Seq2SeqTrainingArguments(
   output_dir='/home/as6734/langgen_class_project/results/half_caesar',
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [37]:
# Instantiate trainer
# Trains on the "train" split and evaluates on the "validation" split of the reloaded tokenized dataset;
# the "test" split is not passed to the trainer and is only used in the evaluation section.
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=small_dataset["train"],
   eval_dataset=small_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [39]:
# Train the model
# The returned TrainOutput (global step, training loss, runtime metrics) is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Cer
1,1.4872,1.306866,0.670902
2,0.9602,0.954309,0.628372
3,0.7794,0.892751,0.614901




TrainOutput(global_step=14265, training_loss=1.331752453258152, metrics={'train_runtime': 6796.2868, 'train_samples_per_second': 16.792, 'train_steps_per_second': 2.099, 'total_flos': 1.953612726534144e+16, 'train_loss': 1.331752453258152, 'epoch': 3.0})

## Evaluate

In [40]:
# Imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
from datasets import load_from_disk
import torch
from tqdm.notebook import tqdm
import re
import random

In [44]:
# Load model from fine tuning checkpoint
# NOTE(review): the training run above finished at global_step=14265, so checkpoint-14000 predates
# the end of training - confirm this is the checkpoint intended for evaluation.
last_checkpoint = '/home/as6734/langgen_class_project/results/half_caesar/checkpoint-14000'
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
# Last expression in the cell, so the model's module structure is also displayed as the cell output
finetuned_model.to("cuda")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [42]:
# Load dataset
# Same on-disk tokenized dataset that was used for training; only its 'test' split is read below.
dataset = load_from_disk('/home/as6734/langgen_class_project/data/half_caesar')

In [45]:
# Qualitative example within test dataset
# Fetch the row once, decode its stored token ids back to text, and compare the model's generation to the target.
example = dataset['test'][0]
input_string = tokenizer.decode(example['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids, max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(example['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -12 to decipher the following text: xodob g bsq qcfd obr iby qcadihsf qcfd ct hvs ibwhsr ghohsg gowr ksrbsgrom hvsm vor oufssr hc xcwb tcfqsg wb gidsfqcadi</s>'
Model Output: '<pad>oder v rogu bor vyud vyud vyudyuv juu vyudyuv won gusudhu vyudyuv won comfortably while ji jiangfuu won kuomintang pt vyudyuv vyudyuv won the unk unk unk unk vyudyuv vyudyuv vyudyuv'
True Output: 'japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'


In [54]:
# Qualitative example within test dataset and shift seen in training
# Fetch the row once, decode its stored token ids back to text, and compare the model's generation to the target.
example = dataset['test'][7]
input_string = tokenizer.decode(example['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids, max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(example['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 4 to decipher the following text: mwveip tvitevih wyrhec jsv tvmqi qmrmwxiv cmxdleo vefmr w wxexi jyrivep almgl ampp fi exxirhih fc e lswx sj asvph piehivw mrgpyhmrk yw</s>'
Model Output: '<pad> israel promised sunday for prime minister yitzhak rabin s state funeral which will be attended by a host of world leaders including us president george w bush and israeli prime minister ariel sharon</s>'
True Output: 'israel prepared sunday for prime minister yitzhak rabin s state funeral which will be attended by a host of world leaders including us president bill clinton and the jordanian and egyptian heads of state</s>'


In [67]:
# Qualitative example within test dataset and equivalent negative shift seen in training
# Fetch the row once, decode its stored token ids back to text, and compare the model's generation to the target.
example = dataset['test'][4]
input_string = tokenizer.decode(example['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids, max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(example['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -21 to decipher the following text: ymj gfsp tk ofufs fuujfqji yt knsfshnfq rfwpjyx yt wjrfns hfqr kwnifd ktqqtbnsl ymj zx ijhnxnts yt twijw ifnbf gf</s>'
Model Output: '<pad> the bank of japan appealed to financial markets to remain calm friday following the us decision to order daiwa bank to sell its stake in a unk bank in a bid to restructure the financial system</s>'
True Output: 'the bank of japan appealed to financial markets to remain calm friday following the us decision to order daiwa bank ltd to close its us operations</s>'


In [55]:
# Load error metrics
# Both are computed over the test-set predictions in the cells below.
cer = load("cer")
bleu = load("bleu")

In [69]:
# Metrics over entire test dataset
# Signed shift as written in the instruction prefix, e.g. "... with shift -12 to decipher ..."
SHIFT_PATTERN = re.compile(r"shift\s+(-?)\s*(\d+)")
# Angle- or square-bracketed markers (the tokenizer's special tokens) left in decoded text.
# Raw string: the previous non-raw pattern relied on invalid escape sequences.
SPECIAL_TOKEN_PATTERN = re.compile(r"[<\[].*?[>\]]")

predictions = []
references = []
in_predictions = []
in_references = []
out_predictions = []
out_references = []
test_split = dataset['test']
for i in tqdm(range(len(test_split))):
    example = test_split[i]  # Fetch the row once and reuse it for both input and label
    input_string = tokenizer.decode(example['input_ids'])

    # Get the shift value from the instruction itself rather than scanning the whole string for '-'
    shift_match = SHIFT_PATTERN.search(input_string)
    if shift_match is None:
        raise ValueError(f"No cipher shift found in model input: {input_string!r}")
    sign, magnitude = shift_match.groups()
    shift = -int(magnitude) if sign else int(magnitude)

    # Get pred and reference text
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids, max_length=128)
    pred = SPECIAL_TOKEN_PATTERN.sub("", tokenizer.decode(outputs[0])).strip()
    ref = SPECIAL_TOKEN_PATTERN.sub("", tokenizer.decode(example['labels'])).strip()

    # Add to corresponding list for metrics (pairs with an empty or single-character side are skipped)
    if len(ref) > 1 and len(pred) > 1:
        predictions.append(pred)
        references.append(ref)
        # Is within training distribution of shifts (if you include equivalent negatives);
        # Python's % always yields 0..25 here, so e.g. -21 % 26 == 5 counts as seen
        if shift % 26 <= 13:
            in_predictions.append(pred)
            in_references.append(ref)
        # Shift not seen in training data
        else:
            out_predictions.append(pred)
            out_references.append(ref)

  0%|          | 0/1951 [00:00<?, ?it/s]

In [70]:
# All test data metrics
# CER and BLEU over every prediction/reference pair kept by the loop above
# (pairs where either stripped text was at most one character long were skipped there).
cer_score = cer.compute(predictions=predictions, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions, references=references)
print(results)

CER Score: 0.627223594033461
{'bleu': 0.2500741925531097, 'precisions': [0.328927611132451, 0.2632311474224568, 0.22696018735362997, 0.19901610017889088], 'brevity_penalty': 1.0, 'length_ratio': 1.0879079132317746, 'translation_length': 57274, 'reference_length': 52646}


In [71]:
# Only seen in training
# Subset whose shift, taken modulo 26, is at most 13 - i.e. the training range 0..13 plus the
# negative shifts that are equivalent to one of those.
cer_score = cer.compute(predictions=in_predictions, references=in_references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=in_predictions, references=in_references)
print(results)

CER Score: 0.34732339810540236
{'bleu': 0.5034863882115634, 'precisions': [0.656563108195776, 0.5446100831522961, 0.47224171539961013, 0.4160230875538574], 'brevity_penalty': 0.9779727416472119, 'length_ratio': 0.9782118177972078, 'translation_length': 27746, 'reference_length': 28364}


In [72]:
# Never seen in training
# Subset whose shift, taken modulo 26, is greater than 13 - shifts with no equivalent in the training range.
cer_score = cer.compute(predictions=out_predictions, references=out_references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=out_predictions, references=out_references)
print(results)

CER Score: 0.9535392948013943
{'bleu': 0.0, 'precisions': [0.02106475209970198, 0.0008034653811220568, 3.606853020739405e-05, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.2160448068528127, 'translation_length': 29528, 'reference_length': 24282}
