# Data Prep/Training/Evaluation Notebook for Caesar Cipher w/ Partially Obfuscated Text
### Purpose is to download, process, and write out instruction tuning examples for Caesar ciphers
### Data has had a percentage of characters enciphered at random from the original message
### Then train a model and evaluate

## Prep Data

In [2]:
# Imports
import re
import random
from datasets import load_from_disk, load_dataset
from transformers import T5Tokenizer

In [3]:
# Enciphering/deciphering helpers
char_to_num = {
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    'e': 4,
    'f': 5,
    'g': 6,
    'h': 7,
    'i': 8,
    'j': 9,
    'k': 10,
    'l': 11,
    'm': 12,
    'n': 13,
    'o': 14,
    'p': 15,
    'q': 16,
    'r': 17,
    's': 18,
    't': 19,
    'u': 20,
    'v': 21,
    'w': 22,
    'x': 23,
    'y': 24,
    'z': 25,
}


# Remove all non alphabet text except spaces
def format_text(text):
    plaintext = re.sub(r'[^A-Za-z ]+', '', text)
    return plaintext.lower()


# NOTE: shift can be negative (left) or positive (right)
# If encode=True, encipher text, otherwise decipher
def caesar_cipher(original, shift, encode):
    if encode:
        myshift = shift
    else:
        myshift = shift * -1
    newtext = ''
    for i in original:
        if i == ' ':  # Preserve spaces
            newtext += ' '
        else:
            newnum = (char_to_num[i] + myshift) % 26
            newchar = list(char_to_num.keys())[list(char_to_num.values()).index(newnum)]
            newtext += newchar
    return newtext


# Randomly enciphers
def random_caesar_encipher(original, shift, partial):
    newtext = ''
    for i in original:
        if i == ' ':  # Preserve spaces
            newtext += ' '
        else:
            # Generate rand float [0, 1) and encipher char only if value <= partial
            if random.uniform(0, 1) <= partial:
                newnum = (char_to_num[i] + shift) % 26
                newchar = list(char_to_num.keys())[list(char_to_num.values()).index(newnum)]
                newtext += newchar
            # If rand value not large enough, keep old char in place
            else:
                newtext += i
    return newtext

In [4]:
# Download gigaword dataset from huggingface
DATA_NAME = "gigaword"
gigaword = load_dataset(DATA_NAME)
gigaword

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 3803957
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 189651
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 1951
    })
})

In [5]:
# Reduce the length of GIGAWORD dataset to make training faster
small_dataset = gigaword.filter(lambda example, idx: idx < 38040, with_indices=True)
small_dataset['validation'] = small_dataset['validation'].filter(lambda example, idx: idx < 2000, with_indices=True)
small_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 38040
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 1951
    })
})

In [6]:
# Get the base model tokenizer
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
## Define the preprocessing function
partial_frac = 0.5  # The fraction of text to be enciphered at random per image
def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""
    # Create lists of data of instructions w/ ciphered text and the corresponding plaintext
    inputs = []
    targets = []
    for doc in examples["document"]:
        shift = random.randint(-25, 25)
        prefix = f"Use a Caesar cipher with shift {shift} to decipher the following text: "
        text = format_text(doc)
        inputs.append(prefix + random_caesar_encipher(text, shift, partial_frac))
        targets.append(text)

    # Tokenize
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Preprocess/tokenize data
tokenized_dataset = small_dataset.map(preprocess_function, batched=True, remove_columns=["document", "summary"])

Map:   0%|          | 0/38040 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

In [9]:
# Save dataset to disk
tokenized_dataset.save_to_disk(dataset_dict_path='/home/as6734/langgen_class_project/data/partial_caesar')

Saving the dataset (0/1 shards):   0%|          | 0/38040 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1951 [00:00<?, ? examples/s]

## Train Model

In [10]:
# Imports
from datasets import load_from_disk
import torch
from evaluate import load
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [11]:
# Check CUDA working
torch.cuda.is_available()

True

In [12]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=128)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Load saved dataset
small_dataset = load_from_disk('/home/as6734/langgen_class_project/data/partial_caesar')

In [14]:
# Define evaluation metric wrapper function using character error rate (CER)
cer = load("cer")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # decode preds and labels
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {'cer': cer.compute(predictions=decoded_preds, references=decoded_labels)}

In [15]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 8
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir='/home/as6734/langgen_class_project/results/partial_caesar',
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [16]:
# Instantiate trainer
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=small_dataset["train"],
   eval_dataset=small_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [18]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Cer
1,1.5574,1.376153,0.7135
2,1.0704,1.05831,0.66812
3,0.8665,0.962397,0.656876




TrainOutput(global_step=14265, training_loss=1.3368738986716286, metrics={'train_runtime': 6806.8836, 'train_samples_per_second': 16.765, 'train_steps_per_second': 2.096, 'total_flos': 1.95498224123904e+16, 'train_loss': 1.3368738986716286, 'epoch': 3.0})

## Evaluate

In [19]:
# Imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
from datasets import load_from_disk
import torch
from tqdm.notebook import tqdm
import re
import random

In [22]:
# Load model from fine tuning checkpoint
last_checkpoint = '/home/as6734/langgen_class_project/results/partial_caesar/checkpoint-14000'
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
finetuned_model.to("cuda")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [21]:
# Load dataset
dataset = load_from_disk('/home/as6734/langgen_class_project/data/partial_caesar')

In [23]:
# Qualitative example 1
input_string = tokenizer.decode(dataset['test'][0]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][0]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -23 to decipher the following text: jdsan s qef coup dqg xqn comsuwer cous rf the unlthd swatev sald zednesgab theb hdd agreeg wr join irrces iq vxshrfopsxweu vdlev</s>'




Model Output: '<pad> japan s new coup and unk commander both of the united states said wednesday they had agreed to join forces in switzerland saeco</s>'
True Output: 'japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'


In [24]:
# Qualitative example 2
input_string = tokenizer.decode(dataset['test'][1]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][1]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift -23 to decipher the following text: tke vrl oankaq joyhrnpeqt rq zhgqesdab dnqoxnchd wke forsxue of joveunphqw vchooov zith ipmhgldwh eiiecw ds d mioiwaub cdmsdlgn djdinvt</s>'
Model Output: '<pad> the sri lankan government on wednesday announced the formation of government schools with private schools as a military campaign against terrorism</s>'
True Output: 'the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country</s>'


In [25]:
# Qualitative example 3
input_string = tokenizer.decode(dataset['test'][2]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][2]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 20 to decipher the following text: piliwe arlysnyx zivy uhnihowlyal jrotymtyrs nhormduy afnel nhyy siogbn no xcslopt louxiha iz u fryhwh untalcncc lyseulwh anx supjly vesmyl a mjokesguh zor</s>'
Model Output: '<pad> police arrested five antigovernment protesters thursday after they sought to disrupt holding of a french antiterror newspaper and supply video a spokesman for the spokesman said</s>'
True Output: 'police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said</s>'


In [26]:
# Load error metrics
cer = load("cer")
bleu = load("bleu")

In [27]:
# Construct test dataset w/ 25% encipherment percentage
g_test = gigaword['test']
partial_frac = 0.25
low_test = g_test.map(preprocess_function, batched=True, remove_columns=["document", "summary"])
partial_frac = 0.75
med_test = g_test.map(preprocess_function, batched=True, remove_columns=["document", "summary"])
partial_frac = 0.9
high_test = g_test.map(preprocess_function, batched=True, remove_columns=["document", "summary"])

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

In [29]:
# Metrics over test datasets
predictions_25 = []
predictions_50 = []
predictions_75 = []
predictions_90 = []
references = []
for i in tqdm(range(len(dataset['test']))):
    # Partial = 0.5 (same as training)
    input_string = tokenizer.decode(dataset['test'][i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred_50 = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()

    # Partial = 0.25
    input_string = tokenizer.decode(low_test[i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred_25 = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()

    # Partial = 0.75
    input_string = tokenizer.decode(med_test[i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred_75 = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()

    # Partial = 0.9
    input_string = tokenizer.decode(high_test[i]['input_ids'])
    input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
    outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
    pred_90 = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(outputs[0])).strip()

    # Reference is the same so only need to do this once!
    ref = re.sub("[\<\[].*?[\>\]]", "", tokenizer.decode(dataset['test'][i]['labels'])).strip()

    # Add to corresponding list for metrics
    if len(ref) > 1 and len(pred_50) > 1:
        predictions_25.append(pred_25)
        predictions_50.append(pred_50)
        predictions_75.append(pred_75)
        predictions_90.append(pred_90)
        references.append(ref)

  0%|          | 0/1951 [00:00<?, ?it/s]

In [30]:
# Test data metrics for partial_frac = 0.25
cer_score = cer.compute(predictions=predictions_25, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions_25, references=references)
print(results)

CER Score: 0.1966337431969361
{'bleu': 0.5787161801952778, 'precisions': [0.7663049774105108, 0.6334363213225263, 0.5363919096620676, 0.46009961502490376], 'brevity_penalty': 0.9836847770016477, 'length_ratio': 0.9838164342970026, 'translation_length': 51794, 'reference_length': 52646}


In [31]:
# Test data metrics for partial_frac = 0.5 (same as training)
cer_score = cer.compute(predictions=predictions_50, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions_50, references=references)
print(results)

CER Score: 0.3540049889135255
{'bleu': 0.399262339741416, 'precisions': [0.6206869025524356, 0.4619904898357524, 0.3652077969841854, 0.29712582691743245], 'brevity_penalty': 0.9506306928884153, 'length_ratio': 0.9518102040041029, 'translation_length': 50109, 'reference_length': 52646}


In [32]:
# Test data metrics for partial_frac = 0.75
cer_score = cer.compute(predictions=predictions_75, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions_75, references=references)
print(results)

CER Score: 0.4312953537593227
{'bleu': 0.31911857326521176, 'precisions': [0.5519291509258686, 0.38596717644275236, 0.2939803384656821, 0.2343445719246381], 'brevity_penalty': 0.9168543699378167, 'length_ratio': 0.920126885233446, 'translation_length': 48441, 'reference_length': 52646}


In [33]:
# Test data metrics for partial_frac = 0.9
cer_score = cer.compute(predictions=predictions_90, references=references)
print(f'CER Score: {cer_score}')
results = bleu.compute(predictions=predictions_90, references=references)
print(results)

CER Score: 0.4620036283007458
{'bleu': 0.2955605861343824, 'precisions': [0.5222224501097278, 0.3561646762236418, 0.2680350337634552, 0.21059618071727992], 'brevity_penalty': 0.9233352378300547, 'length_ratio': 0.9261292405880789, 'translation_length': 48757, 'reference_length': 52646}
