# Demo Notebook
### Purpose: Show example prompting and generations of fine tuned model

In [1]:
# Imports
import re
import random
from datasets import load_from_disk
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Enciphering/deciphering helpers
char_to_num = {
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    'e': 4,
    'f': 5,
    'g': 6,
    'h': 7,
    'i': 8,
    'j': 9,
    'k': 10,
    'l': 11,
    'm': 12,
    'n': 13,
    'o': 14,
    'p': 15,
    'q': 16,
    'r': 17,
    's': 18,
    't': 19,
    'u': 20,
    'v': 21,
    'w': 22,
    'x': 23,
    'y': 24,
    'z': 25,
}


# Remove all non alphabet text except spaces
def format_text(text):
    plaintext = re.sub(r'[^A-Za-z ]+', '', text)
    return plaintext.lower()


# NOTE: shift can be negative (left) or positive (right)
# If encode=True, encipher text, otherwise decipher
def caesar_cipher(original, shift, encode):
    if encode:
        myshift = shift
    else:
        myshift = shift * -1
    newtext = ''
    for i in original:
        if i == ' ':  # Preserve spaces
            newtext += ' '
        else:
            newnum = (char_to_num[i] + myshift) % 26
            newchar = list(char_to_num.keys())[list(char_to_num.values()).index(newnum)]
            newtext += newchar
    return newtext


# Randomly enciphers on word level
def random_caesar_encipher(original, shift, partial):
    newtext = ''
    cipher_new = False
    for i in original:
        if i == ' ':  # Preserve spaces
            newtext += ' '
            # Check if next word should be enciphered or not
            if random.uniform(0, 1) <= partial:
                cipher_new = True
            else:
                cipher_new = False
        else:
            # Only encipher if word has been selected
            if cipher_new:
                newnum = (char_to_num[i] + shift) % 26
                newchar = list(char_to_num.keys())[list(char_to_num.values()).index(newnum)]
                newtext += newchar
            # Keep old char in place
            else:
                newtext += i
    return newtext

In [3]:
# Name of model/data
exp_name = 'caesar'
# exp_name = 'caesar_long'
# exp_name = 'half_caesar'
# exp_name = 'partial_caesar'
# exp_name = 'partial_word_caesar'

In [4]:
# Load data
dataset = load_from_disk(f"/home/as6734/langgen_class_project/data/{exp_name}")

  table = cls._concat_blocks(blocks, axis=0)


In [5]:
# Load model
last_checkpoint = f"/home/as6734/langgen_class_project/results/{exp_name}/checkpoint-14000"
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
finetuned_model.to("cuda")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [6]:
# Example using test set
test_set_index = 0
input_string = tokenizer.decode(dataset['test'][test_set_index]['input_ids'])
print(f"Model Input: '{input_string}'")
input_ids = tokenizer(input_string, return_tensors="pt").input_ids.to("cuda")
outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[test_set_index])}'")
print(f"True Output: '{tokenizer.decode(dataset['test'][test_set_index]['labels'])}'")

Model Input: 'Use a Caesar cipher with shift 17 to decipher the following text: argre j evt tfig reu leb tfdglkvi tfig fw kyv lezkvu jkrkvj jrzu nvuevjurp kyvp yru rxivvu kf afze wfitvj ze jlgvitfdglkvi jrcvj</s>'




Model Output: '<pad> japan s new corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'
True Output: 'japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales</s>'


In [10]:
# Example using user defined text
shift = 1
message = "Try to translate a simple message just like this one!"

message = format_text(message)
print(f"Stripped Message: '{message}'")

ctext = caesar_cipher(message, shift, encode=True)
print(f"Enciphered Message: '{ctext}'")

input_text = f"Use a Caesar cipher with shift {shift} to decipher the following text: {ctext}"
print(f"Model Input: '{input_text}'")
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = finetuned_model.generate(input_ids.to("cuda"), max_length=128)
print(f"Model Output: '{tokenizer.decode(outputs[0])}'")
print(f"True output: '{message}'")

Stripped Message: 'try to translate a simple message just like this one'
Enciphered Message: 'usz up usbotmbuf b tjnqmf nfttbhf kvtu mjlf uijt pof'
Model Input: 'Use a Caesar cipher with shift 1 to decipher the following text: usz up usbotmbuf b tjnqmf nfttbhf kvtu mjlf uijt pof'
Model Output: '<pad> try to transform a sizable message just like this one</s>'
True output: 'try to translate a simple message just like this one'
