# Data Preparation

## Data exploration

In [1]:
!wget https://raw.githubusercontent.com/Skyhao6/IST664_Final_Proj/main/modern.nltktok
!wget https://raw.githubusercontent.com/Skyhao6/IST664_Final_Proj/main/original.nltktok

--2024-02-28 14:10:09--  https://raw.githubusercontent.com/Skyhao6/IST664_Final_Proj/main/modern.nltktok
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1040850 (1016K) [text/plain]
Saving to: ‘modern.nltktok’


2024-02-28 14:10:10 (15.8 MB/s) - ‘modern.nltktok’ saved [1040850/1040850]

--2024-02-28 14:10:10--  https://raw.githubusercontent.com/Skyhao6/IST664_Final_Proj/main/original.nltktok
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1109065 (1.1M) [text/plain]
Saving to: ‘original.nltktok’


2024-02-28 14:10:10 (15.5 MB/s) -

In [2]:
# Read in the data
# https://github.com/harsh19/Shakespearizing-Modern-English/tree/master/data
# source: modern.nltktok
# target: original.nltktok
# Task: Translate middle English to modern English
source = []
target = []

with open('original.nltktok', 'r', encoding='utf-8') as f:
    content = f.read()
    source = content.split('\n')

with open('modern.nltktok', 'r', encoding='utf-8') as f:
    content = f.read()
    target = content.split('\n')

# Add start and end tokens to target
for i in range(len(target)):
    target[i] = '[start] ' + target[i] + ' [end]'

# Examples
print('number of source sentences: ', len(source))
print('number of target sentences: ', len(target))
print('1st three sentences in the source and target: ')
print(source[:3])
print(target[:3])

number of source sentences:  21076
number of target sentences:  21076
1st three sentences in the source and target: 
["I have a mind to strike thee ere thou speak'st .", "Yet if thou say Antony lives , is well , Or friends with Caesar , or not captive to him , I'll set thee in a shower of gold and hail Rich pearls upon thee .", "Madam , he's well ."]
['[start] I have half a mind to hit you before you speak again . [end]', "[start] But if Antony is alive , healthy , friendly with Caesar , and not Caesar's prisoner , I'll shower you with gold and pearls . [end]", "[start] Madam , he's well . [end]"]


In [None]:
# Statistical information
min_len = 100000
max_len = 0
for sentence in source:
    if len(sentence) < min_len:
        min_len = len(sentence.split())
    if len(sentence) > max_len:
        max_len = len(sentence.split())

print("min length of source: ", min_len)
print("max length of source: ", max_len)

min_len = 100000
max_len = 0
for sentence in target:
    if len(sentence) < min_len:
        min_len = len(sentence.split())
    if len(sentence) > max_len:
        max_len = len(sentence.split())

print("min length of target: ", min_len)
print("max length of target: ", max_len)

min length of source:  0
max length of source:  19
min length of target:  2
max length of target:  18


## Tokenization

In [None]:
from keras.layers import TextVectorization
import string
import tensorflow as tf
import re

# Vocabulary size of source and target
INPUT_MAX_TOKENS = 13000
OUTPUT_MAX_TOKENS = 10000
# Max length of source and target sentence
# If the sentence is shorter than MAX_LEN, it will be padded with 0
# If the sentence is longer than MAX_LEN, it will be truncated
MAX_LEN = 30
batch_size = 64

# Build source tokenizer
source_tokenizer = TextVectorization(
    max_tokens=INPUT_MAX_TOKENS,
    output_mode='int',
    output_sequence_length=MAX_LEN)

source_tokenizer.adapt(source)


# Build target tokenizer
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

target_tokenizer = TextVectorization(
    max_tokens=OUTPUT_MAX_TOKENS,
    output_sequence_length=MAX_LEN + 1,
    standardize=custom_standardization,
    )
target_tokenizer.adapt(target)

In [None]:
source_vocab_size = len(source_tokenizer.get_vocabulary())
target_vocab_size = len(target_tokenizer.get_vocabulary())
print("source_vocab_size: ", source_vocab_size)
print("target_vocab_size: ", target_vocab_size)

source_vocab_size:  12443
target_vocab_size:  9977


In [None]:
# Example
a = "[start] What o'clock tomorrow Shall I send to thee ? [end]"
print(custom_standardization(a).numpy().decode('utf-8'))

[start] what oclock tomorrow shall i send to thee  [end]


In [None]:
# Randomly shuffle data
import random

random.seed(1)
# Combine the lists into pairs
combined_lists = list(zip(source, target))

# Shuffle the combined list
random.shuffle(combined_lists)

# Unzip the shuffled list back into separate lists
source, target = zip(*combined_lists)

source = list(source)
target = list(target)
print(source[:3])
print(target[:3])

['Let him be the devil , an he will , I care not .', 'Niggard of question , but of our demands Most free in his reply .', 'Wherefore ?']
["[start] Let him be the devil if he wants to , I don't care . [end]", "[start] He didn't ask questions , but answered ours at length . [end]", '[start] A reason ? [end]']


In [None]:
# Split data into training and test set
print("Total number of source sentences: ", len(source))
print("Total number of target sentences: ", len(source))

source_train = source[:int(len(source)*0.9)]
target_train = target[:int(len(target)*0.9)]

source_test = source[int(len(source)*0.9):]
target_test = target[int(len(target)*0.9):]


Total number of source sentences:  21076
Total number of target sentences:  21076


In [None]:
src_word_to_idx = dict([(v,k) for k, v in enumerate(source_tokenizer.get_vocabulary())])
src_idx_to_word = dict([(k,v) for k, v in enumerate(source_tokenizer.get_vocabulary())])
tar_word_to_idx = dict([(v,k) for k, v in enumerate(target_tokenizer.get_vocabulary())])
tar_idx_to_word = dict([(k,v) for k, v in enumerate(target_tokenizer.get_vocabulary())])

In [None]:
encoder_input_data = source_tokenizer(source_train)
decoder_input_data = target_tokenizer(target_train)[:, :-1]
decoder_output_data = target_tokenizer(target_train)[:, 1:]

In [None]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_output_data.shape)

(18968, 30)
(18968, 30)
(18968, 30)


In [None]:
# Examples
# Encoder input: i have half a mind to hit you before you speak again
# Decoder input: [start] i have a mind to strike thee ere thou speakst
# Decoder output: i have a mind to strike thee ere thou speakst [end]
print("Source: ", source_train[664])
print("Target: ", target_train[664])



print("Decoder input: ", decoder_input_data[664])
print("Decoder output: ", decoder_output_data[664])


Source:  For us , you know Whose he is we are , and that is Caesar's .
Target:  [start] As for us , you know we are Antony's , and he is Caesar's . [end]
Decoder input:  tf.Tensor(
[   2   24   18   91    4   69   47   28 1318    8   22   14  482    3
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(30,), dtype=int64)
Decoder output:  tf.Tensor(
[  24   18   91    4   69   47   28 1318    8   22   14  482    3    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(30,), dtype=int64)


# Modeling

In [None]:
# Hyperparameters

# d_model
embedding_size = 512

# dff = d_model * 4
dense_dim = 2048

# num_heads
n_head = 8

# num_layers
n_layer = 1

# dropout rate
dropout = 0.1

## Build the model

In [None]:
from keras import layers
from keras import Model
import keras
import numpy as np
import tensorflow as tf

# Positional encoding
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(pos, d_model):

    def get_angles(position, i):
        return position / np.power(10000., 2. * (i // 2.) / np.float32(d_model))

    angle_rates = get_angles(np.arange(pos)[:, np.newaxis],
                             np.arange(d_model)[np.newaxis, :])
    pe_sin = np.sin(angle_rates[:, 0::2])
    pe_cos = np.cos(angle_rates[:, 1::2])
    pos_encoding = np.concatenate([pe_sin, pe_cos], axis=-1)
    pos_encoding = tf.cast(pos_encoding[np.newaxis, ...], tf.float32)
    return pos_encoding

# Masking
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, np.newaxis, np.newaxis, :]


# Look-ahead mask for decoder
def create_look_ahead_mask(size):

    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # shape=[seq_len, seq_len]

def create_mask(inputs, targets):
    encoder_padding_mask = create_padding_mask(inputs)
    decoder_padding_mask = create_padding_mask(inputs)
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])
    decoder_targets_padding_mask = create_padding_mask(targets)
    combined_mask = tf.maximum(decoder_targets_padding_mask, look_ahead_mask)
    return encoder_padding_mask, combined_mask, decoder_padding_mask

# Split tensor into (batch_size, n_head, seq_len, d_head)
def splite_tensor(tensor):
    shape = tf.shape(tensor)
    tensor = tf.reshape(
        tensor, shape=[shape[0], -1, n_head, embedding_size//n_head])
    tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
    return tensor

class MultiHeadAttentionLayer(layers.Layer):
    def __init__(self):
        super(MultiHeadAttentionLayer, self).__init__()

    # Define the layers needed for the computation, including the input shape
    def build(self, input_shape):
        # Input shape: [batch_size, seq_len, embedding_size] for query, key, value
        self.dense_query = layers.Dense(
            units=embedding_size, activation='relu')
        self.dense_key = layers.Dense(units=embedding_size, activation='relu')
        self.dense_value = layers.Dense(
            units=embedding_size, activation='relu')

        self.layer_norm = layers.LayerNormalization()
        super(MultiHeadAttentionLayer, self).build(input_shape)

    def call(self, inputs):
        query, key, value, mask = inputs
        shape = tf.shape(query)

        query_dense = self.dense_query(query)
        key_dense = self.dense_key(key)
        value_dense = self.dense_value(value)

        query_dense = splite_tensor(query_dense)
        key_dense = splite_tensor(key_dense)
        value_dense = splite_tensor(value_dense)

        attention = tf.matmul(query_dense, key_dense, transpose_b=True) / \
            tf.math.sqrt(tf.cast(embedding_size, tf.float32))
        attention += (mask*-1e9)
        attention = tf.nn.softmax(attention)
        attention = layers.Dropout(0.1)(attention)
        attention = tf.matmul(attention, value_dense)
        attention = tf.transpose(attention, [0, 2, 1, 3])
        attention = tf.reshape(attention, [shape[0], -1, embedding_size])

        attention = self.layer_norm((attention+query))
        return attention


# Base encoder layer
class EncoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(EncoderLayer, self).__init__()
        self.attn = MultiHeadAttentionLayer()
        self.drop_attn = layers.Dropout(dropout)
        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_attn = layers.LayerNormalization()
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs,training=None):

        encoder_inputs,mask = inputs
        att_out = self.attn([encoder_inputs, encoder_inputs,encoder_inputs,mask])
        att_out = self.drop_attn(att_out, training=training)
        att_out = self.layer_norm_attn(encoder_inputs+att_out)

        dense = self.dense1(att_out)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out+dense)

        return x

# Base encoder using multiple encoder layers
class Encoder(layers.Layer):
    def __init__(self, vocab, emb_dim, dense_dim, n_layers, n_head, dropout=0.1):
        super(Encoder, self).__init__()
        self.emb_dim = emb_dim

        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(MAX_LEN, emb_dim)

        # multi encoder layers
        self.encoder_layers = [EncoderLayer(
            emb_dim=emb_dim, n_head=n_head, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs,training=False):

        encoder_inputs,mask = inputs
        # shape=[batch_size, seq_len, d_model]
        seq_len = encoder_inputs.shape[1]
        # shape=[batch_size, seq_len, d_model]
        word_embedding = self.emb(encoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        emb = word_embedding + self.pos[:, :seq_len, :]

        # emb = self.emb(inputs)
        x = self.dropout(emb, training=training)
        # print('pos:',x)

        for encoder_layer in self.encoder_layers:
            x = encoder_layer([x,mask])
            #print('x:',x.shape)
        return x

    # def compute_mask(self, inputs, mask=None):
    #     return self.emb.compute_mask(inputs)

class DecoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(DecoderLayer, self).__init__()
        self.attn1 = MultiHeadAttentionLayer()
        self.layer_norm_attn1 = layers.LayerNormalization()
        self.drop_attn1 = layers.Dropout(dropout)

        self.attn2 = MultiHeadAttentionLayer()
        self.layer_norm_attn2 = layers.LayerNormalization()
        self.drop_attn2 = layers.Dropout(dropout)

        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs,training=None):
        # causal_mask = self.get_causal_attention_mask(inputs)
        # if mask is not None:
        #     padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        #     padding_mask = tf.minimum(padding_mask, causal_mask)
        decoder_inputs, encoder_outputs,mask1,mask2 = inputs
        att_out1 = self.attn1([decoder_inputs,decoder_inputs,decoder_inputs, mask1])
        att_out1 = self.drop_attn1(att_out1, training=training)
        att_out1 = self.layer_norm_attn1(decoder_inputs+att_out1)

        att_out2 = self.attn2([att_out1,encoder_outputs, encoder_outputs,mask2])
        att_out2 = self.drop_attn2(att_out2, training=training)
        att_out2 = self.layer_norm_attn2(att_out1+att_out2)

        dense = self.dense1(att_out2)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out2+dense)

        return x


class Decoder(layers.Layer):

    def __init__(self, vocab, n_head, n_layers, emb_dim, dense_dim, dropout=0.1):
        super(Decoder, self).__init__()
        self.emb_dim = emb_dim
        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(MAX_LEN, emb_dim)

        self.decoder_layers = [DecoderLayer(
            n_head=n_head, emb_dim=emb_dim, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs,training=False):

        decoder_inputs, encoder_outputs,mask1,mask2 = inputs

        seq_len = decoder_inputs.shape[1]

        word_embedding = self.emb(decoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        emb = word_embedding + self.pos[:, :seq_len, :]

        x = self.dropout(emb, training=training)

        for decoder_layer in self.decoder_layers:
            x = decoder_layer([x, encoder_outputs,mask1,mask2])

        return x

    # def compute_mask(self, inputs, mask=None):
    #     return self.emb.compute_mask(inputs)
class Transformer(layers.Layer):
    def __init__(self):
        super(Transformer,self).__init__()

        self.encoder = Encoder(vocab=INPUT_MAX_TOKENS, n_head=n_head, n_layers=n_layer,
                      emb_dim=embedding_size, dense_dim=dense_dim, dropout=dropout)

        self.decoder = Decoder(vocab=OUTPUT_MAX_TOKENS, n_head=n_head, n_layers=n_layer,
                      emb_dim=embedding_size, dense_dim=dense_dim, dropout=dropout)

        self.dense = layers.Dense(OUTPUT_MAX_TOKENS, activation='softmax')

    def call(self,encoder_inputs,decoder_inputs):
        encoder_padding_mask, look_ahead_mask, decoder_padding_mask = create_mask(
            encoder_inputs, decoder_inputs)

        encoder_outputs = self.encoder([encoder_inputs,encoder_padding_mask])
        x = self.decoder([decoder_inputs, encoder_outputs,look_ahead_mask,decoder_padding_mask])
        x = layers.Dropout(0.1)(x)
        decoder_outputs = self.dense(x)
        return decoder_outputs


## Train the model

In [None]:
encoder_inputs = layers.Input((None,))
decoder_inputs = layers.Input((None,))

transformer = Transformer()
decoder_outputs=transformer(encoder_inputs,decoder_inputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()
model.compile(optimizer='rmsprop', metrics=['acc'],
              loss="sparse_categorical_crossentropy")
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
          validation_split=0.1, epochs=50, batch_size=batch_size)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 transformer (Transformer)      (None, 30, 10000)    23477520    ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
Total params: 23,477,520
Trainable params: 23,477,520
Non-trainable params: 0
________________

<keras.callbacks.History at 0x28ea60dded0>

## Save the model

In [None]:
model.save('shakespearish')



INFO:tensorflow:Assets written to: shakespearish\assets


INFO:tensorflow:Assets written to: shakespearish\assets


In [None]:
model = keras.models.load_model('shakespearish')

## Test the model

In [None]:
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_tokenizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(MAX_LEN):
        tokenized_target_sentence = target_tokenizer([decoded_sentence])[:, :-1]
        predictions = model(
            [tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = tar_idx_to_word[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence

In [None]:
sentences = source_test[-7:]
for sentence in sentences:
    print("="*50)
    print("source sentence:", sentence)
    print("target sentence:", target_test[source_test.index(sentence)])
    print("translated sentence:", decode_sequence(sentence))


source sentence: Thanks , you the valiant of this warlike isle That so approve the Moor .
target sentence: [start] Thanks , you brave men who defend this island and respect Othello . [end]
translated sentence: [start] thanks for the brave moor [end]
source sentence: What , so brief ?
target sentence: [start] That's it ? [end]
translated sentence: [start] so what is it [end]
source sentence: So long ?
target sentence: [start] As long as that ? [end]
translated sentence: [start] so long that long a long time [end]
source sentence: I shall attend you presently at your tent .
target sentence: [start] I'll meet you at your tent . [end]
translated sentence: [start] ill wait for you at your sleep [end]
source sentence: Good day and happiness , dear Rosalind .
target sentence: [start] Good day and happiness to you , darling Rosalind . [end]
translated sentence: [start] good day and happiness will rosalind [end]
source sentence: Ha ?
target sentence: [start] What ? [end]
translated sentence: [s