# Setup

In [64]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem at /content/gdrive/.
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [65]:
%cd /content/gdrive/MyDrive/ML/ML-9

/content/gdrive/MyDrive/ML/ML-9


# Assignment 9, Qilin Zhou, 2024-03-08

## Question 1: Train an Encoder-Decoder model that converts a date string from one format (e.g. "April 22, 2019") to another format (e.g. "2019-04-22")

In [75]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed
from datetime import datetime, timedelta
import random

# Fix the seed of each random source used in this notebook:
# TensorFlow, Python's `random` module, and NumPy.
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

### Generate the dataset

In [19]:
def generate_random_date(start_date, end_date):
    """Draw one random date and render it in both text formats.

    A whole-day offset is drawn with ``random.randrange`` from
    ``0`` up to (but excluding) the number of days between the two bounds and
    added to ``start_date``.

    Args:
        start_date: ``datetime`` marking the earliest possible result.
        end_date: ``datetime`` marking the exclusive upper bound.

    Returns:
        A ``(long_form, iso_form)`` tuple of strings, e.g.
        ``("April 22, 2019", "2019-04-22")``. The day in the long form is
        zero-padded (``%d``).

    Raises:
        ValueError: from ``random.randrange`` when the bounds are less than
            one whole day apart.
    """
    span_in_days = (end_date - start_date).days
    picked = start_date + timedelta(days=random.randrange(span_in_days))
    long_form = picked.strftime("%B %d, %Y")
    iso_form = picked.strftime("%Y-%m-%d")
    return long_form, iso_form


def create_date_dataset(
    number_of_samples=20000,
    start_date=datetime(1900, 1, 1),
    end_date=datetime(2024, 3, 8),
):
    """Build a list of random date pairs for the format-conversion task.

    Args:
        number_of_samples: How many pairs to generate.
        start_date: Lower bound handed to ``generate_random_date``. Defaults
            to 1900-01-01, the value that used to be hard-coded here.
        end_date: Upper bound handed to ``generate_random_date``. Defaults to
            2024-03-08, the value that used to be hard-coded here.

    Returns:
        A list with ``number_of_samples`` entries, each the tuple returned by
        ``generate_random_date(start_date, end_date)``.
    """
    return [
        generate_random_date(start_date, end_date) for _ in range(number_of_samples)
    ]

In [20]:
# Generate the dataset with the function's default sample count.
date_dataset = create_date_dataset()

In [25]:
# Unzip the pairs into two parallel tuples.
# NOTE(review): x and y are not referenced by any later cell visible here; the
# preprocessing cell unzips date_dataset again under different names.
x, y = zip(*date_dataset)

### Create a basic Encoder–Decoder model with preprocessed inputs

In [97]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [115]:
# Separate each (source, target) pair into two parallel tuples.
input_texts, target_texts = zip(*date_dataset)

# Frame every target with a tab (start marker) and a newline (end marker).
target_texts = [f"\t{text}\n" for text in target_texts]

# One character-level vocabulary shared by sources and targets. The +1 accounts
# for id 0, which the tokenizer never assigns and which is used as padding.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([*input_texts, *target_texts])
vocab_size = 1 + len(tokenizer.word_index)

input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# The longest encoded sequence on each side fixes the padded width.
max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)

encoder_input_data = pad_sequences(
    input_sequences, padding="post", maxlen=max_encoder_seq_length
)
# Teacher forcing: the decoder is fed the target without its last token ...
decoder_input_data = pad_sequences(
    [tokens[:-1] for tokens in target_sequences],
    padding="post",
    maxlen=max_decoder_seq_length,
)
# ... and is trained to produce the target without its first token.
decoder_target_data = pad_sequences(
    [tokens[1:] for tokens in target_sequences],
    padding="post",
    maxlen=max_decoder_seq_length,
)

In [116]:
embedding_size = 32
encoder_seq_length = max_encoder_seq_length
decoder_seq_length = max_decoder_seq_length

# encoder
encoder_inputs = Input(
    shape=(encoder_seq_length,), dtype="int32", name="encoder_inputs"
)
# mask_zero=True: inputs are post-padded with id 0. Without a mask the LSTM
# keeps updating its state on the padding, so the state handed to the decoder
# would depend on how many pad steps follow the date. With the mask, the
# returned state is the one reached at the last real character.
encoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    mask_zero=True,
    name="encoder_embedding",
)(encoder_inputs)
# Only the final hidden/cell states are used; they seed the decoder below.
encoder_outputs, state_h, state_c = LSTM(128, return_state=True, name="encoder_lstm")(
    encoder_embedding
)
encoder_states = [state_h, state_c]

# decoder
decoder_inputs = Input(
    shape=(decoder_seq_length,), dtype="int32", name="decoder_inputs"
)
# mask_zero=True here as well: the mask propagates to the output, so padded
# decoder steps are left out of the loss and of the reported accuracy instead
# of counting as trivially correct predictions.
decoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    mask_zero=True,
    name="decoder_embedding",
)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-time-step softmax over the character vocabulary.
decoder_dense = TimeDistributed(
    Dense(vocab_size, activation="softmax", name="decoder_dense")
)
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(
    optimizer="nadam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [117]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, 18)]                 0         []                            
 )                                                                                                
                                                                                                  
 decoder_inputs (InputLayer  [(None, 12)]                 0         []                            
 )                                                                                                
                                                                                                  
 encoder_embedding (Embeddi  (None, 18, 32)               1184      ['encoder_inputs[0][0]']      
 ng)                                                                                       

### Train the model

In [118]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. All three arrays go through a
# single train_test_split call so each encoder input stays aligned with its
# decoder input and decoder target.
(
    encoder_input_train,
    encoder_input_val,
    decoder_input_train,
    decoder_input_val,
    decoder_target_train,
    decoder_target_val,
) = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    test_size=0.2,
    random_state=42,
)

In [119]:
# Train on the training split and monitor the held-out split via validation_data.
# The integer targets are given an extra trailing axis of size 1 before being
# passed to the sparse categorical cross-entropy loss.
model.fit(
    [encoder_input_train, decoder_input_train],
    np.expand_dims(decoder_target_train, -1),
    batch_size=64,
    epochs=10,
    validation_data=(
        [encoder_input_val, decoder_input_val],
        np.expand_dims(decoder_target_val, -1),
    ),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cdc6ee301f0>

In [120]:
# Forward (character -> id) and reverse (id -> character) lookup tables taken
# from the fitted tokenizer.
char_to_token = tokenizer.word_index
token_to_char = {token_id: char for char, token_id in char_to_token.items()}


def prepare_input(input_date_str, tokenizer, max_encoder_seq_length):
    """Encode one date string as a padded row of token ids for the encoder.

    Args:
        input_date_str: Date text such as ``"April 19, 2019"``.
        tokenizer: Fitted character-level tokenizer exposing ``word_index``
            (and, for Keras tokenizers, the ``lower`` flag).
        max_encoder_seq_length: Width the row is post-padded to.

    Returns:
        The array produced by ``pad_sequences`` for a one-element batch,
        i.e. a single row of ``max_encoder_seq_length`` ids.
    """
    # A Keras Tokenizer lowercases text while fitting (lower=True by default),
    # so its vocabulary holds no upper-case letters. Apply the same
    # normalisation here; otherwise the capital initial of every month name
    # misses the vocabulary and is encoded as 0, unlike during training.
    if getattr(tokenizer, "lower", False):
        input_date_str = input_date_str.lower()

    # Characters that are still unknown fall back to id 0.
    input_token_ids = [tokenizer.word_index.get(char, 0) for char in input_date_str]

    return pad_sequences(
        [input_token_ids], maxlen=max_encoder_seq_length, padding="post"
    )

In [123]:
def predict_date(
    model, input_date_str, tokenizer, max_encoder_seq_length, max_decoder_seq_length
):
    """Greedily decode the converted date for one input string.

    The decoder input starts with only the start marker (tab). On every step
    the whole model is re-run, the most likely character at the current
    position is taken, and it is written into the next decoder input slot.
    Decoding stops at the end marker (newline) or when the decoder width is
    exhausted.

    Args:
        model: Trained encoder-decoder model taking
            ``[encoder_input, decoder_input]``.
        input_date_str: Source date text.
        tokenizer: Fitted character-level tokenizer exposing ``word_index``.
        max_encoder_seq_length: Padded width of the encoder input.
        max_decoder_seq_length: Padded width of the decoder input.

    Returns:
        The decoded string, without start or end markers.
    """
    input_seq = prepare_input(input_date_str, tokenizer, max_encoder_seq_length)

    # Build the id -> character table from the tokenizer that was passed in,
    # rather than reading a module-level dict that may belong to another
    # tokenizer object.
    id_to_char = {token_id: char for char, token_id in tokenizer.word_index.items()}

    sos_token_id = tokenizer.word_index["\t"]
    decoder_input_seq = np.zeros((1, max_decoder_seq_length))
    decoder_input_seq[0, 0] = sos_token_id

    predicted_sequence = []

    for i in range(1, max_decoder_seq_length):
        # verbose=0: predict is called once per character and would otherwise
        # print a progress bar on every call.
        output_tokens = model.predict([input_seq, decoder_input_seq], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, i - 1, :]))
        sampled_char = id_to_char.get(sampled_token_index)

        # Stop on the end marker, or on an id with no character (0 is padding);
        # appending None would make "".join raise TypeError.
        if sampled_char is None or sampled_char == "\n":
            break
        predicted_sequence.append(sampled_char)

        decoder_input_seq[0, i] = sampled_token_index

    return "".join(predicted_sequence)

In [129]:
# Run the trained model on one hand-written example and show input and output.
input_date_str = "April 19, 2019"
predicted_date = predict_date(
    model, input_date_str, tokenizer, max_encoder_seq_length, max_decoder_seq_length
)
print(f"Input date: {input_date_str}")
print(f"Predicted date: {predicted_date}")

Input date: April 19, 2019
Predicted date: 2019-04-19


## Question 2: Use BERT or GPT-2 language models to generate more convincing Shakespearean text than what we did in the lecture.

### Prepare Dataset

In [164]:
from tensorflow import keras
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

In [159]:
# Download the Tiny Shakespeare corpus via Keras' file helper and read it into
# memory as one string.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Explicit encoding: without it open() falls back to the platform's locale
# encoding, so the same file could decode differently on another machine.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

### Adjusting the sampling settings of a pretrained GPT-2 model to generate Shakespearean text

In [167]:
# Load the pretrained GPT-2 tokenizer and language-model head by name.
# NOTE(review): this rebinds `tokenizer` and `model`, the names the
# date-conversion cells use for their own tokenizer and network; re-running
# those cells after this one will pick up the GPT-2 objects instead.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Use the end-of-sequence token as the padding token, on both the tokenizer
# and the model configuration.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


def generate_text(
    prompt_text,
    max_length=100,
    temperature=0.9,
    num_return_sequences=1,
    top_k=30,
    top_p=0.92,
    repetition_penalty=1.5,
):
    """Sample continuations of a prompt from the module-level GPT-2 model.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt_text: Text the model should continue.
        max_length: Passed both to the tokenizer (the prompt is truncated to
            this many tokens) and to ``model.generate`` as its length limit.
        temperature: Sampling temperature passed to ``model.generate``.
        num_return_sequences: Number of independent samples to return.
        top_k: Top-k sampling cutoff passed to ``model.generate``. Defaults to
            30, the value previously hard-coded here.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
            Defaults to 0.92, the value previously hard-coded here.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
            Defaults to 1.5, the value previously hard-coded here.

    Returns:
        A list of ``num_return_sequences`` decoded strings with special
        tokens removed.
    """
    # Encode the prompt text with attention mask
    encoded_input = tokenizer(
        prompt_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    input_ids = encoded_input["input_ids"]
    attention_mask = encoded_input["attention_mask"]

    # do_sample=True switches from greedy decoding to sampling, which is what
    # makes temperature / top_k / top_p take effect.
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    generated_texts = [
        tokenizer.decode(output_sequence, skip_special_tokens=True)
        for output_sequence in output_sequences
    ]

    return generated_texts


# Example usage
# Sample six continuations of a single prompt and print them, separated by a
# "---" divider.
input_text = "To be or not to be, that is the question:"
completed_texts = generate_text(
    input_text, max_length=100, temperature=0.8, num_return_sequences=6
)

for i, text in enumerate(completed_texts, 1):
    print(f"Completed text {i}:", text)
    print("\n---\n")

Completed text 1: To be or not to be, that is the question: Is it worth spending your hard-earned money on a brand new car?
The answer for us here at DHL has been yes. We're still testing our own version of this concept in Europe and China so you can't expect much else from them – but we'll continue checking out what's going into creating an authentic model based upon these latest prototypes!

---

Completed text 2: To be or not to be, that is the question: are you in love with your spouse? If so (or if I am and want nothing), will it change my life?"
The answer seems pretty obvious. However a great many people who have been married for 20 years think this does seem quite strange at first glance… but how can they possibly know when their loved one has changed his/her mind from being happy on top of everything else about them?! They would probably find out because we

---

Completed text 3: To be or not to be, that is the question: Is it true? It depends upon how you measure.
The "fact"