## Get dataset from Kaggle

In [None]:
import getpass
import json
import os
from pathlib import Path

# SECURITY: never hard-code the Kaggle API key in the notebook. Any key that was
# ever committed in a notebook must be treated as leaked and regenerated.
# Credentials come from the environment, falling back to an interactive prompt.
kaggle_username = os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ")
kaggle_key = os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")

kaggle_config = Path.home() / ".kaggle" / "kaggle.json"
kaggle_config.parent.mkdir(parents=True, exist_ok=True)

# Restrict permissions before the secret is written, not after.
kaggle_config.touch(mode=0o600, exist_ok=True)
kaggle_config.chmod(0o600)
kaggle_config.write_text(json.dumps({"username": kaggle_username, "key": kaggle_key}))

In [None]:
# Download the transcript dataset archive with the Kaggle CLI (uses ~/.kaggle/kaggle.json).
!kaggle datasets download -d nasirkhalid24/the-office-us-complete-dialoguetranscript

Downloading the-office-us-complete-dialoguetranscript.zip to /content
 73% 1.00M/1.37M [00:00<00:00, 1.83MB/s]
100% 1.37M/1.37M [00:00<00:00, 2.37MB/s]


In [None]:
# Confirm that the downloaded archive is in the working directory.
!ls

sample_data  the-office-us-complete-dialoguetranscript.zip


In [None]:
!unzip the-office-us-complete-dialoguetranscript.zip
!ls

Archive:  the-office-us-complete-dialoguetranscript.zip
  inflating: The-Office-Lines-V4.csv  
sample_data		 the-office-us-complete-dialoguetranscript.zip
The-Office-Lines-V4.csv


In [None]:
# Give the extracted CSV a short name; later cells read "data.csv".
!mv The-Office-Lines-V4.csv data.csv

# EDA and Pre-processing

In [None]:
import pandas as pd
# Load the raw transcript and preview the first rows.
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,season,episode,title,scene,speaker,line,Unnamed: 6
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,
3,1,1,Pilot,1,Jim,"Actually, you called me in here, but yeah.",
4,1,1,Pilot,1,Michael,"All right. Well, let me show you how it's done.",


### Drop unnecessary columns

In [None]:
# Columns that are not needed for building the script text.
unused_columns = ["season", "episode", "Unnamed: 6"]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,title,scene,speaker,line
0,Pilot,1,Michael,All right Jim. Your quarterlies look very good...
1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So..."
2,Pilot,1,Michael,So you've come to the master for guidance? Is ...
3,Pilot,1,Jim,"Actually, you called me in here, but yeah."
4,Pilot,1,Michael,"All right. Well, let me show you how it's done."


## Sanitize speaker names

### Collapse "Michael: " into "Michael" and similar examples

Note: We decided against doing this because the lines attached to the colon-suffixed
speaker names are of poor quality (their beginnings are cut off), as can be observed below. This is likely because the dataset
is compiled from a variety of sources.

There is no need to drop these rows explicitly: we keep only the 40 characters with the most lines, and these speaker names do not make that cut anyway.


In [None]:
# Show the rows whose speaker label is the malformed "Michael: " variant.
df[df["speaker"] == "Michael: "]

Unnamed: 0,title,scene,speaker,line
31793,Happy Hour,4846,Michael:,w many is that?
31795,Happy Hour,4846,Michael:,unt the last one.
31797,Happy Hour,4846,Michael:,", new record!"
31799,Happy Hour,4846,Michael:,", what did you do today?"
31801,Happy Hour,4846,Michael:,", yeah, sitting on your big fat butt. Alright,..."
...,...,...,...,...
32088,Happy Hour,4888,Michael:,is I.
32090,Happy Hour,4888,Michael:,", hey guys."
32102,Happy Hour,4890,Michael:,"y, Julie! You having fun?"
32145,Happy Hour,4896,Michael:,"lperts, wait up. Oh, what a great night. Got t..."


### Correct typos: Deangelo > DeAngelo

In [None]:
# Misspelled speaker name -> corrected spelling.
typos = {"Deangelo": "DeAngelo"}

# regex=True makes the keys patterns, so the misspelling is also fixed when it
# is only part of a speaker label.
df["speaker"] = df["speaker"].replace(to_replace=typos, regex=True)

### Drop lines from characters that don't fall in the top 40 in terms of number of lines

In [None]:
# Number of most-frequent speakers to keep.
TOP_COUNT = 40

# Count rows per speaker, then keep the labels of the first TOP_COUNT entries.
speaker_line_counts = df.value_counts("speaker")
top_speakers = speaker_line_counts.head(TOP_COUNT).keys()

In [None]:
# Keep only rows spoken by a top speaker and renumber the index from 0.
is_top_speaker = df["speaker"].isin(top_speakers)
df = df.loc[is_top_speaker].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50300 entries, 0 to 50299
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    50300 non-null  object
 1   scene    50300 non-null  int64 
 2   speaker  50300 non-null  object
 3   line     50300 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


### Save new CSV to disk

In [None]:
# pandas asks for file objects handed to to_csv to be opened with newline=""
# so that universal-newline translation does not alter the row terminators.
with open("processed_data.csv", "w", newline="") as out_fd:
    df.to_csv(out_fd, index=False)

In [None]:
# List the working directory with human-readable sizes to check the new CSV.
!ls -lh

total 10M
-rw-r--r-- 1 root root 4.6M Jan 18  2021 data.csv
-rw-r--r-- 1 root root 4.0M Apr 15 22:27 processed_data.csv
drwxr-xr-x 1 root root 4.0K Apr 13 13:30 sample_data
-rw-r--r-- 1 root root 1.4M Apr 15 22:27 the-office-us-complete-dialoguetranscript.zip


In [None]:
# Preview the first 10 lines of the processed CSV.
!head -10 processed_data.csv

title,scene,speaker,line
Pilot,1,Michael,All right Jim. Your quarterlies look very good. How are things at the library?
Pilot,1,Jim,"Oh, I told you. I couldn't close it. So..."
Pilot,1,Michael,"So you've come to the master for guidance? Is this what you're saying, grasshopper?"
Pilot,1,Jim,"Actually, you called me in here, but yeah."
Pilot,1,Michael,"All right. Well, let me show you how it's done."
Pilot,2,Michael," Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger.  All right. Done deal. Thank you very much, sir. You're a gentleman and a scholar. Oh, I'm sorry. OK. I'm sorry. My mistake.  That was a woman I was talking to, so... She had a very low voice. Probably a smoker, so...  So that's the way it's done."
Pilot,3,Michael,"I've, uh, I've been at Dunder Mifflin for 12 years, the last four as Regional Manager. If you want to come through here

# Convert .csv to a textual script for tokenization

### Meta tokens for the script text

In [None]:
# Structural marker tokens written into script.txt around scenes, speaker
# names, lines and sentences. Downstream cells compare against their
# lower-cased forms because all tokens are lower-cased before training.
SCENE_START = "<scene_start>"
SCENE_END = "<scene_end>"

SPEAKER_START = "<speaker_start>"
SPEAKER_END = "<speaker_end>"

LINE_START = "<line_start>"
LINE_END = "<line_end>"

SENT_START = "<sent_start>"
SENT_END = "<sent_end>"

# Stand-in token substituted for "\n" in the raw text before tokenization.
NEWLINE = "<NEWLINE>"

In [None]:
import csv
import nltk
import string
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Used to remove punctuation from strings
translator = str.maketrans('', '', string.punctuation)

# Convert processed_data.csv into one long string of meta tokens and words.
# The csv module requires its input file to be opened with newline="" so that
# line breaks embedded in quoted fields are parsed correctly.
with open("script.txt", "w") as out_fd, open("processed_data.csv", newline="") as in_fd:
    out_fd.write(SCENE_START + " ")

    csv_reader = csv.DictReader(in_fd)

    # Scene id of the previous row. It is taken from the first row instead of
    # being assumed to be 1, so a file starting at a later scene does not
    # begin with an empty "<scene_start> <scene_end>" pair.
    scene = None
    for row in csv_reader:
        row_scene = int(row["scene"])
        if scene is None:
            scene = row_scene
        elif row_scene > scene:
            scene = row_scene
            out_fd.write(SCENE_END + " " + SCENE_START + " ")

        out_fd.write(f"{SPEAKER_START} {row['speaker']} {SPEAKER_END} {LINE_START} ")

        # A line may have multiple sentences; each one is wrapped in
        # sentence markers after its ASCII punctuation is stripped.
        for sentence in sent_tokenize(row['line']):
            sentence = sentence.translate(translator)
            out_fd.write(f"{SENT_START} {sentence} {SENT_END} ")

        out_fd.write(LINE_END + " ")

    out_fd.write(SCENE_END + " ")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Checking the first 500 bytes of the file (head -c counts bytes)
!head -c500 script.txt

<scene_start> <speaker_start> Michael <speaker_end> <line_start> <sent_start> All right Jim <sent_end> <sent_start> Your quarterlies look very good <sent_end> <sent_start> How are things at the library <sent_end> <line_end> <speaker_start> Jim <speaker_end> <line_start> <sent_start> Oh I told you <sent_end> <sent_start> I couldnt close it <sent_end> <sent_start> So <sent_end> <line_end> <speaker_start> Michael <speaker_end> <line_start> <sent_start> So youve come to the master for guidance <sent

In [None]:
import csv
import nltk
import string
from nltk.tokenize import sent_tokenize

# Write a plain "Speaker: line" transcript, one CSV row per output line.
# No sentence splitting, punctuation stripping or scene tracking is needed
# for this format, so none is performed.
# The csv module requires its input file to be opened with newline="".
with open("script_simple.txt", "w") as out_fd, open("processed_data.csv", newline="") as in_fd:
    for row in csv.DictReader(in_fd):
        out_fd.write(f"{row['speaker']}: {row['line']}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Checking the first 500 bytes of the file (head -c counts bytes)
!head -c500 script_simple.txt

Michael: All right Jim. Your quarterlies look very good. How are things at the library?
Jim: Oh, I told you. I couldn't close it. So...
Michael: So you've come to the master for guidance? Is this what you're saying, grasshopper?
Jim: Actually, you called me in here, but yeah.
Michael: All right. Well, let me show you how it's done.
Michael:  Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just w

# Train word embedding model

In [None]:
from gensim.models import Word2Vec

In [None]:
# Load the training script; the context manager closes the file handle
# as soon as the text has been read.
filename = "script.txt"
with open(filename, "r") as script_fd:
    raw_text = script_fd.read()

### Separate punctuation from words

In [None]:
PUNCTUATIONS = set(['.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&'])

# Surround every listed punctuation character with spaces in a single pass so
# that a later whitespace split yields it as its own token.
raw_text = raw_text.translate({ord(punct): f" {punct} " for punct in PUNCTUATIONS})

# Keras' tokenizer gets rid of \n
raw_text = raw_text.replace("\n", f" {NEWLINE} ")

In [None]:
# Peek at the first 120 characters of the prepared text.
raw_text[:120]

'<scene_start> <speaker_start> Michael <speaker_end> <line_start> <sent_start> All right Jim <sent_end> <sent_start> Your'

In [None]:
# Split the text on whitespace and lower-case every resulting token
tokens = list(map(str.lower, raw_text.split()))

In [None]:
# Peek at the first 20 tokens.
tokens[:20]

['<scene_start>',
 '<speaker_start>',
 'michael',
 '<speaker_end>',
 '<line_start>',
 '<sent_start>',
 'all',
 'right',
 'jim',
 '<sent_end>',
 '<sent_start>',
 'your',
 'quarterlies',
 'look',
 'very',
 'good',
 '<sent_end>',
 '<sent_start>',
 'how',
 'are']

In [None]:
EMBED_SIZE = 512
EMBED_WINDOW = 5

# gensim's optimised training routines truncate every individual "sentence" to
# 10,000 words. Passing the whole corpus as one sentence would therefore train
# on its first 10,000 tokens only, so the token stream is cut into chunks that
# stay within that limit.
W2V_MAX_SENTENCE_LEN = 10_000
embed_sentences = [
    tokens[start:start + W2V_MAX_SENTENCE_LEN]
    for start in range(0, len(tokens), W2V_MAX_SENTENCE_LEN)
]

# min_count=1 keeps every token in the vocabulary, including rare ones.
embed_model = Word2Vec(
    sentences=embed_sentences,
    window=EMBED_WINDOW,
    vector_size=EMBED_SIZE,
    min_count=1
)

In [None]:
# Look up the learned embedding vector for the token "jim".
embed_model.wv["jim"]

array([-2.91421311e-03, -2.36648843e-02, -1.36602083e-02,  6.55734614e-02,
        3.42263491e-04,  2.95747295e-02,  6.12691529e-02, -3.43051031e-02,
       -3.35325263e-02,  9.35531631e-02, -8.73894431e-03, -2.17769239e-02,
        5.09277172e-02, -1.05928339e-01, -3.92229296e-02, -5.52094821e-03,
       -6.88646501e-03,  1.56655997e-01, -1.03827134e-01,  2.57804841e-02,
        6.40802681e-02, -5.34819812e-02,  8.47180486e-02, -8.91114399e-02,
       -7.82372355e-02,  5.64162247e-02,  2.36978903e-02,  7.66482279e-02,
        4.01230268e-02, -7.46122375e-02,  2.78319567e-02,  3.27802673e-02,
        5.94246574e-02, -2.17035646e-03, -5.47019616e-02,  3.95347811e-02,
        6.42746873e-03, -4.39346656e-02, -5.55619262e-02, -6.38927892e-02,
       -6.26534522e-02,  1.11770101e-01, -8.83168876e-02, -5.32935746e-02,
        1.62211657e-01, -1.23856543e-02, -2.71085817e-02, -3.01735196e-02,
        1.74484357e-01, -6.76258579e-02, -3.96151543e-02, -1.97892748e-02,
        3.54288481e-02,  

### Tokenize and encode the text

In [None]:
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, Input
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [None]:
# Fit the vocabulary on the already-split token list, passed as a single
# document, and map that same document to its sequence of integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([tokens])
(encoded,) = tokenizer.texts_to_sequences([tokens])

In [None]:
# Peek at the first 10 encoded token ids.
encoded[:10]

[14, 3, 11, 4, 5, 1, 49, 54, 19, 2]

In [None]:
# One more than the number of known words -- presumably because id 0 is
# reserved for padding (the model below is built with mask_zero=True).
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

20222


### Generate input and output sequences

In [None]:
INPUT_SEQ_LEN = 50
STEP_SIZE = 5

# Start offset of every training window. The upper bound leaves room for the
# one extra token that the shifted target window needs.
window_starts = range(0, len(encoded) - INPUT_SEQ_LEN, STEP_SIZE)

# X[i] is a window of INPUT_SEQ_LEN token ids. Y[i] is the same window shifted
# one token to the right, i.e. the next token for every input position.
# The model emits logits for each of the INPUT_SEQ_LEN positions, so the
# targets need shape (n_windows, INPUT_SEQ_LEN), not one token per window.
X = np.array([encoded[start : start + INPUT_SEQ_LEN] for start in window_starts])
Y = np.array([encoded[start + 1 : start + INPUT_SEQ_LEN + 1] for start in window_starts])

### Build embeddings matrix

In [None]:
# Row i holds the Word2Vec vector of the word whose tokenizer id is i.
# Rows that no word maps to stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
for token, row in tokenizer.word_index.items():
    embedding_matrix[row] = embed_model.wv[token]

# Transformer Model

In [None]:
!pip install keras_nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import keras_nlp
import tensorflow as tf

The following is adapted from [GPT text generation from scratch with KerasNLP](https://keras.io/examples/generative/text_generation_gpt/).

In [None]:
TRANSFORMER_NUM_LAYERS = 2
TRANSFORMER_NUM_HEADS = 3
TRANSFORMER_FEED_FORWARD_DIM = 256

# Integer token ids, one window of INPUT_SEQ_LEN positions per sample.
inputs = Input(shape=(INPUT_SEQ_LEN,), dtype=tf.int32)

# Token + position embedding; id 0 is treated as padding (mask_zero=True).
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=INPUT_SEQ_LEN,
    embedding_dim=EMBED_SIZE,
    mask_zero=True,
)(inputs)

# Stack of decoder blocks applied one after another.
for _ in range(TRANSFORMER_NUM_LAYERS):
    x = keras_nlp.layers.TransformerDecoder(
        num_heads=TRANSFORMER_NUM_HEADS,
        intermediate_dim=TRANSFORMER_FEED_FORWARD_DIM,
    )(x)

# Final projection to vocabulary size. There is no softmax: the outputs are
# raw logits, which is why the loss and metric use from_logits=True.
outputs = Dense(vocab_size)(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)],
)

Query tensor shape: (None, 50, 512)
Query tensor shape: (None, 50, 512)


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_35 (InputLayer)       [(None, 50)]              0         
                                                                 
 token_and_position_embeddin  (None, 50, 512)          10379264  
 g_32 (TokenAndPositionEmbed                                     
 ding)                                                           
                                                                 
 transformer_decoder_56 (Tra  (None, 50, 512)          1311482   
 nsformerDecoder)                                                
                                                                 
 transformer_decoder_57 (Tra  (None, 50, 512)          1311482   
 nsformerDecoder)                                                
                                                                 
 dense_28 (Dense)            (None, 50, 20222)         103

In [None]:
def data_generator(X, Y, batch_size, shuffle):
    """Yield (X_batch, Y_batch) pairs forever, one pass over the data after another.

    Batches contain the integer token ids unchanged: the model's input layer
    takes int32 ids and it is trained with a sparse categorical loss, so
    neither inputs nor targets are one-hot encoded.

    Args:
        X: array of input samples, indexed along the first axis.
        Y: array of targets, with the same first-axis length as X.
        batch_size: maximum number of samples per batch; the last batch of a
            pass may be smaller.
        shuffle: if true, draw a fresh random sample order for every pass.

    Yields:
        Tuples (X_batch, Y_batch) with matching first-axis length.

    Raises:
        ValueError: if X is empty or X and Y differ in length (raised when
            the first batch is requested).
    """
    n_samples = len(X)
    if n_samples == 0:
        # Without this guard the outer loop would spin forever without yielding.
        raise ValueError("data_generator needs at least one sample")
    if len(Y) != n_samples:
        raise ValueError("X and Y must contain the same number of samples")

    while True:
        # One ordering per pass; X and Y are indexed with the same indices so
        # each input stays paired with its target.
        order = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)

        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            yield X[batch_idx], Y[batch_idx]

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 128
EPOCHS = 1
# Number of full batches contained in X (floor division).
STEPS_PER_EPOCH = len(X) // BATCH_SIZE

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the weights whenever the training loss reaches a new minimum.
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

LEARNING_RATE = 0.01
# Set the rate through `learning_rate`, the optimizer's canonical attribute;
# `lr` is only a deprecated alias and is not available in every Keras version.
model.optimizer.learning_rate = LEARNING_RATE

# Multiply the learning rate by 0.8 after one epoch without improvement in
# the training loss, never going below 0.001.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.8,
                              patience=1, min_lr=0.001)

callbacks = [checkpoint, reduce_lr]

In [None]:
# Train from the batch generator (sample order is kept, shuffle=False),
# with checkpointing and learning-rate reduction as callbacks.
train_generator = data_generator(X, Y, BATCH_SIZE, shuffle=False)
model.fit(train_generator, steps_per_epoch=STEPS_PER_EPOCH, epochs=EPOCHS, callbacks=callbacks)

ValueError: ignored

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _sample_token_id(logits, temperature):
    """Sample one vocabulary id from unnormalised logits using temperature scaling."""
    scaled = np.asarray(logits, dtype=np.float64) / temperature
    # Id 0 is the padding id and has no entry in tokenizer.index_word,
    # so it must never be sampled.
    scaled[0] = -np.inf
    # Subtract the maximum before exponentiating for numerical stability.
    scaled = scaled - np.max(scaled)
    probabilities = np.exp(scaled)
    probabilities /= probabilities.sum()
    return int(np.random.choice(len(probabilities), p=probabilities))


def generate(seed_speaker, n_scenes, temperature=1.0, max_words=500):
    """Generate script tokens with the trained model, starting from a speaker's line.

    The prompt opens a scene and a line for `seed_speaker`; tokens are then
    sampled one at a time until `n_scenes` scene-end tokens have been produced
    or `max_words` tokens have been generated, whichever happens first.

    Args:
        seed_speaker: speaker whose line opens the scene; must be one of
            `top_speakers`.
        n_scenes: number of scene-end tokens after which generation stops.
        temperature: divisor applied to the logits before sampling; must be
            positive. Lower values make sampling greedier.
        max_words: upper bound on the number of generated tokens.

    Returns:
        List of lower-cased tokens: the prompt followed by the generated tokens.

    Raises:
        AssertionError: if `seed_speaker` is not a top speaker.
        ValueError: if `temperature` is not positive.
    """
    assert seed_speaker in set(top_speakers), "Seed speaker is not a top speaker"
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    seed_text = f"{SCENE_START} {SPEAKER_START} {seed_speaker} {SPEAKER_END} {LINE_START} {SENT_START}".lower()
    generated_text = seed_text.split()

    # Encode the prompt once and append the sampled ids afterwards, instead of
    # re-tokenizing the whole growing text on every step.
    encoded = list(tokenizer.texts_to_sequences([generated_text])[0])
    scene_end_token = SCENE_END.lower()

    scenes_generated = 0
    words_generated = 0

    while scenes_generated != n_scenes and words_generated < max_words:
        # The model takes integer ids (not one-hot vectors): keep the last
        # INPUT_SEQ_LEN ids and left-pad shorter prompts with 0.
        padded_seq = pad_sequences([encoded], maxlen=INPUT_SEQ_LEN, truncating='pre')

        # The model returns raw logits. If it returns one row per input
        # position, the last row is the prediction for the next token.
        logits = np.asarray(model.predict(padded_seq, verbose=0))[0]
        if logits.ndim > 1:
            logits = logits[-1]

        predicted_idx = _sample_token_id(logits, temperature)
        predicted_word = tokenizer.index_word[predicted_idx]

        words_generated += 1
        if predicted_word == scene_end_token:
            scenes_generated += 1

        generated_text.append(predicted_word)
        encoded.append(predicted_idx)

    return generated_text

In [None]:
def post_process_line(text):
    """Re-attach space-separated punctuation to its words and restore newlines.

    Applies a fixed sequence of literal string replacements to `text` and
    returns the result.
    """
    # (search, replacement) pairs; they are applied strictly in this order.
    substitutions = []

    # Marks that belong to the preceding word: drop the space before them.
    substitutions += [(' ' + mark, mark) for mark in ('.', ':', '!', ';', ')', ']', '?', ',', '%')]

    # Marks that belong to the following word: drop the space after them.
    substitutions += [(mark + ' ', mark) for mark in ('[', '(', '$')]

    # Marks that sit inside a word: drop the spaces on both sides.
    substitutions += [(' ' + mark + ' ', mark) for mark in ("'", '-')]

    # Turn the newline stand-in token back into a real line break, then remove
    # the space that follows a line break.
    substitutions += [(NEWLINE.lower(), "\n"), ("\n ", "\n")]

    for old, new in substitutions:
        text = text.replace(old, new)

    return text

In [None]:
def post_process(text):
    """Render a list of generated tokens as a human-readable script.

    Scene markers become banner lines, a speaker marker turns the token after
    it into a "name: " prefix, the words between sentence markers are joined
    and passed through post_process_line, and a line-end marker becomes ".".
    All other tokens are skipped.

    Token lists that are empty or that stop in the middle of a sentence or
    right after a speaker marker (for example because generation hit its word
    limit) are rendered as far as they go instead of raising IndexError.

    Args:
        text: list of lower-cased tokens.

    Returns:
        The formatted script as a single string.
    """
    scene_start = SCENE_START.lower()
    scene_end = SCENE_END.lower()
    speaker_start = SPEAKER_START.lower()
    line_end = LINE_END.lower()
    sent_start = SENT_START.lower()
    sent_end = SENT_END.lower()

    n_tokens = len(text)
    parts = []
    idx = 0

    while idx < n_tokens:
        token = text[idx]

        if token == scene_start:
            parts.append("\n=== SCENE START ===")
        elif token == scene_end:
            parts.append("\n=== SCENE END ===\n")
        elif token == speaker_start:
            # The token after the marker is the speaker's name, if present.
            idx += 1
            if idx < n_tokens:
                parts.append(f"\n{text[idx]}: ")
        elif token == line_end:
            parts.append(".")
        elif token == sent_start:
            # Collect the words up to the sentence-end marker, or up to the
            # end of the token list if the sentence was cut off.
            idx += 1
            line = ""
            while idx < n_tokens and text[idx] != sent_end:
                line += text[idx] + " "
                idx += 1

            parts.append(post_process_line(line))

        idx += 1

    return "".join(parts)

In [None]:
# Generate up to 10 scenes opening with a line by Michael and print the
# formatted script.
sent = generate(seed_speaker="Michael", n_scenes=10, temperature=0.7)
print(post_process(sent))

In [None]:
# model.save('the_office_model')

In [None]:
# !zip the_office_model.zip the_office_model/*