## Get dataset from Kaggle

In [None]:
# Credentials must never be stored in the notebook. The Kaggle CLI reads
# KAGGLE_USERNAME / KAGGLE_KEY from the environment, and `!` shell commands inherit
# the kernel's environment, so the values are collected at run time instead.
# NOTE(review): an API key was previously committed in this cell; treat it as leaked
# and revoke it from the Kaggle account settings.
import os
from getpass import getpass

for _env_var, _prompt in (
    ("KAGGLE_USERNAME", "Kaggle username: "),
    ("KAGGLE_KEY", "Kaggle API key: "),
):
    # Respect values that are already exported (e.g. by the hosting environment)
    if not os.environ.get(_env_var):
        os.environ[_env_var] = getpass(_prompt)

In [None]:
!kaggle datasets download -d nasirkhalid24/the-office-us-complete-dialoguetranscript

Downloading the-office-us-complete-dialoguetranscript.zip to /content
 73% 1.00M/1.37M [00:00<00:00, 1.14MB/s]
100% 1.37M/1.37M [00:00<00:00, 1.46MB/s]


In [None]:
!ls

sample_data  the-office-us-complete-dialoguetranscript.zip


In [None]:
!unzip the-office-us-complete-dialoguetranscript.zip
!ls

Archive:  the-office-us-complete-dialoguetranscript.zip
  inflating: The-Office-Lines-V4.csv  
sample_data		 the-office-us-complete-dialoguetranscript.zip
The-Office-Lines-V4.csv


In [None]:
!mv The-Office-Lines-V4.csv data.csv

# EDA and Pre-processing

In [None]:
import pandas as pd
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,season,episode,title,scene,speaker,line,Unnamed: 6
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,
3,1,1,Pilot,1,Jim,"Actually, you called me in here, but yeah.",
4,1,1,Pilot,1,Michael,"All right. Well, let me show you how it's done.",


### Drop unnecessary columns

In [None]:
# Columns that no later step reads ("Unnamed: 6" appears empty in the preview above)
UNUSED_COLUMNS = ["season", "episode", "Unnamed: 6"]

df = df.drop(columns=UNUSED_COLUMNS)
df.head()

Unnamed: 0,title,scene,speaker,line
0,Pilot,1,Michael,All right Jim. Your quarterlies look very good...
1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So..."
2,Pilot,1,Michael,So you've come to the master for guidance? Is ...
3,Pilot,1,Jim,"Actually, you called me in here, but yeah."
4,Pilot,1,Michael,"All right. Well, let me show you how it's done."


## Sanitize speaker names

### Collapse "Michael: " into "Michael" and similar examples

Note: We decided against doing this because the lines corresponding to these
speaker names with colons are poor quality as can be observed below. This is likely because the dataset
is compiled from a variety of sources.

There is no need to drop these rows explicitly: we keep only the top 40 characters by number of lines, and these speaker names do not make that cut anyway.


In [None]:
df[df["speaker"] == "Michael: "]

Unnamed: 0,title,scene,speaker,line
31793,Happy Hour,4846,Michael:,w many is that?
31795,Happy Hour,4846,Michael:,unt the last one.
31797,Happy Hour,4846,Michael:,", new record!"
31799,Happy Hour,4846,Michael:,", what did you do today?"
31801,Happy Hour,4846,Michael:,", yeah, sitting on your big fat butt. Alright,..."
...,...,...,...,...
32088,Happy Hour,4888,Michael:,is I.
32090,Happy Hour,4888,Michael:,", hey guys."
32102,Happy Hour,4890,Michael:,"y, Julie! You having fun?"
32145,Happy Hour,4896,Michael:,"lperts, wait up. Oh, what a great night. Got t..."


### Correct typos: Deangelo > DeAngelo

In [None]:
# Maps a misspelled speaker name to its corrected spelling
typos = {
    "Deangelo": "DeAngelo"
}

# regex=True makes pandas treat each key as a pattern and substitute matches inside
# the speaker string, rather than only replacing values that equal the key exactly
df["speaker"] = df["speaker"].replace(typos, regex=True)

### Drop lines from characters that don't fall in the top 40 in terms of number of lines

In [None]:
TOP_COUNT = 40

# value_counts orders speakers by number of rows (most first); keep the leading TOP_COUNT names
speaker_line_counts = df.value_counts("speaker")
top_speakers = speaker_line_counts.index[:TOP_COUNT]

In [None]:
# Keep only rows spoken by a top speaker and renumber the rows from 0
is_top_speaker = df["speaker"].isin(top_speakers)
df = df.loc[is_top_speaker].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50300 entries, 0 to 50299
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    50300 non-null  object
 1   scene    50300 non-null  int64 
 2   speaker  50300 non-null  object
 3   line     50300 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


### Save new CSV to disk

In [None]:
# Hand pandas the path and let it manage the file handle itself
df.to_csv("processed_data.csv", index=False)

In [None]:
!ls -lh

total 10M
-rw-r--r-- 1 root root 4.6M Jan 18  2021 data.csv
-rw-r--r-- 1 root root 4.0M Apr 12 18:50 processed_data.csv
drwxr-xr-x 1 root root 4.0K Apr 11 13:33 sample_data
-rw-r--r-- 1 root root 1.4M Apr 12 18:49 the-office-us-complete-dialoguetranscript.zip


In [None]:
!head -10 processed_data.csv

title,scene,speaker,line
Pilot,1,Michael,All right Jim. Your quarterlies look very good. How are things at the library?
Pilot,1,Jim,"Oh, I told you. I couldn't close it. So..."
Pilot,1,Michael,"So you've come to the master for guidance? Is this what you're saying, grasshopper?"
Pilot,1,Jim,"Actually, you called me in here, but yeah."
Pilot,1,Michael,"All right. Well, let me show you how it's done."
Pilot,2,Michael," Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger.  All right. Done deal. Thank you very much, sir. You're a gentleman and a scholar. Oh, I'm sorry. OK. I'm sorry. My mistake.  That was a woman I was talking to, so... She had a very low voice. Probably a smoker, so...  So that's the way it's done."
Pilot,3,Michael,"I've, uh, I've been at Dunder Mifflin for 12 years, the last four as Regional Manager. If you want to come through here

# Convert .csv to a textual script for tokenization

### Meta tokens for the script text

In [None]:
# Markers written around each scene in script.txt
SCENE_START = "<scene_start>"
SCENE_END = "<scene_end>"

# Markers written around the speaker name of each row
SPEAKER_START = "<speaker_start>"
SPEAKER_END = "<speaker_end>"

# Markers written around the full spoken line of each row
LINE_START = "<line_start>"
LINE_END = "<line_end>"

# Markers written around every sentence inside a line
SENT_START = "<sent_start>"
SENT_END = "<sent_end>"

# Placeholder token substituted for "\n" before the simple script is tokenized
NEWLINE = "<NEWLINE>"

In [None]:
import csv
import nltk
import string
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Used to remove punctuation from strings
translator = str.maketrans('', '', string.punctuation)

# Flatten the CSV into one stream of meta tokens and words:
#   <scene_start> (<speaker_start> NAME <speaker_end> <line_start>
#                  (<sent_start> words <sent_end>)* <line_end>)* <scene_end> ...
# The CSV is opened with newline="" as the csv module requires for correct handling
# of line breaks embedded in quoted fields.
with open("script.txt", "w") as out_fd, open("processed_data.csv", newline="") as in_fd:
    out_fd.write(SCENE_START + " ")

    # None until the first row is seen, so the opening scene is never closed empty
    # regardless of which scene id the data starts with.
    current_scene = None
    for row in csv.DictReader(in_fd):
        row_scene = int(row["scene"])
        # Any change of scene id (not only an increase) starts a new scene
        if current_scene is not None and row_scene != current_scene:
            out_fd.write(SCENE_END + " " + SCENE_START + " ")
        current_scene = row_scene

        out_fd.write(f"{SPEAKER_START} {row['speaker']} {SPEAKER_END} {LINE_START} ")

        # A line may have multiple sentences
        for sentence in sent_tokenize(row['line']):
            sentence = sentence.translate(translator)
            out_fd.write(f"{SENT_START} {sentence} {SENT_END} ")

        out_fd.write(LINE_END + " ")

    out_fd.write(SCENE_END + " ")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Checking the first 500 characters of the file
!head -c500 script.txt

<scene_start> <speaker_start> Michael <speaker_end> <line_start> <sent_start> All right Jim <sent_end> <sent_start> Your quarterlies look very good <sent_end> <sent_start> How are things at the library <sent_end> <line_end> <speaker_start> Jim <speaker_end> <line_start> <sent_start> Oh I told you <sent_end> <sent_start> I couldnt close it <sent_end> <sent_start> So <sent_end> <line_end> <speaker_start> Michael <speaker_end> <line_start> <sent_start> So youve come to the master for guidance <sent

In [None]:
import csv
import nltk
import string
from nltk.tokenize import sent_tokenize

# Write a plain transcript with one "Speaker: line" entry per CSV row. Punctuation is
# kept here; it is separated into its own tokens when the text is loaded for training.
# The CSV is opened with newline="" as the csv module requires for correct handling
# of line breaks embedded in quoted fields.
with open("script_simple.txt", "w") as out_fd, open("processed_data.csv", newline="") as in_fd:
    for row in csv.DictReader(in_fd):
        out_fd.write(f"{row['speaker']}: {row['line']}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Checking the first 500 characters of the file
!head -c500 script_simple.txt

Michael: All right Jim. Your quarterlies look very good. How are things at the library?
Jim: Oh, I told you. I couldn't close it. So...
Michael: So you've come to the master for guidance? Is this what you're saying, grasshopper?
Jim: Actually, you called me in here, but yeah.
Michael: All right. Well, let me show you how it's done.
Michael:  Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just w

# Train word embedding model

In [None]:
from gensim.models import Word2Vec

In [None]:
# Load the training script; the context manager closes the file once it has been read
filename = "script_simple.txt"
with open(filename, "r") as script_fd:
    raw_text = script_fd.read()

### Separate punctuation from words

In [None]:
PUNCTUATIONS = {'.', '[', ']', '(', ')', ';', ':', "'", '/', '"', ',', '?', '*', '!', '-', '$', '%', '&'}

# Single pass over the text: every listed punctuation character gets a space on each side
punct_spacing = {ord(punct): f" {punct} " for punct in PUNCTUATIONS}
raw_text = raw_text.translate(punct_spacing)

# Keras' tokenizer gets rid of \n, so line breaks become an explicit token instead
raw_text = f" {NEWLINE} ".join(raw_text.split("\n"))

In [None]:
raw_text[:120]

'Michael :  All right Jim .  Your quarterlies look very good .  How are things at the library ?  <NEWLINE> Jim :  Oh ,  I'

In [None]:
# Split on whitespace and lower-case every token in one step
tokens = [word.lower() for word in raw_text.split()]

In [None]:
tokens[:20]

['michael',
 ':',
 'all',
 'right',
 'jim',
 '.',
 'your',
 'quarterlies',
 'look',
 'very',
 'good',
 '.',
 'how',
 'are',
 'things',
 'at',
 'the',
 'library',
 '?',
 '<newline>']

In [None]:
EMBED_SIZE = 200
EMBED_WINDOW = 5

# gensim's optimized training code truncates any single sentence to 10,000 tokens.
# Passing the whole corpus as one sentence would therefore train on only its first
# 10,000 tokens and leave every other word at its random initial vector, so the
# token stream is cut into consecutive chunks of at most that length.
MAX_SENTENCE_LEN = 10_000
token_chunks = [
    tokens[start:start + MAX_SENTENCE_LEN]
    for start in range(0, len(tokens), MAX_SENTENCE_LEN)
]

# min_count=1 keeps every token in the vocabulary, so each word seen by the Keras
# tokenizer can be looked up in embed_model.wv later.
embed_model = Word2Vec(
    sentences=token_chunks,
    window=EMBED_WINDOW,
    vector_size=EMBED_SIZE,
    min_count=1
)

In [None]:
embed_model.wv["jim"]

array([ 4.39689681e-02, -9.64623988e-02, -9.09160376e-02,  2.49501839e-01,
        2.68273413e-01, -1.47977322e-01,  1.91438600e-01,  4.03586864e-01,
       -2.04520881e-01,  6.19583428e-02, -8.59338418e-02, -1.03006348e-01,
        5.38071617e-02,  1.82757139e-01,  1.30591355e-02, -8.53403434e-02,
       -1.14948153e-01,  4.96131368e-02, -1.42454267e-01, -4.61933762e-01,
        1.86512068e-01, -1.09709911e-02,  8.91167745e-02,  5.59945479e-02,
        8.13712999e-02, -5.65909855e-02,  1.15503073e-01, -9.32086855e-02,
       -3.00256014e-01,  1.31183825e-02,  1.36739060e-01,  2.81858239e-02,
        2.19347671e-01, -4.00805436e-02, -1.04333699e-01,  1.22275271e-01,
        2.12237507e-01, -5.02038375e-02, -7.11665377e-02, -2.78267324e-01,
       -8.31846148e-02,  6.43602461e-02, -1.67608246e-01,  9.05463845e-02,
        2.14199737e-01, -1.18799359e-01, -8.38251878e-03, -1.40764087e-01,
        1.06701784e-01,  1.71764225e-01,  5.98359928e-02, -1.37249559e-01,
       -1.13440372e-01, -

### Tokenize and encode the text

In [None]:
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout, Input
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [None]:
# The whole corpus is treated as one pre-tokenized document
corpus = [tokens]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One document in, one id sequence out
encoded, = tokenizer.texts_to_sequences(corpus)

In [None]:
encoded[:10]

[11, 2, 53, 58, 19, 1, 49, 10237, 122, 127]

In [None]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

18549


### Generate input and output sequences

In [None]:
from nltk.util import ngrams

N = 10
sequences = list(ngrams(encoded, N))

In [None]:
# Every n-gram supplies its leading ids as the model input and its final id as the target
X = np.array([list(sequence[:-1]) for sequence in sequences])
Y = np.array([sequence[-1] for sequence in sequences])

### Build embeddings matrix

In [None]:
# Row i holds the Word2Vec vector of the word whose tokenizer index is i.
# Rows that no word maps to are left as zeros.
embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
for token, token_id in tokenizer.word_index.items():
    embedding_matrix[token_id] = embed_model.wv[token]

# LSTM Model

In [None]:
def data_generator(X, Y, batch_size, shuffle):
    """Yield (inputs, one-hot targets) batches indefinitely.

    Args:
        X: Array of input samples, indexed along its first axis.
        Y: Array of integer class ids, one per sample in X.
        batch_size: Maximum number of samples per batch; the final batch of each
            pass over the data may be smaller.
        shuffle: When true, the samples are re-permuted at the start of every pass.

    Yields:
        Tuples (X_batch, Y_batch), where Y_batch is Y one-hot encoded over
        vocab_size classes.

    Raises:
        ValueError: If batch_size is not positive, X is empty, or X and Y differ in
            length. Raised when the first batch is requested.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    # An empty dataset would otherwise loop forever without ever yielding
    if len(X) == 0:
        raise ValueError("X must contain at least one sample")
    if len(X) != len(Y):
        raise ValueError("X and Y must have the same number of samples")

    while True:
        # Shuffle the data; inputs and labels share one permutation to stay aligned
        if shuffle:
            indices = np.random.permutation(len(X))
            X = X[indices]
            Y = Y[indices]

        # Generate batches
        for i in range(0, len(X), batch_size):
            X_batch = X[i:i+batch_size]
            Y_batch = Y[i:i+batch_size]

            # Convert labels to one-hot vectors (done per batch rather than for all of Y)
            Y_batch = to_categorical(Y_batch, vocab_size)

            # Yield the batch
            yield X_batch, Y_batch

In [None]:
LSTM_UNITS = 300

# The network consumes windows of N-1 token ids (the same length given to Embedding's
# input_length), so the input length is fixed to match. A variable-length input (None)
# would make model.input_shape[1] None, which is unusable as a padding/truncation length.
# NOTE(review): `input` shadows the Python builtin; the name is kept because the
# cell that builds the Model references it.
input = Input((N - 1,))
# Initialised from the Word2Vec vectors and fine-tuned during training (trainable=True)
embed = Embedding(input_dim=vocab_size, output_dim=EMBED_SIZE, input_length=N-1, weights=[embedding_matrix], trainable=True)
# The first three LSTMs return full sequences so the next LSTM receives one vector per step.
# NOTE(review): return_state=True also returns each layer's final hidden/cell state
# (h*, c*); those values are not used when wiring the network below.
lstm1 = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
lstm2 = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
lstm3 = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
# The last LSTM returns only its final output, which feeds the softmax over the vocabulary
lstm4 = LSTM(LSTM_UNITS)
dense = Dense(vocab_size, activation="softmax")

net = embed(input)
net, h1, c1 = lstm1(net)
net, h2, c2 = lstm2(net)
net, h3, c3 = lstm3(net)
net = lstm4(net)
output = dense(net)

In [None]:
model = Model(input, output)
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [None]:
BATCH_SIZE = 128
EPOCHS = 1
LEARNING_RATE = 0.01
# Floor division: an epoch consists of full-size batches only
STEPS_PER_EPOCH = len(X) // BATCH_SIZE

# Override the optimizer's learning rate before training starts
model.optimizer.lr = LEARNING_RATE

train_generator = data_generator(X, Y, BATCH_SIZE, shuffle=True)
model.fit(train_generator, steps_per_epoch=STEPS_PER_EPOCH, epochs=EPOCHS)



<keras.callbacks.History at 0x7f48459a5f40>

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate(seed_speaker, n_lines, temperature=1.0, max_words=500):
    """Sample dialogue from the trained model, starting with a line by seed_speaker.

    Args:
        seed_speaker: Name of one of the top speakers. It is lower-cased and followed
            by " : " to form the seed text.
        n_lines: Stop once this many newline tokens have been generated.
        temperature: Sampling temperature, must be > 0. Values below 1 sharpen the
            predicted distribution, values above 1 flatten it.
        max_words: Upper bound on the number of generated tokens.

    Returns:
        The seed text followed by the generated tokens, separated by single spaces
        (still containing the lower-cased NEWLINE token).

    Raises:
        ValueError: If temperature is not positive.
        AssertionError: If seed_speaker is not one of top_speakers.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    assert seed_speaker in set(top_speakers), "Seed speaker is not a top speaker"

    # model.input_shape[1] is None for a variable-length input; fall back to the
    # training window (N - 1) so the context fed to the model never grows unbounded.
    context_len = model.input_shape[1] or (N - 1)
    newline_token = NEWLINE.lower()

    generated_text = f"{seed_speaker.lower()} : "

    # Encode the seed once; afterwards sampled ids are appended directly instead of
    # re-splitting and re-encoding the whole text on every step.
    context_ids = tokenizer.texts_to_sequences([generated_text.split()])[0]

    lines_generated = 0
    words_generated = 0

    while lines_generated < n_lines and words_generated < max_words:
        padded_seq = pad_sequences([context_ids], maxlen=context_len, truncating='pre')

        # float64 keeps the renormalised probabilities within np.random.choice's tolerance
        probs = model.predict(padded_seq, verbose=0)[0].astype(np.float64)
        # Index 0 is the padding id and has no entry in tokenizer.index_word
        probs[0] = 0.0

        # Temperature scaling in log space; log(0) = -inf simply maps back to probability 0
        with np.errstate(divide='ignore'):
            scaled = np.log(probs) / temperature
        scaled -= np.max(scaled)  # guards against overflow in exp for small temperatures
        probs = np.exp(scaled)
        probs /= np.sum(probs)

        predicted_idx = int(np.random.choice(len(probs), p=probs))
        predicted_word = tokenizer.index_word[predicted_idx]

        words_generated += 1

        if predicted_word == newline_token:
            lines_generated += 1

        # Append the predicted word to the generated text
        generated_text += " " + predicted_word

        # Slide the context window forward for the next iteration
        context_ids = (context_ids + [predicted_idx])[-context_len:]

    return generated_text

In [None]:
def post_process_text(text):
    """Re-attach the space-separated punctuation of generated text and restore line breaks.

    Args:
        text: Generated text in which punctuation marks and the lower-cased NEWLINE
            token stand as separate, space-delimited tokens.

    Returns:
        The text with punctuation joined to the neighbouring words, the newline
        token replaced by "\n", and the space following each line break removed.
    """
    glue_to_previous = ('.', ':', '!', ';', ')', ']', '?', ',', '%')
    glue_to_following = ('[', '(', '$')
    glue_on_both_sides = ("'", '-')

    # The substitutions are applied strictly in this order
    substitutions = [(' ' + mark, mark) for mark in glue_to_previous]
    substitutions += [(mark + ' ', mark) for mark in glue_to_following]
    substitutions += [(' ' + mark + ' ', mark) for mark in glue_on_both_sides]
    substitutions += [(NEWLINE.lower(), "\n"), ("\n ", "\n")]

    for old, new in substitutions:
        text = text.replace(old, new)

    return text

In [None]:
sent = generate(seed_speaker="Pam", n_lines=10, temperature=0.9)
print(post_process_text(sent))

pam:  stanley ryan dead your., edward, football:. 
: describe 
t eat 
'tonight so:: or was ': 
good. little re into i: being i saying, no to impression you'' with 
., " something '.: t.'know 
know s operates what oh you s you:? s: erin a:'with man no. you, that t:. cool. in just i with just: with describe with andy me: does.. to 
that? to no your kind s 
with i me t. too if do co place.'oh my you, have,, oscar have'very. flip you get you,, angela. you. that stuff dog michael kind: you you you up describe. " co describe'saying 
: you edward very nickels 



In [None]:
model.save('the_office_model')



In [None]:
!zip the_office_model.zip the_office_model/*

updating: the_office_model/assets/ (stored 0%)
updating: the_office_model/fingerprint.pb (stored 0%)
updating: the_office_model/keras_metadata.pb (deflated 92%)
updating: the_office_model/saved_model.pb (deflated 91%)
updating: the_office_model/variables/ (stored 0%)
