In [1]:
import itertools
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout
from keras.layers import LSTM, TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.utils import shuffle

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


In [0]:
# Load the movie-plot dataset from the working directory.
df = pd.read_csv("wiki_movie_plots_deduped.csv")

In [3]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [0]:
# Remove the columns that are not used downstream.
# The original cell deleted 'Wiki Page' twice, which raises KeyError on the
# second statement. A single drop with errors='ignore' also keeps the cell
# safe to re-run after the columns are already gone.
# NOTE(review): the duplicated delete may have been meant for a different
# column -- confirm whether any other column should be dropped here.
df.drop(columns=['Wiki Page', 'Director', 'Genre'], inplace=True, errors='ignore')

In [0]:
# Keep only the first 1000 rows by dropping every index label after them.
rows_to_discard = df.index[1000:]
df.drop(index=rows_to_discard, inplace=True)

In [6]:
# Number of plot entries remaining in the frame.
length = df['Plot'].shape[0]
length

1000

In [0]:
# Tokenize every plot into a list of word/punctuation tokens.
plots = [word_tokenize(plot_text) for plot_text in df['Plot']]

In [0]:
# Flatten the per-plot token lists into one long token stream.
corpus_raw = [token for plot_tokens in plots for token in plot_tokens]

In [0]:
# Count token occurrences and keep every token whose count is not exactly 1,
# in first-seen order.
counts = Counter(corpus_raw)
tokens = [word for word, freq in counts.items() if freq != 1]

In [10]:
# Size of the kept vocabulary (tokens whose count is not 1).
len(tokens)

10487

In [11]:
# Peek at the first three kept tokens (tokens is already a list).
tokens[:3]

['A', 'bartender', 'is']

In [0]:
# Build sliding windows of `maxlen` tokens (the model input) together with
# the token that immediately follows each window (the prediction target).
maxlen = 3
step = 1
ngrams = []
next_tokens = []
for plot in plots:
    window_starts = range(0, len(plot) - maxlen, step)
    ngrams.extend(plot[start:start + maxlen] for start in window_starts)
    next_tokens.extend(plot[start + maxlen] for start in window_starts)

In [0]:
# Fit a Keras tokenizer on the n-gram windows, with num_words set to the
# number of kept tokens.
vocab_size = len(tokens)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(ngrams)

In [0]:
# Reverse lookup: integer index -> token string.
index2token = dict((idx, word) for word, idx in tokenizer.word_index.items())

In [0]:
# Encode each n-gram window as integer ids and pad to a fixed width of maxlen.
sequences = tokenizer.texts_to_sequences(ngrams)
X_train = pad_sequences(sequences=sequences, maxlen=maxlen)

In [0]:
# Encode each target token as a one-hot row.
# Each token is wrapped in a one-element list so the Tokenizer treats it as an
# already-tokenized sequence, exactly like the n-gram windows it was fitted on.
# Passing bare strings made the Tokenizer re-split and punctuation-filter them:
# targets such as ',' became empty (all-zero rows) and hyphenated tokens were
# split into several ids (multi-hot rows).
# Tokens outside the num_words vocabulary still map to an all-zero row.
sequences = tokenizer.texts_to_sequences([[token] for token in next_tokens])
y_train = tokenizer.sequences_to_matrix(sequences)

In [0]:
# Feed-forward next-token model: embed the maxlen context tokens, flatten,
# pass through two hidden layers, and predict a distribution over the vocabulary.
model = Sequential()

model.add(Embedding(len(tokens), 50, input_length=maxlen))
model.add(Flatten())
# Hidden layers use ReLU. The original used softmax here, which forces each
# hidden activation vector to sum to 1 and bottlenecks the network; softmax
# belongs only on the output layer.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
# Output layer: one probability per vocabulary index.
model.add(Dense(len(tokens), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# Train for 99 single-epoch rounds, reshuffling the data before each round.
# Every round uses one batch containing the whole training set.
full_batch = len(X_train)
for _ in range(99):
    X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
    model.fit(
        X_train_shuffled,
        y_train_shuffled,
        batch_size=full_batch,
        epochs=1,
        verbose=0,
    )

In [0]:
# Encode the seed phrase the same way the training windows were encoded:
# word-tokenize it and pass the token list to the Tokenizer.
# The original used ' '.join('a bartender is'), which inserts a space between
# every character, so the seed was encoded as single letters instead of words.
seed_tokens = word_tokenize('a bartender is')
sequences = tokenizer.texts_to_sequences([seed_tokens])
x_pred = pad_sequences(sequences, maxlen=maxlen)

In [0]:
# Run the model on the encoded seed and store its output.
preds = model.predict(x_pred, verbose=0)

In [0]:
import random
import sys

def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    Parameters
    ----------
    preds : array-like of float
        One-dimensional array of non-negative weights (e.g. a softmax output).
    temperature : float, optional
        Sharpening/flattening factor. 1.0 (the default) samples from `preds`
        as given; values below 1 concentrate mass on the largest entries,
        values above 1 flatten the distribution.

    Returns
    -------
    int
        The sampled index, in ``range(len(preds))``.

    Raises
    ------
    ValueError
        If `preds` is empty, `temperature` is not positive, or `preds` has no
        positive finite maximum to normalize by.
    """
    probs = np.asarray(preds).astype('float64')
    if probs.size == 0:
        raise ValueError("preds must not be empty")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    peak = probs.max()
    if not np.isfinite(peak) or peak <= 0:
        raise ValueError("preds must contain at least one positive finite value")

    # p ** (1/T) equals exp(log(p) / T) but copes with exact zeros without
    # log(0) warnings. Dividing by the peak first keeps the largest weight at 1,
    # so small temperatures cannot underflow every entry to zero.
    weights = np.power(probs / peak, 1.0 / temperature)
    weights = weights / weights.sum()

    # Sample over the indices of `preds` itself (the original referenced an
    # undefined `corpus`).
    return int(np.random.choice(len(weights), p=weights))

In [0]:
# Start the generated text from the seed phrase and echo both.
seed = 'a bartender is'
generated = seed
banner = '----- Generating with seed: "' + seed + '"'
print(banner)
print(generated)

In [0]:
# Generate up to 8 more words, each predicted from the last `maxlen` tokens.
# Fixes over the original loop:
#   * the context is the last maxlen *tokens*, not a character slice of the
#     string that was then encoded one character at a time;
#   * the index -> word lookup uses `index2token` (the name `index_word` was
#     never defined in this notebook);
#   * words are appended with a separating space.
generated_tokens = word_tokenize(generated)
for _ in range(8):
    context = generated_tokens[-maxlen:]
    sequences = tokenizer.texts_to_sequences([context])
    x_pred = pad_sequences(sequences, maxlen=maxlen)
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds)
    next_word = index2token.get(next_index)
    if next_word is None:
        # Index 0 is reserved by the Tokenizer (padding) and has no word.
        continue
    generated_tokens.append(next_word)
    generated += ' ' + next_word
    print(generated)