In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from tensorflow import keras
from tensorflow.keras import backend as K 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, Dense, Dropout, Embedding, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
''' reading data '''
df = pd.read_csv("wiki_movie_plots_deduped.csv")

In [3]:
''' displaying first five rows of data '''
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
''' checking shape of data '''
df.shape

(34886, 8)

In [5]:
''' checking null values '''
df.isnull().sum()

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

In [6]:
''' checking info of data '''
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [8]:
df = df[(df['Origin/Ethnicity'] == 'American') | (df['Origin/Ethnicity'] == 'Telugu') | (df['Origin/Ethnicity'] == 'Bollywood') 
          | (df['Origin/Ethnicity'] == 'Tamil') | (df['Origin/Ethnicity'] == 'Malyalam')]

In [11]:
df_txt = df.loc[:len(df)/2, :]

In [18]:
''' Tokenizer '''
token = Tokenizer()
''' fitting on data '''
token.fit_on_texts(df_txt['Plot'])

''' converting text into sequence '''
seq = token.texts_to_sequences(df_txt['Plot'])
max_len = max([len(x) for x in X_seq])

''' adding paddng to make the lenght of all words equal '''
pad = pad_sequences(X_seq)

In [19]:
print("Maximum ssequence lenght: ", max_len)
print("shape of  pad: ", pad.shape)

Maximum ssequence lenght:  2978
shape of  pad:  (12110, 2978)


In [20]:
voc_size = len(token.word_index) + 1

In [23]:
token2 = Tokenizer()
token2.fit_on_texts(df_txt['Title'])
seq2 = token2.texts_to_sequences(df_txt['Title'])
max_len_2 = max([len(y) for y in seq2])
pad2 = pad_sequences(seq2, maxlen = max_len_2)

In [24]:
print("max_len_2: ", max_len_2)
print('shape of pad2: ', np.shape(pad2))

max_len_2:  15
shape of pad2:  (12110, 15)


In [25]:
voc_size2 = len(token2.word_index) + 1

In [30]:
K.clear_session() 
latent_dim = 120 

''' Encoder '''
enc_inp = Input(shape=(max_len,)) 

''' Embedding Layer '''
out = Embedding(voc_size, 40,trainable=True)(enc_inp) 

''' LSTM1 '''
enc_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True) 
enc_out1, state_h1, state_c1 = enc_lstm1(out) 

''' LSTM 2 '''
enc_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True) 
enc_out2, state_h2, state_c2 = enc_lstm2(enc_out1) 

''' LSTM 3''' 
enc_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
enc_out, state_h, state_c= enc_lstm3(enc_out2) 

enc_states = [state_h, state_c]

''' Decoder '''
dec_inp = Input(shape=(None,)) 
''' Embebedding layer '''
dec_layer = Embedding(voc_size2, 20,trainable=True) 
dec_emb = dec_layer(dec_inp) 

'''LSTM using encoder_states as initial state'''
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
dec_out, dec_fwd_state, dec_back_state = dec_lstm(dec_emb, initial_state=enc_states) 

dec_out = Dense(voc_size, activation='softmax')(dec_out)

''' Model '''
model = Model([enc_inp, dec_inp], dec_out) 

''' compile the model '''
model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

''' lets see how model looks like '''
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2978)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2978, 40)     3299560     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 2978, 120),  77280       embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [None]:
''' EarlyStopping '''
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

pad.shape, pad2[: : -1].shape, (pad2.reshape(pad2.shape[0], pad2.shape[1], 1)).shape

''' training '''
history = model.fit([pad, pad2[: : -1]], pad2.reshape(pad2.shape[0], pad2.shape[1], 1), epochs=10,  batch_size=16, 
                    callbacks=[es], validation_split=0.2)

Epoch 1/10
  8/606 [..............................] - ETA: 2:26:07 - loss: 11.0126 - accuracy: 0.5317