Importing Libraries

In [2]:

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
#from tensorflow import set_random_seed
from numpy.random import seed
#set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np


import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

#Unzipping the IMDB dataset

In [3]:
!unzip '/content/drive/MyDrive/archive (3).zip'

Archive:  /content/drive/MyDrive/archive (3).zip
replace IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
curr_dir = '/content/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'IMDB' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.review.values))
        break

all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)


50000

#I take 880 sentences to train on and slice the length of each sentence upto 200

In [5]:
all_headlines = all_headlines[:880]

In [9]:
all_headlines1 = []
for i in range(0,880):
  all_headlines[i]= all_headlines[i][0:200]

In [10]:


all_headlines

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo",
 'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece',
 'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is wi',
 "Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, J",
 'Petter Mattei\'s "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be t

#Cleaning the text by removing punctuations and converting all the words to lower

In [12]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with mebr br the first thing that struck me abo',
 'a wonderful little production br br the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece',
 'i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air conditioned theater and watching a lighthearted comedy the plot is simplistic but the dialogue is wi',
 'basically theres a family where a little boy jake thinks theres a zombie in his closet  his parents are fighting all the timebr br this movie is slower than a soap opera and suddenly j',
 'petter matteis love in the time of money is a visually stunning film to watch mr mattei offers us a vivid portrait about human relations this is a movie that seems to be telling us what money p',
 'probably my alltime fa

In [13]:
len(corpus)

880

#Performing tokenization

In [14]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[23, 4],
 [23, 4, 1],
 [23, 4, 1, 103],
 [23, 4, 1, 103, 762],
 [23, 4, 1, 103, 762, 38],
 [23, 4, 1, 103, 762, 38, 978],
 [23, 4, 1, 103, 762, 38, 978, 13],
 [23, 4, 1, 103, 762, 38, 978, 13, 92],
 [23, 4, 1, 103, 762, 38, 978, 13, 92, 90],
 [23, 4, 1, 103, 762, 38, 978, 13, 92, 90, 35]]

In [15]:
total_words

6158

#Padding the sequences to make all of them of the same length

In [16]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = 92
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [17]:
max_sequence_len

92

#Defining the LSTM model

In [20]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 50, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(300))
    model.add(Dropout(0.2))
    #model.add(LSTM(100))

    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', )
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 91, 50)            307900    
_________________________________________________________________
lstm_2 (LSTM)                (None, 300)               421200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6158)              1853558   
Total params: 2,582,658
Trainable params: 2,582,658
Non-trainable params: 0
_________________________________________________________________


#Fitting the model on the training data

In [21]:
model.fit(predictors, label, epochs=50, batch_size=100,verbose=1 )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f7fa1d814e0>

#Function for generating text using a seed

In [22]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

#Examples of text generation

In [23]:
generate_text('movie',10 , model, max_sequence_len)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'Movie I Had To Admit That I Cant Believe It Is'

In [24]:
generate_text('production', 20, model, max_sequence_len)

'Production In The Decrepit Ranks Of The Already Oversaturated Hillybilly Horror Subgenre Comes This Woeful Tale Of A Vacationing Family Terrorized'

In [25]:
generate_text('star', 50, model, max_sequence_len)

'Star Mordrid Is One Of Those Rare Films That Is Completely Under The Radar But Is Totally Worthwhile It Really Reminds Me Of The Old Serials From The 30S And 40S Which Is Why Id Have Loved To Se Out Of This Movie And It Was A Huge Improvement On The'

In [26]:
generate_text('How', 50, model, max_sequence_len)

'How I Ever Heard About Driving Lessons By The Book And What I Saw It A Lot Of This Movie Made And A Few Horror I Just Could Have Been A Fan Of Bad Movies Ie Me It Was So It Was The First Film That You Have T Seen In'

In [28]:
generate_text('Boom', 20, model, max_sequence_len)

'Boom I Saw This Movie When I Was About 12 When I Just Had A Big Fan Of The Farcry Game'

In [29]:
generate_text('spider', 20, model, max_sequence_len)

'Spider Marjoriefarrah Fawcettlives In Fear After Being Accosted By A Lone Biker She Is Mortally Shaken With The Fact Her Attacker'

In [30]:
generate_text('good', 20, model, max_sequence_len)

'Good Lord What Is Not Sure What The Producers Needed To Trade On The Name Of A Somewhat Successful Movie Franchise'