In [1]:
import pandas as pd
import numpy as np
import torch
import re
import spacy
import string
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the review table from the local CSV; the file is decoded as ISO-8859-1.
data  = pd.read_csv("imdb_master.csv",encoding='iso-8859-1')

In [113]:
# Number of rows, counted from the start of `data`, that the later cells
# clean and train on. Rows are taken in file order, not sampled.
limit = 500
data.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,0,0_2.txt
1,1,test,This is an example of why the majority of acti...,0,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",0,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,0,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,0,10003_3.txt


In [114]:
# spaCy pipeline used by `cleaner` for tokenisation and lemmatisation.
nlp = spacy.load('en_core_web_sm')

In [115]:
# Map every distinct label value to an integer class id.
# The classes are sorted first: iterating a raw set of strings can yield a
# different order on every kernel start, which would silently change which
# id each class receives between runs.
dict_y = {e: i for i, e in enumerate(sorted(set(data['label'])))}
data['label'] = data['label'].map(dict_y)

In [116]:
punt = string.punctuation

def cleaner(doc):
    """Tokenise and lemmatise `doc`, returning a list of cleaned lowercase tokens.

    Each token's lemma is lower-cased, stripped, and reduced to the characters
    0-9, a-z and A-Z. Tokens are skipped when:
      * the lemma is a substring of `string.punctuation` (the `in` test on a
        string is a substring test, so this covers single punctuation marks),
      * the lemma is '-PRON-' (presumably spaCy's pronoun placeholder lemma --
        confirm for the installed spaCy version),
      * nothing is left after removing non-alphanumeric characters (whitespace
        tokens, "...", "--"), so the vocabulary never receives an
        empty-string entry.

    Args:
        doc: raw text of one document.

    Returns:
        list[str]: cleaned tokens in document order.
    """
    sent = nlp(doc, disable=['parser', 'ner'])
    words = []
    for tok in sent:
        lemma = tok.lemma_
        if lemma in punt or lemma == '-PRON-':
            continue
        word = re.sub('[^0-9a-zA-Z]', '', lemma.lower().strip())
        if word:  # drop tokens that consisted only of stripped characters
            words.append(word)
    return words

In [117]:
# Clean the first `limit` reviews, in file order.
# Slicing the column avoids building a full row Series per document and
# cannot raise IndexError when `limit` exceeds the number of rows.
corpus = [cleaner(review) for review in data['review'].iloc[:limit]]

In [118]:
# Vocabulary: every distinct token that occurs anywhere in the cleaned corpus.
words = {tok for doc_tokens in corpus for tok in doc_tokens}

In [119]:
# Token <-> integer id lookup tables.
# Id 0 is reserved for '<pad>' because `pad_sequence` fills unused positions
# with 0; without the reservation a real word would share the padding id.
# '<pad>' cannot collide with a corpus token: `cleaner` keeps only
# alphanumeric characters. The vocabulary is sorted so ids are reproducible
# across kernel restarts (set iteration order is not).
stoi = {'<pad>': 0}
stoi.update((e, i) for i, e in enumerate(sorted(words), start=1))
itos = {i: e for e, i in stoi.items()}

In [120]:
# Encode every document as the list of integer ids of its tokens.
text_corpus = [[stoi[tok] for tok in doc_tokens] for doc_tokens in corpus]

In [186]:
# Number of complete batches of 64 documents that fit into `limit`.
# The literal 64 is the same value used as the DataLoader batch_size and as
# SentRNN.n_batch elsewhere in this notebook.
no_of_batches = limit // 64

# Drop the trailing documents that would not fill a whole batch.
text_corpus = text_corpus[: no_of_batches * 64]
len(text_corpus)

448

In [187]:
# Shortest and longest document length (in tokens) in `corpus`.
# Built-in min/max replace the manual scan, whose 10000 sentinel would report
# a wrong minimum if every document were longer than that. An empty corpus
# reports 0 for both values.
lengths = [len(doc_tokens) for doc_tokens in corpus]
maxx = max(lengths, default=0)
minx = min(lengths, default=0)

print("Max count :",maxx)
print("Min Count :",minx)

Max count : 200
Min Count : 200


In [188]:
def pad_sequence(corpus, max_length):
    """Left-pad (or truncate) integer sequences into a fixed-width 2-D array.

    Each row of the result holds one sequence, right-aligned and preceded by
    zeros. Sequences longer than `max_length` keep their first `max_length`
    items. Empty sequences produce an all-zero row; previously they raised a
    broadcast ValueError, because `-len(row):` became `-0:` and selected the
    whole row.

    Args:
        corpus: sequence of integer sequences (e.g. a list of lists of ids).
        max_length: width of every output row.

    Returns:
        np.ndarray: integer array of shape (len(corpus), max_length).
    """
    feature = np.zeros((len(corpus), max_length), dtype=int)

    for i, row in enumerate(corpus):
        tokens = np.asarray(row[:max_length], dtype=int)
        if tokens.size:
            # Right-align the tokens; the leading positions stay zero.
            feature[i, max_length - tokens.size:] = tokens

    return feature

In [189]:
# Pad/truncate every encoded document to 200 ids.
# NOTE: this rebinds `corpus` from the list of token lists to the padded
# integer array, so earlier cells that read `corpus` see different data if
# they are re-run after this one.
corpus = pad_sequence(text_corpus, 200)

### Deep LSTM-RNN Network

In [190]:
class SentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear classifier.

    The LSTM outputs of all time steps are flattened into one vector of
    ``seq_length * n_hidden`` features per example before the linear layer,
    so every input sequence must contain exactly ``seq_length`` tokens.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each token embedding.
        n_hidden: hidden size of every LSTM layer.
        n_layers: number of stacked LSTM layers.
        output_size: number of output logits per example.
        seq_length: number of tokens in every input sequence.
        lstm_dropout: dropout probability passed to ``nn.LSTM``.
        batch_first: layout flag passed to ``nn.LSTM``; when True, inputs are
            (batch, seq), otherwise (seq, batch).
    """

    def __init__(self,vocab_size,embed_size,n_hidden,n_layers,output_size, seq_length, lstm_dropout=0.2, batch_first=True):

        super(SentRNN,self).__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        # Default batch size used by init_hidden() when none is given.
        self.n_batch = 64
        self.batch_first = batch_first

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm_net = nn.LSTM(embed_size, n_hidden, n_layers, dropout=lstm_dropout, batch_first=batch_first)
        self.linear = nn.Linear(seq_length * n_hidden, output_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, X, hidden=None):
        """Compute logits for a batch of token-id sequences.

        Args:
            X: integer tensor of token ids laid out according to `batch_first`.
            hidden: optional (h_0, c_0) tuple for the LSTM; when None the LSTM
                starts from a zero state.

        Returns:
            (logits, hidden): logits of shape (batch, output_size) and the
            final LSTM state tuple.
        """
        embed = self.embedding(X)

        lstm_out, hidden = self.lstm_net(embed, hidden)

        # Put the batch dimension first so the flatten below keeps the
        # features of one example together.
        if not self.batch_first:
            lstm_out = lstm_out.transpose(0, 1)

        # Flatten using the actual batch size rather than a hard-coded 64, so
        # partial batches and single-example inference work.
        lstm_out = lstm_out.contiguous().view(lstm_out.size(0), -1)

        # Feed the dropped-out activations to the classifier; the dropout
        # result used to be computed and then ignored.
        dropped = self.dropout(lstm_out)

        fc = self.linear(dropped)

        return fc, hidden

    def init_hidden(self, batch_size=None):
        """Return a zero (h_0, c_0) tuple on the same device/dtype as the weights.

        Args:
            batch_size: batch dimension of the state; defaults to `self.n_batch`.

        Returns:
            tuple of two tensors, each (n_layers, batch_size, n_hidden).
        """
        if batch_size is None:
            batch_size = self.n_batch

        weight = next(self.parameters()).data
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [193]:
# Pair the padded id matrix with the first len(text_corpus) labels, then
# serve them shuffled in batches of 64.
dataset = TensorDataset(torch.from_numpy(corpus),torch.from_numpy(np.asarray(data['label'][:len(text_corpus)])))
train_loader = DataLoader(dataset,shuffle=True, batch_size=64)

In [194]:
# seq_length=200 matches the padding width passed to pad_sequence.
# output_size=3 is presumably one logit per label class -- confirm against dict_y.
model = SentRNN(vocab_size =  len(stoi), embed_size = 200 ,n_hidden =128 ,n_layers=3 ,output_size=3, seq_length=200)

In [195]:
# Show the module tree and layer sizes.
print(model)

SentRNN(
  (embedding): Embedding(9395, 200)
  (lstm_net): LSTM(200, 128, num_layers=3, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=25600, out_features=3, bias=True)
  (dropout): Dropout(p=0.3)
)


In [196]:
# Cross-entropy over the raw logits returned by the model; Adam updates all
# model parameters with learning rate 0.001.
loss = nn.CrossEntropyLoss()
optimizer  = torch.optim.Adam(model.parameters(),lr=0.001)

In [200]:

# One pass over the training data, recording the loss of every batch.
losses = []
model.train()  # make sure the dropout layers are active
for X, Y in train_loader:

    # Start every batch from a fresh zero LSTM state.
    h = model.init_hidden()
    hidden = tuple([each.data for each in h])

    # Clear gradients left over from the previous batch; without this call
    # backward() keeps adding to them and every step uses a running sum.
    optimizer.zero_grad()

    lstm_out, hidden = model(X.long(), hidden)
    # CrossEntropyLoss expects integer (long) class ids as targets.
    cost = loss(lstm_out, Y.long())
    cost.backward()
    optimizer.step()
    print(cost.item())
    losses.append(cost.item())

0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [198]:
# Number of rows currently held in `corpus`.
len(corpus)

448

In [199]:
# Logits left in `lstm_out` by the last executed training step.
lstm_out

tensor([[ 14.1139, -13.9945, -13.9592],
        [ 24.5893, -24.4610, -24.2406],
        [ 29.0094, -28.9005, -28.7207],
        [ 24.7019, -24.5614, -24.3065],
        [ 32.2401, -32.0582, -31.8641],
        [ 13.6327, -13.5409, -13.4829],
        [ 24.4856, -24.3404, -24.0940],
        [ 20.3122, -20.2107, -20.0148],
        [ 14.1162, -14.0173, -13.9585],
        [ 24.4172, -24.2869, -24.0668],
        [ 24.0377, -23.8727, -23.6514],
        [ 13.8027, -13.7087, -13.6616],
        [ 26.3495, -26.1851, -25.9515],
        [ 14.0682, -13.9724, -13.9172],
        [ 13.7048, -13.6167, -13.5763],
        [ 20.0507, -19.9929, -19.7630],
        [ 13.8485, -13.7624, -13.7157],
        [ 16.6125, -16.4635, -16.3466],
        [ 14.0885, -13.9997, -13.9527],
        [ 21.5946, -21.4638, -21.2441],
        [ 13.4843, -13.4148, -13.3463],
        [ 22.5352, -22.3851, -22.1347],
        [ 18.6986, -18.5995, -18.3842],
        [ 23.4180, -23.2936, -23.0466],
        [ 20.7612, -20.6488, -20.4520],
