In [28]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import torchtext
from torchtext.data import get_tokenizer

import re

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from collections import Counter

from torch.utils.data import TensorDataset, DataLoader

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
imdb = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')
imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [30]:
X,y = imdb['review'].values,imdb['sentiment'].values
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(f'shape of train data is {X_train.shape}')
print(f'shape of test data is {X_test.shape}')

shape of train data is (37500,)
shape of test data is (12500,)


In [31]:
def preprocess(s):
    s = re.sub(r"[^\w\s]", '', s)
    s = re.sub(r"\s+", '', s)
    s = re.sub(r"\d", '', s)

    return s

def tokenize(x_train,y_train,x_val,y_val):
    word_list = []

    stop_words = set(stopwords.words('english'))
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess(word)
            if word not in stop_words and word != '':
                word_list.append(word)

    freq = Counter(word_list)
    word_freq = sorted(freq,key=freq.get,reverse=True)[:1000]
    onehot_dict = {w:i+1 for i,w in enumerate(word_freq)}

    final_list_train,final_list_test = [],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess(word)] for word in sent.lower().split()
                                     if preprocess(word) in onehot_dict.keys()])
    for sent in x_val:
            final_list_test.append([onehot_dict[preprocess(word)] for word in sent.lower().split()
                                    if preprocess(word) in onehot_dict.keys()])

    encoded_train = [1 if label =='positive' else 0 for label in y_train]
    encoded_test = [1 if label =='positive' else 0 for label in y_val]
    return np.array(final_list_train), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict

In [32]:
X_train,y_train,X_test,y_test,vocab = tokenize(X_train,y_train,X_test,y_test)

  return np.array(final_list_train), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict


In [33]:
print(f'Length of vocabulary is {len(vocab)}')
print(vocab)

Length of vocabulary is 1000
{'br': 1, 'movie': 2, 'film': 3, 'one': 4, 'like': 5, 'good': 6, 'even': 7, 'would': 8, 'time': 9, 'really': 10, 'see': 11, 'story': 12, 'well': 13, 'much': 14, 'get': 15, 'great': 16, 'also': 17, 'bad': 18, 'people': 19, 'first': 20, 'dont': 21, 'made': 22, 'films': 23, 'movies': 24, 'make': 25, 'could': 26, 'way': 27, 'characters': 28, 'think': 29, 'watch': 30, 'many': 31, 'seen': 32, 'character': 33, 'two': 34, 'never': 35, 'love': 36, 'acting': 37, 'best': 38, 'plot': 39, 'little': 40, 'know': 41, 'show': 42, 'ever': 43, 'life': 44, 'better': 45, 'still': 46, 'scene': 47, 'say': 48, 'end': 49, 'man': 50, 'scenes': 51, 'something': 52, 'go': 53, 'im': 54, 'back': 55, 'real': 56, 'watching': 57, 'thing': 58, 'doesnt': 59, 'actors': 60, 'didnt': 61, 'years': 62, 'actually': 63, 'another': 64, 'funny': 65, 'though': 66, 'makes': 67, 'nothing': 68, 'find': 69, 'look': 70, 'work': 71, 'going': 72, 'every': 73, 'lot': 74, 'new': 75, 'old': 76, 'part': 77, 'us'

In [34]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

x_train_pad = padding_(X_train,500)
x_test_pad = padding_(X_test,500)

In [35]:

train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [36]:
class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,output_dim,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)


        self.dropout = nn.Dropout(0.3)

        self.lin = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,input,hidden):
        batch_size = input.size(0)

        embeds = self.embedding(input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.lin(out)

        sig_out = self.sig(out)

        sig_out = sig_out.view(batch_size, -1)

        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):

        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim))
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim))
        hidden = (h0,c0)
        return hidden

In [37]:
no_layers = 1
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256

model = SentimentRNN(no_layers,vocab_size,output_dim,hidden_dim,embedding_dim,drop_prob=0.5)

print(model)

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (lin): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [38]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [39]:
clip = 5
epochs = 5

#training loop
for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0

    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:


        h = tuple([each.data for each in h])

        model.zero_grad()
        output,h = model(inputs,h)

        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())

        accuracy = acc(output,labels)
        train_acc += accuracy

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()



    test_h = model.init_hidden(batch_size)
    test_losses = []
    test_acc = 0.0
    model.eval()
    for inputs, labels in test_loader:
            test_h = tuple([each.data for each in test_h])

            output, test_h = model(inputs, test_h)
            test_loss = criterion(output.squeeze(), labels.float())

            test_losses.append(test_loss.item())

            accuracy = acc(output,labels)
            test_acc += accuracy

    epoch_train_loss = np.mean(train_losses)
    epoch_test_loss = np.mean(test_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_test_acc = test_acc/len(test_loader.dataset)

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} test_loss : {epoch_test_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} test_accuracy : {epoch_test_acc*100}')


Epoch 1
train_loss : 0.5169673895637195 test_loss : 0.42572581535577775
train_accuracy : 74.53066666666666 test_accuracy : 81.104
Epoch 2
train_loss : 0.3835217460195223 test_loss : 0.3885291995406151
train_accuracy : 83.408 test_accuracy : 83.12
Epoch 3
train_loss : 0.332069266974926 test_loss : 0.3533663983345032
train_accuracy : 85.97333333333333 test_accuracy : 84.19200000000001
Epoch 4
train_loss : 0.2998346304992835 test_loss : 0.3299584139585495
train_accuracy : 87.464 test_accuracy : 85.824
Epoch 5
train_loss : 0.26592094752192497 test_loss : 0.33927978563308714
train_accuracy : 89.19466666666666 test_accuracy : 86.024


In [68]:
def predict_text(text):
        word_seq = np.array([vocab[preprocess(word)] for word in text.split()
                         if preprocess(word) in vocab.keys()])
        word_seq = np.expand_dims(word_seq,axis=0)
        pad =  torch.from_numpy(padding_(word_seq,500))
        batch_size = 1
        h = model.init_hidden(batch_size)
        h = tuple([each.data for each in h])
        output, h = model(pad, h)
        return(output.item())

In [86]:
index = 6875
print(imdb['review'][index])

print(f'Actual sentiment is : {imdb["sentiment"][index]}')

prediction = predict_text(imdb['review'][index])
status = "positive" if prediction > 0.5 else "negative"
print(f'Predicted sentiment is : {status} ')

I was really looking forward to this show given the quality of the actors and the fact that The Scott brothers were involved. Unfortunately my hopes were dashed! Yet again we are led to believe that the KGB are a group of inept morons who don't have a clue what they are doing. At one point there is a laughable scene where 4 KGB agents couldn't handle one CIA agent. I grow weary of these biased, one sided and completely inaccurate portrayals of the Spy game that went on during the cold war. I find it laughable that the US is incapable of making objective movies about their involvement in WW2 and beyond. Just like the pathetic U-571, where we are led to believe that the US obtained the Enigma machine, again, utterly false.<br /><br />To its credit, "The Company" is very well filmed and acted. The locales are also exceptionally well realised. Alfred Molina puts in a great performance as does Keaton (The conflict between them is very well done). I really wanted to like this show and no dou