In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.legacy.data import Field,LabelField,TabularDataset,BucketIterator
from torch.autograd import Variable
import torch.nn.functional as F
import random


In [None]:
# Base directory for the dataset and the saved checkpoint. The training cell
# appends a file name with plain `+`, so the trailing slash is required.
# (Path looks like a Colab Google Drive mount — adjust for other environments.)
dirs="/content/drive/MyDrive/dataset/Movie_Review_Dataset/"

In [None]:
# Preview the raw CSV with pandas. The path is built from `dirs` so the
# dataset location is defined in one place instead of being repeated.
data=pd.read_csv(os.path.join(dirs,"Rnn_Dataset.csv"))
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there is a family where a little boy...,0
4,petter mattei s love in the time of money is a...,1


In [None]:
# Review text: tokenised with spaCy's small English model; batch_first=True
# makes batches come out with the batch dimension first.
TEXT = Field(
    tokenize="spacy",
    tokenizer_language="en_core_web_sm",
    batch_first=True,
)

# Sentiment target, stored as a float tensor so it can be fed to a
# binary-cross-entropy style loss.
LABEL = LabelField(dtype=torch.float)

In [None]:
# (attribute name, Field) pairs handed to TabularDataset; with a list, the
# pairs are matched to the CSV columns in order.
field=[("text",TEXT),("label",LABEL)]

In [None]:
# Load the full labelled dataset through torchtext. The path is derived from
# `dirs` rather than repeated, and the CSV header row is skipped.
training_data=TabularDataset(path=os.path.join(dirs,"Rnn_Dataset.csv"),format="csv",
                             fields=field,skip_header=True)

# Inspect one parsed example (token list plus label) as a sanity check.
print(vars(training_data[3]))

{'text': ['basically', 'there', 'is', 'a', 'family', 'where', 'a', 'little', 'boy', 'jake', 'thinks', 'there', 'is', 'a', 'zombie', 'in', 'his', 'closet', 'his', 'parents', 'are', 'fighting', 'all', 'the', 'time', 'this', 'movie', 'is', 'slower', 'than', 'a', 'soap', 'opera', 'and', 'suddenly', 'jake', 'decides', 'to', 'become', 'rambo', 'and', 'kill', 'the', 'zombie', 'ok', 'first', 'of', 'all', 'when', 'you', 'are', 'going', 'to', 'make', 'a', 'film', 'you', 'must', 'decide', 'if', 'its', 'a', 'thriller', 'or', 'a', 'drama', 'as', 'a', 'drama', 'the', 'movie', 'is', 'watchable', 'parents', 'are', 'divorcing', 'arguing', 'like', 'in', 'real', 'life', 'and', 'then', 'we', 'have', 'jake', 'with', 'his', 'closet', 'which', 'totally', 'ruins', 'all', 'the', 'film', 'i', 'expected', 'to', 'see', 'a', 'boogeyman', 'similar', 'movie', 'and', 'instead', 'i', 'watched', 'a', 'drama', 'with', 'some', 'meaningless', 'thriller', 'spots', '3', 'out', 'of', '10', 'just', 'for', 'the', 'well', 'play

In [None]:
# Seed Python's RNG first, then hand split() an explicit RNG state.
# `random.seed()` itself returns None, so its return value cannot serve as
# `random_state`. The split uses split()'s default ratio.
random.seed(2020)
train_data,valid_data=training_data.split(random_state=random.getstate())

In [None]:
# Build the vocabularies from the training split only, so nothing from the
# validation split leaks into the token set.
# - at most 25,000 regular tokens are kept
# - GloVe 100-d vectors are attached to the vocabulary
# - tokens missing from GloVe get normally-distributed initial vectors
TEXT.build_vocab(
    train_data,
    max_size=25_000,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399280/400000 [00:17<00:00, 22650.80it/s]

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Bucketed iterators: examples are grouped by token count (via sort_key) and
# each batch of 64 is created directly on `device`.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=64,
    sort_key=lambda example: len(example.text),
    device=device,
)

In [None]:
class CNN(nn.Module):
  """Sentence classifier: three parallel convolutions over word embeddings.

  Each convolution spans the full embedding width and a different number of
  consecutive tokens. Every filter's response is max-pooled over the sentence,
  the pooled features are concatenated, passed through dropout and projected
  by a linear layer to `output_dim` raw scores (no activation is applied).

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    n_filters: number of output channels of each convolution.
    filter_sizes: exactly three kernel heights (tokens per window).
    output_dim: size of the final linear output.
    dropout: dropout probability applied to the concatenated features.
    pad_token: vocabulary index of the padding token.

  Raises:
    ValueError: if `filter_sizes` does not contain exactly three entries.
  """

  def __init__(self,vocab_size,embedding_dim,n_filters,filter_sizes,output_dim,dropout,pad_token):
    super().__init__()

    # The architecture has exactly three convolution branches while the linear
    # layer is sized from len(filter_sizes); reject anything else up front
    # instead of failing later with a shape mismatch.
    if len(filter_sizes)!=3:
      raise ValueError(
          "CNN expects exactly 3 filter sizes, got {}".format(len(filter_sizes)))

    # Kept so forward() can pad inputs shorter than the widest kernel.
    self.pad_token=pad_token
    self.min_len=max(filter_sizes)

    self.embedding=nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_token)

    # Attribute names conv1/conv2/conv3 are part of the state_dict layout.
    self.conv1=nn.Conv2d(in_channels=1,
                         out_channels=n_filters,
                         kernel_size=(filter_sizes[0],embedding_dim))

    self.conv2=nn.Conv2d(in_channels=1,
                         out_channels=n_filters,
                         kernel_size=(filter_sizes[1],embedding_dim))

    self.conv3=nn.Conv2d(in_channels=1,
                         out_channels=n_filters,
                         kernel_size=(filter_sizes[2],embedding_dim))

    self.fc=nn.Linear(len(filter_sizes)*n_filters,output_dim)

    self.dropout=nn.Dropout(dropout)


  def forward(self,text):
    """Map token indices [batch_size, sent_len] to scores [batch_size, output_dim]."""
    # A convolution cannot run on a sentence shorter than its kernel, so
    # right-pad short batches with the padding index up to the widest filter.
    missing=self.min_len-text.shape[1]
    if missing>0:
      filler=text.new_full((text.shape[0],missing),self.pad_token)
      text=torch.cat((text,filler),dim=1)

    # [batch_size, sent_len] -> [batch_size, sent_len, emb_dim]
    embedded=self.embedding(text)

    # Add the single input channel: [batch_size, 1, sent_len, emb_dim]
    embedded=embedded.unsqueeze(1)

    # Each kernel covers the whole embedding width, so the last dimension
    # collapses to 1 and is squeezed away:
    # [batch_size, n_filters, sent_len - filter_sizes[n] + 1]
    conved_1=F.relu(self.conv1(embedded).squeeze(3))
    conved_2=F.relu(self.conv2(embedded).squeeze(3))
    conved_3=F.relu(self.conv3(embedded).squeeze(3))

    # Max over all window positions: [batch_size, n_filters]
    pooled_1=F.max_pool1d(conved_1,conved_1.shape[2]).squeeze(2)
    pooled_2=F.max_pool1d(conved_2,conved_2.shape[2]).squeeze(2)
    pooled_3=F.max_pool1d(conved_3,conved_3.shape[2]).squeeze(2)

    # [batch_size, 3 * n_filters]
    cat=self.dropout(torch.cat((pooled_1,pooled_2,pooled_3),dim=1))

    return self.fc(cat)




In [None]:
# Model hyper-parameters.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # index of the padding token
INPUT_DIM = len(TEXT.vocab)                # vocabulary size, special tokens included
EMBEDDING_DIM = 100                        # matches the 100-d GloVe vectors in the vocab
N_FILTERS = 100                            # output channels per convolution
FILTER_SIZES = [3,4,5]                     # window heights, in tokens
OUTPUT_DIM = 1                             # a single logit
DROPOUT = 0.5

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_token=PAD_IDX,
)
model

CNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (conv1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
  (conv2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  (conv3): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Recorded tensor shapes for one batch of 64 sequences of length 760.
# They appear to follow CNN.forward step by step: 1 = input indices,
# 2 = embeddings, 3 = after unsqueeze, 4-6 = the three convolution outputs
# (760 - k + 1 for k = 3, 4, 5), 7-9 = pooled features, 10 = concatenation.
'''
1.  torch.Size([64, 760])
2.  torch.Size([64, 760, 100])
3.  torch.Size([64, 1, 760, 100])
4.  torch.Size([64, 100, 758])
5.  torch.Size([64, 100, 757])
6.  torch.Size([64, 100, 756])
7.  torch.Size([64, 100])
8.  torch.Size([64, 100])
9.  torch.Size([64, 100])
10  torch.Size([64, 300])

'''

'\n1.  torch.Size([64, 760])\n2.  torch.Size([64, 760, 100])\n3.  torch.Size([64, 1, 760, 100])\n4.  torch.Size([64, 100, 758])\n5.  torch.Size([64, 100, 757])\n6.  torch.Size([64, 100, 756])\n7.  torch.Size([64, 100])\n8.  torch.Size([64, 100])\n9.  torch.Size([64, 100])\n10  torch.Size([64, 300])\n\n'

In [None]:
# Smoke test: push one training batch through the untrained model and print
# the raw outputs. The iterator creates tensors on `device`, but the model is
# only moved there in a later cell, so align the batch with wherever the
# model's weights currently live to avoid a device mismatch on GPU runtimes.
model_device=next(model.parameters()).device
for i in train_iterator:
  txt=i.text.to(model_device)
  print(model(txt))
  break

tensor([[ 0.9083],
        [-0.6104],
        [-0.0227],
        [-1.5625],
        [-1.1194],
        [-0.9592],
        [-1.8388],
        [-1.1911],
        [-0.1971],
        [-0.7892],
        [ 0.5156],
        [ 0.2901],
        [ 0.9840],
        [-0.8118],
        [-1.4829],
        [-0.7820],
        [-1.0490],
        [-0.5827],
        [-0.4956],
        [-1.1284],
        [-0.7720],
        [ 0.4526],
        [-1.1008],
        [-0.0111],
        [ 0.2275],
        [-1.0507],
        [-1.5554],
        [-1.4688],
        [-1.4689],
        [-1.0205],
        [ 0.5472],
        [ 0.8430],
        [-0.4240],
        [-1.5074],
        [-0.4183],
        [-0.8738],
        [-1.9993],
        [ 0.2618],
        [ 1.0797],
        [ 0.7723],
        [-0.4115],
        [-0.4705],
        [ 0.4618],
        [-0.8248],
        [-0.8999],
        [-1.7742],
        [-0.5283],
        [ 0.1934],
        [-0.8424],
        [-0.5047],
        [-0.2371],
        [-2.6910],
        [-0.

In [None]:

def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters with `requires_grad` set to False are excluded.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable size of the CNN built above, with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 2,620,801 trainable parameters


In [None]:

# Vectors attached to the vocabulary by TEXT.build_vocab (GloVe 100-d).
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the randomly initialised embedding table in place; copy_ returns
# the updated tensor, which is what the cell displays.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.4795,  0.9588, -0.5235,  ...,  1.4800,  0.0436,  0.3592],
        [-0.9299,  0.3776, -1.2845,  ...,  0.8045,  0.8316, -0.0865],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4474,  0.0548, -0.3747,  ...,  0.6502, -1.0164, -0.1385],
        [ 0.7078,  0.9445,  0.0374,  ...,  0.2431,  0.1647,  0.6960],
        [-0.0940,  0.4738,  0.5370,  ..., -0.0193,  0.5154,  0.0811]])

In [None]:

# Start the two special tokens from all-zero embeddings so the unknown and
# padding rows carry no initial signal.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
import torch.optim as optim

# Move the model to its final device before the optimizer captures its
# parameters, as the torch.optim documentation recommends.
model = model.to(device)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# BCEWithLogitsLoss applies the sigmoid itself, so the model emits raw logits.
criterion = nn.BCEWithLogitsLoss().to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 of 10 correct gives 0.8, not 8).

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    Returns a scalar tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()  # float so the division is not integer
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `.text` and `.label`. Returns a tuple
    (mean per-batch loss, mean per-batch accuracy) for the pass.
    """
    model.train()  # training mode, so dropout is active

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # Model output is [batch, 1]; drop the trailing dim to match the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Each batch must expose `.text` and `.label`. Returns a tuple
    (mean per-batch loss, mean per-batch accuracy).
    """
    model.eval()  # evaluation mode, so dropout is disabled

    running_loss = 0
    running_acc = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
N_EPOCHS = 5

# Where the weights of the best epoch (lowest validation loss) are stored.
checkpoint_path = dirs+'Convolutional_Sentiment_Analysis.pt'
best_valid_loss = float('inf')

# NOTE(review): the best weights are only written to disk. After the loop,
# `model` still holds the last epoch's weights, not the best checkpoint.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.160 | Train Acc: 93.90%
	 Val. Loss: 0.237 |  Val. Acc: 90.67%
	Train Loss: 0.106 | Train Acc: 96.16%
	 Val. Loss: 0.255 |  Val. Acc: 90.47%
	Train Loss: 0.106 | Train Acc: 96.16%
	 Val. Loss: 0.255 |  Val. Acc: 90.47%


KeyboardInterrupt: ignored

KeyboardInterrupt: ignored

In [None]:
# spaCy pipeline used by predict_sentiment for tokenising raw sentences; it
# is the same model name the TEXT field was configured with.
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence, min_len = 5):
    """Score a raw sentence with the trained model.

    Args:
        model: the classifier; it is switched to eval mode.
        sentence: raw text, tokenised here with the spaCy tokenizer.
        min_len: minimum number of tokens fed to the model; shorter inputs
            are padded. The default matches the largest filter size in
            FILTER_SIZES.

    Returns:
        The sigmoid of the model's single output, as a Python float in [0, 1].
        In the recorded runs values near 1 go with positive sentences; the
        direction depends on how LABEL's vocabulary ordered the classes.
    """
    # NOTE(review): the training samples shown in this notebook are lowercase
    # with punctuation removed, while `sentence` is used as typed — confirm
    # whether inputs should be normalised the same way.
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        # Use the field's own pad token so padding maps to PAD_IDX.
        tokenized += [TEXT.pad_token] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Add a batch dimension of 1: [1, sent_len].
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)
    # Pure inference: do not build an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Spot check: short, clearly negative sentence (capitalised first word).
predict_sentiment(model, "This film is terrible")

0.028501778841018677

In [None]:
# Spot check: same template with a clearly positive adjective.
predict_sentiment(model, "This film is great")

0.8359347581863403

In [None]:
# Spot check: positive sentiment carried by a verb, all lowercase.
predict_sentiment(model, "i love this film")

0.8440243601799011

In [None]:
# Spot check: three tokens only, so the input is padded up to min_len.
predict_sentiment(model, "this is outstanding")

0.9153926372528076

In [None]:
# Spot check: three-token negative counterpart of the previous sentence.
predict_sentiment(model, "this is bad")

0.05162129923701286

In [None]:
# Spot check: positive sentence about an actor rather than the film.
predict_sentiment(model, "i love this actor")

0.7052640318870544

In [None]:
# Negation case: the recorded run scored this close to 0.47, i.e. nearly
# undecided, so "not good" is only weakly recognised as negative.
predict_sentiment(model, "acting was not good")

0.4672795832157135