In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from collections import Counter

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
df = pd.read_csv("D:/Intern/DataSets/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Cleaning

In [4]:
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    return text

In [6]:
df["review"] = df["review"].apply(clean_text)

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


### Voacb

In [8]:
all_words = " ".join(df["review"]).split()

In [9]:
len(all_words)

11980071

In [10]:
vocab_size = 20000

In [11]:
most_common = Counter(all_words).most_common(vocab_size-2) # -2 for <pad> and <unk>

In [12]:
most_common[:10]

[('the', 668009),
 ('and', 324443),
 ('a', 323036),
 ('of', 289414),
 ('to', 268125),
 ('is', 211092),
 ('br', 201954),
 ('it', 190862),
 ('in', 186787),
 ('i', 175643)]

In [13]:
word2idx = {word : idx+2 for idx, (word, _) in enumerate(most_common)} # 0 = <pad>, 1 = <unk>
word2idx["<pad>"] = 0
word2idx["<unk>"] = 1

In [14]:
def encode_text(text):
    return [word2idx.get(word, 1) for word in text.split()] # 1 = <unk>

In [15]:
df["encoded"] = df["review"].apply(encode_text)

In [16]:
df.head()

Unnamed: 0,review,sentiment,encoded
0,one of the other reviewers has mentioned that ...,1,"[30, 5, 2, 79, 2063, 48, 1065, 13, 102, 151, 4..."
1,a wonderful little production br br the...,1,"[4, 396, 122, 356, 8, 8, 2, 1384, 2983, 7, 55,..."
2,i thought this was a wonderful way to spend ti...,1,"[11, 194, 12, 15, 4, 396, 97, 6, 1154, 59, 24,..."
3,basically there s a family where a little boy ...,0,"[684, 41, 14, 4, 229, 117, 4, 122, 403, 3362, ..."
4,petter mattei s love in the time of money is...,1,"[1, 10639, 14, 112, 10, 2, 59, 5, 293, 7, 4, 2..."


### Pad Sequence

In [17]:
def pad_sequence(seq, max_len=200):
    return seq[:max_len] + [0] * (max_len - len(seq)) # max_len - len(seq) = 5 - 7 = -2 * 0 = 0

In [18]:
df["padded"] = df["encoded"].apply(pad_sequence)

In [19]:
df.head()

Unnamed: 0,review,sentiment,encoded,padded
0,one of the other reviewers has mentioned that ...,1,"[30, 5, 2, 79, 2063, 48, 1065, 13, 102, 151, 4...","[30, 5, 2, 79, 2063, 48, 1065, 13, 102, 151, 4..."
1,a wonderful little production br br the...,1,"[4, 396, 122, 356, 8, 8, 2, 1384, 2983, 7, 55,...","[4, 396, 122, 356, 8, 8, 2, 1384, 2983, 7, 55,..."
2,i thought this was a wonderful way to spend ti...,1,"[11, 194, 12, 15, 4, 396, 97, 6, 1154, 59, 24,...","[11, 194, 12, 15, 4, 396, 97, 6, 1154, 59, 24,..."
3,basically there s a family where a little boy ...,0,"[684, 41, 14, 4, 229, 117, 4, 122, 403, 3362, ...","[684, 41, 14, 4, 229, 117, 4, 122, 403, 3362, ..."
4,petter mattei s love in the time of money is...,1,"[1, 10639, 14, 112, 10, 2, 59, 5, 293, 7, 4, 2...","[1, 10639, 14, 112, 10, 2, 59, 5, 293, 7, 4, 2..."


In [20]:
X = np.array(df["padded"].tolist())
y = df["sentiment"].values

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40000, 200) (40000,)
(10000, 200) (10000,)


In [23]:
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [24]:
train_ds = IMDBDataset(X_train, y_train)
test_ds = IMDBDataset(X_test, y_test)

In [25]:
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=16)

In [26]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5, bidirectional=True):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers=num_layers, 
                            dropout=dropout, 
                            batch_first=True, 
                            bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim) # 2 for bidirectional
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.lstm(embedded)
        
        # concat the both foreward and backward hidden states from both directions
        if self.bidirectional:
            hidden_concat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden_concat = hidden[-1,:,:]
            
        fc_output = self.fc(hidden_concat)
        return fc_output

In [27]:
model = SentimentRNN(vocab_size, embedding_dim=128, hidden_dim=128, output_dim=1, num_layers=2)
model = model.to(device)

In [28]:
criterian = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
def binary_acc(preds, y):
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == y).float()
    return correct.sum() / len(correct)

In [30]:
epochs = 5

for epoch in range(epochs):
    
    # Training
    model.train()
    train_loss, train_acc = 0, 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device, dtype=torch.float)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward
        preds = model(X_batch).squeeze(1) # shape [batch_size]
        loss = criterian(preds, y_batch)
        
        # Calculate accuracy
        acc = binary_acc(preds, y_batch)
        
        # Backward
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        train_acc += acc.item()
    
    # Validation
    model.eval()
    val_loss, val_acc = 0, 0

    with torch.no_grad():
        
        for X_batch, y_batch in test_loader:
            
            X_batch, y_batch = X_batch.to(device), y_batch.to(device, dtype=torch.float)
            
            preds = model(X_batch).squeeze(1)
            loss = criterian(preds, y_batch)
            
            acc = binary_acc(preds, y_batch)
            
            val_loss += loss.item()
            val_acc += acc.item()
            
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss/len(train_loader)}" , 
          f"Train Acc: {train_acc/len(train_loader)}", 
          f"Val Loss: {val_loss/len(test_loader)}", 
          f"Val Acc: {val_acc/len(test_loader)}")
    
    

Epoch: 1, Train Loss: 0.6795559959173203 Train Acc: 0.559125 Val Loss: 0.6667189787864685 Val Acc: 0.5959
Epoch: 2, Train Loss: 0.506737742215395 Train Acc: 0.752075 Val Loss: 0.3859276170015335 Val Acc: 0.8263
Epoch: 3, Train Loss: 0.3817715517908335 Train Acc: 0.8295 Val Loss: 0.3304046890676022 Val Acc: 0.8612
Epoch: 4, Train Loss: 0.3220999955847859 Train Acc: 0.858975 Val Loss: 0.30459997407197953 Val Acc: 0.8728
Epoch: 5, Train Loss: 0.2871860074907541 Train Acc: 0.87885 Val Loss: 0.28987975727915766 Val Acc: 0.8806


In [50]:
sample = "this movie is so awesome."

# Clean and encode the text
sample = clean_text(str(sample))
encoded = encode_text(sample)
padded = pad_sequence(encoded)

# Convert to tensor and prepare for model
sample = torch.tensor(padded).unsqueeze(0).to(device)

with torch.no_grad():
	output = model(sample)
	output = torch.sigmoid(output)
	print(f"Probability of positive sentiment: {output.item():.4f}")
	prediction = "Positive" if torch.round(output).item() == 1 else "Negative"
	print(f"Prediction: {prediction}")
 

Probability of positive sentiment: 0.9894
Prediction: Positive


In [41]:
sample = "this movie is descent."

# Clean and encode the text
sample = clean_text(str(sample))
encoded = encode_text(sample)
padded = pad_sequence(encoded)

# Convert to tensor and prepare for model
sample = torch.tensor(padded).unsqueeze(0).to(device)

with torch.no_grad():
	output = model(sample)
	output = torch.sigmoid(output)
	print(f"Probability of positive sentiment: {output.item():.4f}")
	prediction = "Positive" if torch.round(output).item() == 1 else "Negative"
	print(f"Prediction: {prediction}")
 

Probability of positive sentiment: 0.4429
Prediction: Negative


In [47]:
sample_list = list(df["review"].sample(2))
sample_list

['on top of the fact that skylar is a complete douche bag and his cons are unimaginative  his schemes require way to much preparation to make any of his scams worth while  without giving away any spoilers  as if it matters with this piece of crap  his cons are such a sham because it takes the effort of days and days of planning  and the use of multiple accomplices and an entire camera crew etc  just to scam someone into a service that would cost less than a hundred bucks     in addition if you read in the credits they re stage some of the phone calls etc  because they don t pan out   the whole concept of this show is bunk because all of his cons have the cost in both the crew and the effort of ten times the actual cost of the service he is trying to get for free   what is the con ',
 'this off beat horror movie seems to be getting nothing but bad reviews  my question is  why  i think this movie is pretty good  dee snider did very well for his first  and only  time directing  he also pl

In [48]:
sample = """ on top of the fact that skylar is a complete douche bag and his cons are unimaginative  
			his schemes require way to much preparation to make any of his scams worth while  without giving away any spoilers  
   			as if it matters with this piece of crap  his cons are such a sham because it takes the effort of days and days of planning  
      		and the use of multiple accomplices and an entire camera crew etc  
        	just to scam someone into a service that would cost less than a hundred bucks     
         	in addition if you read in the credits they re stage some of the phone calls etc  because they don t pan out   
           	the whole concept of this show is bunk because all of his cons have the cost in both the crew and the effort of ten times the actual cost of the service he is trying to get for free   
            what is the con """

# Clean and encode the text
sample = clean_text(str(sample))
encoded = encode_text(sample)
padded = pad_sequence(encoded)

# Convert to tensor and prepare for model
sample = torch.tensor(padded).unsqueeze(0).to(device)

with torch.no_grad():
	output = model(sample)
	output = torch.sigmoid(output)
	print(f"Probability of positive sentiment: {output.item():.4f}")
	prediction = "Positive" if torch.round(output).item() == 1 else "Negative"
	print(f"Prediction: {prediction}")
 

Probability of positive sentiment: 0.1837
Prediction: Negative
