Import Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

Using: cuda


In [3]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [4]:
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})

In [6]:
stop_words = set(stopwords.words("english"))

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return words

In [8]:
df['tokens'] = df['review'].apply(clean_text)

In [9]:
all_words = [word for tokens in df['tokens'] for word in tokens]
word_counts = Counter(all_words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_to_int = {word: idx+1 for idx, word in enumerate(vocab)} 

In [11]:
def encode_tokens(tokens):
    return [vocab_to_int.get(word, 0) for word in tokens]

In [12]:
df['encoded'] = df['tokens'].apply(encode_tokens)

In [13]:
max_len = 200

def pad_sequence(seq, max_len):
    return seq[:max_len] + [0]*(max_len-len(seq))

features = np.array([pad_sequence(seq, max_len) for seq in df['encoded']])
labels = np.array(df['sentiment'])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float))
test_data = TensorDataset(torch.tensor(X_test, dtype=torch.long), torch.tensor(y_test, dtype=torch.float))

train_loader = DataLoader(train_data, shuffle=True, batch_size=64)
test_loader = DataLoader(test_data, shuffle=False, batch_size=64)

In [42]:
class LSTM_GRU_Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.3):
        super(LSTM_GRU_Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size+1, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, 
                            batch_first=True, dropout=dropout)
        self.gru = nn.GRU(hidden_dim, hidden_dim, num_layers=n_layers, 
                          batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        gru_out, _ = self.gru(lstm_out)
        out = gru_out[:, -1, :]   # take last hidden state
        out = self.dropout(out)
        out = self.fc(out)        # raw logits
        return out


In [43]:
# GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# vocab_to_int is a dict: word -> index
max_idx = max(vocab_to_int.values())

# Parameters
embed_dim = 128
hidden_dim = 128
output_dim = 2   # Positive, Neutral, Negative

# Create model
model = LSTM_GRU_Model(vocab_size=max_idx+1, embed_dim=embed_dim, 
                       hidden_dim=hidden_dim, output_dim=1).to(device)

Using device: cuda




In [44]:
import torch.optim as optim
import torch.nn as nn

# ✅ Loss function for binary sentiment classification
criterion = nn.BCEWithLogitsLoss()

# ✅ Optimizer (Adam works well for NLP tasks)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [45]:
epochs = 3
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    
    for inputs, targets in tqdm(train_loader):
        inputs, targets = inputs.to(device), targets.to(device).float()  # BCE expects float targets
        
        optimizer.zero_grad()
        output = model(inputs).squeeze()   # shape: [batch_size]
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(train_loader):.4f}")


100%|██████████| 625/625 [00:10<00:00, 61.52it/s]


Epoch 1/3, Loss: 0.6928


100%|██████████| 625/625 [00:08<00:00, 71.00it/s]


Epoch 2/3, Loss: 0.6422


100%|██████████| 625/625 [00:08<00:00, 69.95it/s]

Epoch 3/3, Loss: 0.4786





In [46]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        outputs = model(inputs).squeeze()
        preds = torch.sigmoid(outputs) > 0.5
        correct += (preds == targets.bool()).sum().item()
        total += targets.size(0)

print(f"Accuracy: {correct / total * 100:.2f}%")


Accuracy: 83.64%


In [47]:
from sklearn.metrics import classification_report

all_preds = []
all_targets = []

model.eval()
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        outputs = model(inputs).squeeze()
        preds = (torch.sigmoid(outputs) > 0.5).int()
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

print(classification_report(all_targets, all_preds, target_names=['Negative', 'Positive']))


              precision    recall  f1-score   support

    Negative       0.87      0.79      0.83      4961
    Positive       0.81      0.88      0.84      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



Custom review

In [48]:
def preprocess_review(review, word2idx, max_len):
    # 1. Lowercase and simple split (you can add more preprocessing)
    tokens = review.lower().split()
    
    # 2. Convert to indices (unknown words get index 0)
    indices = [word2idx.get(token, 0) for token in tokens]
    
    # 3. Pad / truncate to max_len
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))
    else:
        indices = indices[:max_len]
    
    return torch.tensor(indices).unsqueeze(0)  # shape: [1, max_len]


In [50]:
torch.save(model.state_dict(), "sentiment_model.pth")
print("Model saved successfully!")


Model saved successfully!


In [51]:
# Load model (replace class definition if needed)
model = LSTM_GRU_Model(vocab_size=len(word2idx)+1, embed_dim=100, hidden_dim=128, output_dim=1).to(device)
model.load_state_dict(torch.load("sentiment_model.pth"))
model.eval()


NameError: name 'word2idx' is not defined