In [2]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
#Cleaning 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))])
    text = [word for word in word_tokenize(text) if word.isalpha()]
    return text

In [12]:
tqdm.pandas()
df['cleaned_review'] = df['review'].progress_apply(preprocess_text)

100%|██████████| 50000/50000 [20:46<00:00, 40.11it/s]


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review          50000 non-null  object
 1   sentiment       50000 non-null  object
 2   cleaned_review  50000 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [14]:
df.to_csv('cleaned_IMDB.csv', index=False)

In [3]:
df = pd.read_csv('cleaned_IMDB.csv')

In [4]:
print(pd.unique(df['sentiment']))

['positive' 'negative']


In [5]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [6]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,"['one', 'reviewers', 'mentioned', 'watching', ..."
1,A wonderful little production. <br /><br />The...,1,"['wonderful', 'little', 'production', 'br', 'b..."
2,I thought this was a wonderful way to spend ti...,1,"['thought', 'wonderful', 'way', 'spend', 'time..."
3,Basically there's a family where a little boy ...,0,"['basically', 'theres', 'family', 'little', 'b..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"['petter', 'matteis', 'love', 'time', 'money',..."


In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [44]:
from collections import Counter

# Build a word-to-index mapping
word_counts = Counter(word for review in df['cleaned_review'] for word in review)
vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.most_common())}
vocab['<PAD>'] = 0  # Add a padding token


In [48]:
max_len = 200  # Maximum sequence length

def encode_review(review):
    encoded = [vocab.get(word, 0) for word in review]
    if len(encoded) < max_len:
        encoded += [0] * (max_len - len(encoded))  # Pad shorter sequences
    else:
        encoded = encoded[:max_len]  # Truncate longer sequences
    return encoded

encoded_reviews = [encode_review(review) for review in df['cleaned_review']]
labels = torch.tensor(df['sentiment'], dtype=torch.long)


In [55]:
inputs = torch.tensor(encoded_reviews, dtype=torch.long)
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [50]:
embedding_dim = 100
embedding_matrix = torch.zeros(len(vocab), embedding_dim)

with open('glove.6B.100d.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(val) for val in values[1:]], dtype=torch.float)
        idx = vocab.get(word)
        if idx is not None:
            embedding_matrix[idx] = vector


In [51]:
class SentimentRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_size, num_layers=1):
        super(SentimentRNN, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.size()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x):
        embeds = self.embedding(x)
        out, _ = self.rnn(embeds)
        out = self.fc(out[:, -1, :])  # Use the last time step's output
        return out

In [52]:
hidden_dim = 128
output_size = 2  # Number of sentiment classes (positive, negative)
model = SentimentRNN(embedding_matrix, hidden_dim, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [56]:
epochs = 10

for epoch in range(epochs):
    total_loss = 0
    for inputs_batch, labels_batch in train_loader:
        outputs = model(inputs_batch)
        loss = criterion(outputs, labels_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')


Epoch 1/10, Loss: 0.5557
Epoch 2/10, Loss: 0.5559
Epoch 3/10, Loss: 0.5605
Epoch 4/10, Loss: 0.5596
Epoch 5/10, Loss: 0.5579
Epoch 6/10, Loss: 0.5586
Epoch 7/10, Loss: 0.5583
Epoch 8/10, Loss: 0.5579
Epoch 9/10, Loss: 0.5590
Epoch 10/10, Loss: 0.5585


In [57]:

correct = 0
total = 0

with torch.no_grad():
    for inputs_batch, labels_batch in test_loader:
        outputs = model(inputs_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Test Accuracy: 49.59%


In [58]:
class SentimentLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_size, num_layers=1):
        super(SentimentLSTM, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.size()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        out = self.fc(lstm_out[:, -1, :])  # Use the last time step's output
        return out

In [59]:
hidden_dim = 128
output_size = 2  # Positive and negative sentiments
model = SentimentLSTM(embedding_matrix, hidden_dim, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [60]:
epochs = 5

for epoch in range(epochs):
    total_loss = 0
    for inputs_batch, labels_batch in train_loader:
        outputs = model(inputs_batch)
        loss = criterion(outputs, labels_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')


Epoch 1/5, Loss: 0.6939
Epoch 2/5, Loss: 0.6933
Epoch 3/5, Loss: 0.6932
Epoch 4/5, Loss: 0.6932
Epoch 5/5, Loss: 0.6928


In [61]:
correct = 0
total = 0

with torch.no_grad():
    for inputs_batch, labels_batch in test_loader:
        outputs = model(inputs_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += labels_batch.size(0)
        correct += (predicted == labels_batch).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Test Accuracy: 51.04%


These models are not working, i will look at them later