In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
df = pd.read_csv("/content/IMDB-Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df['review'] = df['review'].str.lower().str.split()

df.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production., <br, /><br...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, there's, a, family, where, a, litt...",negative
4,"[petter, mattei's, ""love, in, the, time, of, m...",positive


In [None]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
df.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",1
1,"[a, wonderful, little, production., <br, /><br...",1
2,"[i, thought, this, was, a, wonderful, way, to,...",1
3,"[basically, there's, a, family, where, a, litt...",0
4,"[petter, mattei's, ""love, in, the, time, of, m...",1


In [None]:
test_data.head()

Unnamed: 0,review,sentiment
33553,"[i, really, liked, this, summerslam, due, to, ...",1
9427,"[not, many, television, shows, appeal, to, qui...",1
199,"[the, film, quickly, gets, to, a, major, chase...",0
12447,"[jane, austen, would, definitely, approve, of,...",1
39489,"[expectations, were, somewhat, high, for, me, ...",0


In [None]:





vocab = {word for phrase in df['review'] for word in phrase}
word_to_idx = {word: idx for idx, word in enumerate(vocab, start=1)}

max_length = df['review'].str.len().max()
max_length

2470

In [None]:
vocab

{'strawberry',
 'bob!<br',
 'audience_great',
 'bullying.',
 'g-man',
 'scourge,',
 'anansie',
 '/>poo',
 'jäniksen',
 '(repetitious',
 'borders!),',
 'alien-costume',
 'tawny',
 'too...whoa!',
 'multimillionaire',
 'shielded',
 'noblesse',
 'simon?)',
 'coast-meets-',
 'process,"',
 'governator',
 'jokes,"',
 'whatsoever...i',
 'rendering,',
 'offers.',
 'zoos',
 'nerf',
 'main-writer',
 "o'daniel.",
 'pouting',
 'al-mutairi"',
 '(shirdi',
 'loooooove',
 '"extras."',
 'complicit',
 'loomis.<br',
 '21.)<br',
 'europe:',
 'impassive',
 '(useless)',
 '"specialagentfoxmulder"',
 'warnercolor',
 '"little',
 'stage).',
 'wisecracks!)',
 'others").',
 'snooze-fests',
 'objective;',
 'hamburger.',
 'preparations.',
 'bookish.<br',
 '"incidents"',
 'co-ordinate',
 '(1992).',
 '/>wrong!',
 'ziplock',
 'killers!',
 'story".<br',
 'fabulous-',
 'firmer".',
 'firearms;',
 'screamy',
 'romeo&juliet',
 'deesh".',
 '200mph',
 'pairs,',
 'vidler',
 "bert's",
 'beaton)',
 'stupified',
 'gentle"',
 "'mu

In [None]:
word_to_idx

{'strawberry': 1,
 'bob!<br': 2,
 'audience_great': 3,
 'bullying.': 4,
 'g-man': 5,
 'scourge,': 6,
 'anansie': 7,
 '/>poo': 8,
 'jäniksen': 9,
 '(repetitious': 10,
 'borders!),': 11,
 'alien-costume': 12,
 'tawny': 13,
 'too...whoa!': 14,
 'multimillionaire': 15,
 'shielded': 16,
 'noblesse': 17,
 'simon?)': 18,
 'coast-meets-': 19,
 'process,"': 20,
 'governator': 21,
 'jokes,"': 22,
 'whatsoever...i': 23,
 'rendering,': 24,
 'offers.': 25,
 'zoos': 26,
 'nerf': 27,
 'main-writer': 28,
 "o'daniel.": 29,
 'pouting': 30,
 'al-mutairi"': 31,
 '(shirdi': 32,
 'loooooove': 33,
 '"extras."': 34,
 'complicit': 35,
 'loomis.<br': 36,
 '21.)<br': 37,
 'europe:': 38,
 'impassive': 39,
 '(useless)': 40,
 '"specialagentfoxmulder"': 41,
 'warnercolor': 42,
 '"little': 43,
 'stage).': 44,
 'wisecracks!)': 45,
 'others").': 46,
 'snooze-fests': 47,
 'objective;': 48,
 'hamburger.': 49,
 'preparations.': 50,
 'bookish.<br': 51,
 '"incidents"': 52,
 'co-ordinate': 53,
 '(1992).': 54,
 '/>wrong!': 55

In [None]:

def encode_and_pad(text):
    encoded = [word_to_idx[word] for word in text]
    return encoded + [0] * (max_length - len(encoded))

train_data['review'] = train_data['review'].apply(encode_and_pad)
test_data['review'] = test_data['review'].apply(encode_and_pad)




In [None]:
test_data.head()

Unnamed: 0,review,sentiment
33553,"[301370, 226075, 353859, 284744, 191603, 38463...",1
9427,"[238149, 365115, 334713, 269262, 299038, 31997...",1
199,"[143420, 357903, 224646, 329953, 319977, 16898...",0
12447,"[340320, 285135, 323088, 179240, 156834, 14806...",1
39489,"[208179, 218278, 222653, 267135, 248931, 39553...",0


In [None]:
class SentimentDataset(Dataset):
    def __init__(self, data):
        self.texts = data['review'].values
        self.labels = data['sentiment'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = SentimentDataset(train_data)
test_dataset = SentimentDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

vocab_size = len(vocab) + 1
embed_size = 24
hidden_size = 24
output_size = 2
model = SentimentRNN(vocab_size, embed_size, hidden_size, output_size)

In [None]:
from tqdm.auto import tqdm
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 3
for epoch in tqdm(range(num_epochs)):
    model.train()
    epoch_loss = 0
    for texts, labels in tqdm(train_loader):

        outputs = model(texts)

        loss = criterion(outputs, labels)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for texts, labels in tqdm(test_loader):
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')

  0%|          | 0/1250 [00:00<?, ?it/s]

Accuracy: 50.39%
