# 1. Sample Data & Preprocessing

In [None]:
# Toy dataset: each review is written next to its sentiment label
# (0 = negative, 1 = positive) and the pairs are then transposed into
# two parallel lists, `sample_reviews` and `sample_labels`.
sample_reviews, sample_labels = map(list, zip(
    ("I loved the movie", 1),
    ("It was a terrible film", 0),
    ("Amazing acting and story", 1),
    ("Worst movie ever", 0),
    ("Best film I have seen", 1),
    ("Not good", 0),
    ("Loved it", 1),
    ("I hated it", 0),
))

# Basic text preprocessing
import string

def clean_text(text):
    """Return `text` lowercased with every `string.punctuation` character removed.

    Punctuation is deleted, not replaced by a space, so "it's" becomes "its".
    """
    # A translation table whose third argument lists the characters to delete.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(drop_punctuation)

# Normalize every sample review with clean_text, keeping the original order.
cleaned_reviews = list(map(clean_text, sample_reviews))
print(cleaned_reviews)

['i loved the movie', 'it was a terrible film', 'amazing acting and story', 'worst movie ever', 'best film i have seen', 'not good', 'loved it', 'i hated it']


# 2. Tokenize and Create Vocab


In [None]:
from collections import Counter

# Tokenize: split each cleaned review on whitespace and pool all the words.
all_words = [word for review in cleaned_reviews for word in review.split()]
counts = Counter(all_words)

# Vocabulary ordered from most to least frequent; words with equal counts
# stay in the order they were first seen.
vocab = [word for word, _ in counts.most_common()]

# Ids start at 1 so that 0 stays free (it is used as the padding value later).
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
print(vocab_to_int)

# Encode every review as the list of ids of its words.
reviews_int = [
    list(map(vocab_to_int.__getitem__, review.split()))
    for review in cleaned_reviews
]
print(reviews_int)

{'i': 1, 'it': 2, 'loved': 3, 'movie': 4, 'film': 5, 'the': 6, 'was': 7, 'a': 8, 'terrible': 9, 'amazing': 10, 'acting': 11, 'and': 12, 'story': 13, 'worst': 14, 'ever': 15, 'best': 16, 'have': 17, 'seen': 18, 'not': 19, 'good': 20, 'hated': 21}
[[1, 3, 6, 4], [2, 7, 8, 9, 5], [10, 11, 12, 13], [14, 4, 15], [16, 5, 1, 17, 18], [19, 20], [3, 2], [1, 21, 2]]


# 3. Pad Sequences

In [None]:
import numpy as np

def pad_features(reviews, seq_length):
    """Left-pad (or truncate) integer sequences to a fixed length.

    Args:
        reviews: sequence of token-id sequences, one per review.
        seq_length: number of columns in the returned array.

    Returns:
        An integer array of shape (len(reviews), seq_length). Sequences
        shorter than `seq_length` are right-aligned with zeros on the left;
        longer ones keep only their first `seq_length` ids. An empty
        sequence yields a row of zeros.
    """
    features = np.zeros((len(reviews), seq_length), dtype=int)
    for i, row in enumerate(reviews):
        kept = np.asarray(row)[:seq_length]
        # Skip empty rows: a `-0:` slice would select the whole row and the
        # assignment of an empty array would fail to broadcast.
        if kept.size:
            features[i, seq_length - kept.size:] = kept
    return features

# Every review is padded to this many token ids.
seq_length = 10
features = pad_features(reviews=reviews_int, seq_length=seq_length)
print(features)

# Labels as a numpy array, one entry per row of `features`.
labels = np.asarray(sample_labels)
print(labels)

[[ 0  0  0  0  0  0  1  3  6  4]
 [ 0  0  0  0  0  2  7  8  9  5]
 [ 0  0  0  0  0  0 10 11 12 13]
 [ 0  0  0  0  0  0  0 14  4 15]
 [ 0  0  0  0  0 16  5  1 17 18]
 [ 0  0  0  0  0  0  0  0 19 20]
 [ 0  0  0  0  0  0  0  0  3  2]
 [ 0  0  0  0  0  0  0  1 21  2]]
[1 0 1 0 1 0 1 0]


# 4. Create DataLoaders

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 2

# Token ids become int64 tensors (the embedding layer indexes with them);
# labels become float32 so they can be compared with the model's sigmoid output.
feature_tensors = torch.from_numpy(features).to(torch.int64)
label_tensors = torch.from_numpy(labels).to(torch.float32)

# Pair each padded review with its label and serve them in shuffled mini-batches.
dataset = TensorDataset(feature_tensors, label_tensors)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# 5. Build Simple RNN Model

In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of real tokens; ids 1..vocab_size are words and
            id 0 is reserved for padding.
        embed_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of output units per example.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # +1 row for the padding id 0. padding_idx=0 keeps that row at zero
        # and excludes it from gradient updates, so padding carries no
        # learned signal.
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) values in [0, 1]."""
        embedded = self.embedding(x)
        out, _ = self.rnn(embedded)
        # Classify from the RNN output at the last timestep only.
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)


# 6. Train the Model

In [None]:
# Hyperparameters
vocab_size = len(vocab_to_int)
embed_dim = 16
hidden_dim = 32
output_dim = 1

# Seed PyTorch's RNG so weight initialization and batch shuffling repeat
# every time this cell is re-run.
torch.manual_seed(42)

model = SentimentRNN(vocab_size, embed_dim, hidden_dim, output_dim)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    # Make sure the model is in training mode even if it was switched to
    # eval mode earlier (e.g. by a prediction cell).
    model.train()
    # The loop variable is deliberately not called `labels`, so the numpy
    # `labels` array used to build the DataLoader is not overwritten.
    for inputs, batch_labels in train_loader:
        # squeeze(-1) drops only the output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension for a batch
        # of one and no longer match the label shape.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/5, Loss: 0.6120
Epoch 2/5, Loss: 0.6243
Epoch 3/5, Loss: 0.6885
Epoch 4/5, Loss: 0.5328
Epoch 5/5, Loss: 0.7255


# 7. Try a Prediction

In [None]:
def predict_sentiment(model, sentence):
    """Classify `sentence` as "Positive" or "Negative" using `model`.

    The sentence is cleaned with `clean_text`, split on whitespace, encoded
    with `vocab_to_int` (words missing from the vocabulary become 0, the
    same value used for padding) and padded to `seq_length`. The model is
    switched to eval mode and stays in it after the call.

    Returns:
        "Positive" when the model output is at least 0.5, otherwise "Negative".
    """
    model.eval()
    words = clean_text(sentence).split()

    token_ids = []
    for word in words:
        token_ids.append(vocab_to_int.get(word, 0))

    # A batch containing just this one sentence.
    batch = torch.from_numpy(pad_features([token_ids], seq_length)).long()

    with torch.no_grad():
        probability = model(batch).item()

    if probability >= 0.5:
        return "Positive"
    return "Negative"

# Print one prediction per line for two example sentences.
print(
    *(
        predict_sentiment(model, example)
        for example in ("I absolutely loved it", "This was awful")
    ),
    sep="\n",
)


Positive
Positive
