# **"Step-by-Step Guide to Sentiment Classification with Sentence Embeddings and a PyTorch MLP"**

## 1. Setup and Libraries

In [None]:
%%capture
!pip install pandas textblob nltk scikit-learn torch torchvision torchaudio torchtext datasets tokenizers torchmetrics tensorboard altair wandb spacy torchinfo

In [None]:
# Import all necessary libraries for our task
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import nltk
import re
import string
from nltk.corpus import stopwords
from collections import Counter

# Download the NLTK data packages used for text processing.
# 'stopwords' backs stopwords.words('english') in preprocess_text_st.
# NOTE(review): no tokenizer call is visible in this notebook, so 'punkt' and
# 'punkt_tab' may be unnecessary - confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

## 2. Load and Inspect Data

Load our dataset and select the 'text' and 'sentiment' columns.

In [None]:
# Our data is in a CSV file named 'train.csv' (looked up relative to the working directory).
# We read it with 'ISO-8859-1' encoding, because 'utf-8' cannot decode this file properly.
df = pd.read_csv('train.csv', encoding='ISO-8859-1')

# Keep only the 'text' and 'sentiment' columns, the two columns the rest of the notebook uses
df = df[['text', 'sentiment']]

# Display the first rows as a quick sanity check of the loaded data
df.head()

In [None]:
# It's always good practice to check the distribution of the
# target variable (label) for class imbalance
print(df['sentiment'].value_counts())

# As the printed counts show, there are no major imbalances in our dataset

## 3. Data Preprocessing

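This is a crucial step. We'll perform the following preprocessing steps (the code below applies them in this order: lowercase, URL removal, punctuation removal, stop-word removal):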

   - Lowercase: Convert all text to lowercase.

   - Remove Punctuation: Remove punctuation marks.

   - Remove Stop Words: Remove common words that don't carry much sentiment information (like "the", "a", "is").

   - Remove URLs: Generally a very good practice when preprocessing text data for sentiment analysis, especially for tweets (Noise Reduction, Focus on Textual Content, Vocabulary Size Reduction, Improved Generalization)

In [None]:
# Compiled once instead of on every call. Matches "http..." links and "www." links.
# The dot after "www" is escaped so that ordinary words which merely start with
# "www" (e.g. "wwwow") are not mistaken for URLs and deleted.
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the English stop-word set. It is filled on first use so the NLTK
# corpus is read once rather than once per row.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text_st(text):
    """Normalize one raw text before it is passed to the sentence encoder.

    Steps, in order: lowercase, strip URLs, strip punctuation, drop English
    stop words, and re-join the remaining words with single spaces.

    Args:
        text: The raw text. Any non-string value (e.g. a missing value read
            from the CSV) is accepted.

    Returns:
        The cleaned text, or a single space ``" "`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return " "

    # Lowercase first so that the URL pattern and stop-word lookup are case-insensitive
    text = _URL_PATTERN.sub('', text.lower())

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" is compared as "dont"; stop-word entries that
    # contain an apostrophe presumably never match - confirm this is intended.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every row of 'text'; non-string entries come back as " " from preprocess_text_st
df['processed_text_st'] = df['text'].apply(preprocess_text_st)
df.head()

## 4. Sentiment Encoding

We need to convert the sentiment labels (e.g., 'negative', 'neutral', 'positive') into numerical values that our model can understand. Let's create a mapping:

   - 'negative': 0

   - 'neutral': 1

   - 'positive': 2

In [None]:
# Integer class id for each sentiment label
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment_encoded'] = df['sentiment'].map(sentiment_map)

# Series.map() yields NaN for any label that is missing from sentiment_map, which
# would silently turn this column into floats and only fail much later (at the
# stratified split or inside the loss). Fail fast and name the offending labels.
unmapped_mask = df['sentiment_encoded'].isna()
if unmapped_mask.any():
    unexpected_labels = df.loc[unmapped_mask, 'sentiment'].unique().tolist()
    raise ValueError(f"Sentiment labels missing from sentiment_map: {unexpected_labels}")

# Class ids must stay integers for the classifier's loss
df['sentiment_encoded'] = df['sentiment_encoded'].astype('int64')
df.head()

## 5. Generate Sentence Embeddings using Sentence Transformer

We will use `SentenceTransformer` to encode our processed tweets into sentence embeddings.

In [None]:
# Load the pretrained Sentence Transformer model by name
model_name = 'all-MiniLM-L6-v2'
sentence_model = SentenceTransformer(model_name)

# Encode all processed texts in a single call.
# convert_to_tensor=True makes it return PyTorch tensors directly, which is convenient
sentence_embeddings = sentence_model.encode(df['processed_text_st'].tolist(), convert_to_tensor=True)

# Move the tensor to CPU before splitting it into per-row entries for the DataFrame
# (presumably encode() returns a tensor on the GPU when one is in use - confirm)
sentence_embeddings_cpu = sentence_embeddings.cpu()

# Add embeddings to the DataFrame (optional, but can be useful for inspection).
# list() iterates over the first dimension, so each cell holds one row of the tensor.
df['sentence_embedding'] = list(sentence_embeddings_cpu)

# shape[1] of this tensor is reused later as the classifier's input dimension
print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")
df.head()

## 6. Split Data into Training and Testing Sets (using Embeddings)

We split the data based on the generated sentence embeddings and encoded sentiments.

In [None]:
# Since df['sentence_embedding'] is a list of PyTorch tensors,
# we use torch.stack() to combine them into a single tensor X_st.
# This tensor X_st will be our input features for the model.
X_st = torch.stack(list(df['sentence_embedding']))
# Targets: the integer-encoded sentiment labels (kept as a pandas Series)
y_st = df['sentiment_encoded']

# Hold out 20% for testing. stratify=y_st keeps the class proportions the same in
# both parts, and random_state=42 makes the split reproducible.
X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(X_st, y_st, test_size=0.2, random_state=42, stratify=y_st)

# Report the resulting sizes as a sanity check
print(f"Training embeddings shape: {X_train_st.shape}")
print(f"Testing embeddings shape: {X_test_st.shape}")
print(f"Training samples: {len(X_train_st)}")
print(f"Testing samples: {len(X_test_st)}")

## 7. Create PyTorch Dataset and DataLoader (for Sentence Embeddings)

The Dataset and DataLoader are simplified because we are now dealing with fixed-size sentence embeddings, not sequences. No padding needed!

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing fixed-size sentence embeddings with class labels.

    Args:
        embeddings: Indexable collection of embeddings that supports ``len()``,
            e.g. a tensor whose first dimension is the sample axis.
        labels: Integer class ids, one per embedding. May be a pandas Series,
            a NumPy array, a list or a tensor. A Series is read positionally
            (its index is ignored).

    Raises:
        ValueError: If the labels are not whole, finite numbers, or if their
            count differs from the number of embeddings.
    """

    def __init__(self, embeddings, labels):
        if isinstance(labels, torch.Tensor):
            label_tensor = labels.detach().clone()
        else:
            # Series -> ndarray drops the pandas index, so lookups are purely positional
            label_values = labels.to_numpy() if hasattr(labels, 'to_numpy') else labels
            label_tensor = torch.tensor(label_values)

        # Float labels usually mean NaNs crept in (e.g. from an incomplete label
        # mapping); casting those to integers would silently produce garbage ids.
        if label_tensor.is_floating_point():
            all_whole = bool(torch.isfinite(label_tensor).all()) and torch.equal(
                label_tensor, label_tensor.round()
            )
            if not all_whole:
                raise ValueError("labels must be whole, finite class indices")

        if len(embeddings) != len(label_tensor):
            raise ValueError(
                f"embeddings and labels differ in length: "
                f"{len(embeddings)} != {len(label_tensor)}"
            )

        self.embeddings = embeddings
        # Converted once up front: class-index targets for the loss are int64 tensors,
        # and this avoids a pandas lookup plus tensor construction on every item.
        self.labels = label_tensor.long()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # Returns (embedding, label) where label is a 0-d int64 tensor
        return self.embeddings[idx], self.labels[idx]

In [None]:
# Create datasets from the train/test splits
train_dataset = SentimentDataset(X_train_st, y_train_st)
test_dataset = SentimentDataset(X_test_st, y_test_st)

# Create DataLoaders. Only the training data is shuffled; the test order stays fixed.
batch_size = 64
train_dataloader_st = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader_st = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Example of a batch: print the shapes of the first batch only, then stop
for embedding_batch, label_batch in train_dataloader_st:
    print("Embedding batch shape:", embedding_batch.shape)
    print("Label batch shape:", label_batch.shape)
    break

## 8. Define a Simple Feedforward Neural Network Model (MLP)

   - This is a simple MLP with two fully connected layers, a ReLU activation, and dropout for regularization.

   - embedding_dim: The input dimension is the dimension of the sentence embeddings (e.g., 384 for all-MiniLM-L6-v2).

   - We don't need an embedding layer here because the input is already sentence embeddings.

   - The output layer self.fc2 directly outputs logits (raw scores), and we rely on nn.CrossEntropyLoss to handle the Softmax internally.

In [None]:
class SentimentClassifier(nn.Module):
    """Two-layer feedforward classifier over precomputed sentence embeddings.

    Layout: Linear -> ReLU -> Dropout -> Linear. The last layer returns raw
    logits; no softmax is applied inside the model.
    """

    def __init__(self, embedding_dim, output_dim, hidden_dim, dropout_prob):
        super().__init__()

        # Hidden block: project the embedding, apply the non-linearity, regularize
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_prob)

        # Output layer: one logit per class
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, embedding):
        # embedding: [batch_size, embedding_dim] -> logits: [batch_size, output_dim]
        hidden = self.dropout(self.relu(self.fc1(embedding)))
        return self.fc2(hidden)

# Model parameters
embedding_dim = sentence_embeddings.shape[1]  # input size = width of the sentence embeddings
hidden_dim = 256
output_dim = len(sentiment_map)  # one output logit per sentiment class
dropout_prob = 0.5

model = SentimentClassifier(embedding_dim, output_dim, hidden_dim, dropout_prob)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Print the layer summary as a sanity check
print(model)

## 9. Training the Sentiment Classifier Model

This step focuses on training our feedforward neural network (`SentimentClassifier`) to classify sentiment based on the sentence embeddings generated by the `Sentence Transformer`. We will define the loss function, optimizer, and implement the training and evaluation loops.

**9.1 Loss Function:**

We will use **`nn.CrossEntropyLoss()`** as our loss function. This is a standard loss function for multi-class classification problems. It's well-suited for our task because:

*   Sentiment analysis here is a multi-class problem (negative, neutral, positive). `CrossEntropyLoss` is designed for this scenario.
*   `CrossEntropyLoss` expects the model's output to be raw, unnormalized scores which it then internally converts into probability distributions using a Softmax function. It compares this predicted probability distribution with the true class label (which we have encoded as integers: 0, 1, 2).
*   The loss function quantifies how well our model's predictions match the true sentiment labels. The training process will aim to minimize this loss.

**9.2 Optimizer:**

We will use the **`optim.Adam()`** optimizer. **Adam** is a popular and effective optimization algorithm for training neural networks. **Adam** adapts the learning rate for each parameter individually during training, which often leads to faster convergence and better performance compared to optimizers with a fixed learning rate (like Stochastic Gradient Descent, SGD, with a constant learning rate).

**9.3 Training Function (`train`):**

The `train` function performs one epoch of training on the training dataset. Let's break down its steps:

1.  **`model.train()`:**  Sets the model to training mode. This is important because some layers (like Dropout and BatchNorm, if used) behave differently during training and evaluation.
2.  **`epoch_loss = 0`, `epoch_acc = 0`:** Initializes variables to accumulate the loss and accuracy for the entire epoch.
3.  **`for batch_idx, (embedding_batch, label_batch) in enumerate(dataloader):`:** Iterates through the training data loader, which provides batches of sentence embeddings and corresponding sentiment labels.
4.  **`embedding_batch = embedding_batch.to(device)`, `label_batch = label_batch.to(device)`:** Moves the input batch of embeddings and labels to the specified device (GPU if available, otherwise CPU). This ensures that computations are performed on the correct device.
5.  **`optimizer.zero_grad()`:** Clears the gradients from the previous batch. Gradients accumulate by default in PyTorch, so we need to reset them before each new batch.
6.  **`predictions = model(embedding_batch)`:** Performs the forward pass. The `embedding_batch` is fed into the `model`, and it outputs the predicted logits for each sentiment class.
7.  **`loss = criterion(predictions, label_batch)`:** Calculates the loss by comparing the model's `predictions` with the `label_batch` using the `criterion` (CrossEntropyLoss).
8.  **`acc = calculate_accuracy(predictions, label_batch)`:** Calculates the accuracy of the predictions for the current batch using the `calculate_accuracy` helper function, which is defined in the code cell below, just before `train`.
9.  **`loss.backward()`:** Performs the backward pass (backpropagation). This calculates the gradients of the loss with respect to the model's parameters.
10. **`optimizer.step()`:** Updates the model's parameters using the calculated gradients and the optimization algorithm (Adam). This is where the model learns from the data.
11. **`epoch_loss += loss.item()`, `epoch_acc += acc.item()`:** Accumulates the loss and accuracy for the current batch to calculate the average loss and accuracy for the entire epoch later.
12. **`return epoch_loss / len(dataloader), epoch_acc / len(dataloader)`:** After processing all batches in the epoch, it returns the average training loss and average training accuracy for the epoch.

**9.4 Evaluation Function (`evaluate`):**

The `evaluate` function evaluates the model's performance on a given dataset (typically the test dataset) *without* updating the model's parameters.  It's crucial to evaluate on a separate dataset to assess how well the model generalizes to unseen data.

1.  **`model.eval()`:** Sets the model to evaluation mode. This ensures that layers like Dropout and BatchNorm behave in evaluation mode (e.g., Dropout is turned off).
2.  **`epoch_loss = 0`, `epoch_acc = 0`:** Initializes variables to accumulate loss and accuracy for the evaluation dataset.
3.  **`with torch.no_grad():`:**  This context manager is important during evaluation. It disables gradient calculations. This is because we don't need to compute gradients during evaluation, which saves memory and computation time.
4.  **The rest of the steps within the `with torch.no_grad():` block are very similar to the training function**, except:
    *   **`optimizer.zero_grad()`, `loss.backward()`, `optimizer.step()` are *not* present.**  We are only performing a forward pass to get predictions and calculate metrics, not updating the model's weights.
5.  **`return epoch_loss / len(dataloader), epoch_acc / len(dataloader)`:** Returns the average evaluation loss and accuracy.

**9.5 Training Loop:**

The main training loop orchestrates the training process over multiple epochs:

1.  **`for epoch in range(num_epochs):`:** Iterates for a predefined number of epochs. An epoch is one complete pass through the entire training dataset.
2.  **`train_loss, train_acc = train(...)`:** Calls the `train` function to train the model for one epoch on the training data and get the training loss and accuracy.
3.  **`test_loss, test_acc = evaluate(...)`:** Calls the `evaluate` function to evaluate the model on the test data and get the test loss and accuracy.
4.  **`print(f'Epoch: ...')`:** Prints the epoch number, training loss, training accuracy, test loss, and test accuracy for each epoch. This allows us to monitor the training progress and observe how the model is learning and generalizing.

By running this training loop, we iteratively refine our `SentimentClassifier` model to become better at classifying the sentiment of tweets based on their sentence embeddings. We will monitor the training and test set performance to decide when to stop training and to evaluate the final model's effectiveness.

In [None]:
# Hyperparameters
criterion = nn.CrossEntropyLoss()  # fed the model's raw logits and the integer class labels
lr = 0.001  # learning rate passed to Adam
optimizer = optim.Adam(model.parameters(), lr=lr)
num_epochs = 100  # number of full passes over the training data

def calculate_accuracy(predictions, labels):
    """Return the fraction of correct predictions in one batch as a 0-d tensor.

    Args:
        predictions: Per-class scores of shape [batch_size, num_classes].
        labels: True class indices of shape [batch_size].
    """
    # The index of the largest score along the class dimension is the predicted class
    _, predicted_classes = torch.max(predictions, 1)
    correct_predictions = (predicted_classes == labels).sum().item()
    accuracy = correct_predictions / len(labels)
    return torch.tensor(accuracy)


def train(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one epoch and return ``(mean_loss, mean_accuracy)``.

    Both values are averaged over samples rather than over batches, so a
    smaller final batch is not given the same weight as a full one.

    Args:
        model: The network to train; it is switched to training mode.
        dataloader: Iterable of ``(embedding_batch, label_batch)`` tensor pairs.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device the batches are moved to before the forward pass.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    # With the default reduction the criterion returns a per-batch mean, which
    # has to be scaled back by the batch size; with 'sum' it is already a total.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') != 'sum'
    epoch_loss = 0.0
    epoch_acc = 0.0
    num_samples = 0

    for batch_idx, (embedding_batch, label_batch) in enumerate(dataloader):
        embedding_batch = embedding_batch.to(device)
        label_batch = label_batch.to(device)
        batch_size = label_batch.size(0)

        optimizer.zero_grad()
        predictions = model(embedding_batch)  # Pass embeddings
        loss = criterion(predictions, label_batch)
        acc = calculate_accuracy(predictions, label_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * (batch_size if loss_is_batch_mean else 1)
        epoch_acc += acc.item() * batch_size
        num_samples += batch_size

    if num_samples == 0:
        raise ValueError("dataloader yielded no samples")
    return epoch_loss / num_samples, epoch_acc / num_samples


def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` without updating it and return ``(mean_loss, mean_accuracy)``.

    Both values are averaged over samples rather than over batches, so a
    smaller final batch is not given the same weight as a full one.

    Args:
        model: The network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable of ``(embedding_batch, label_batch)`` tensor pairs.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device the batches are moved to before the forward pass.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    # See train(): a per-batch mean loss must be re-weighted by the batch size
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') != 'sum'
    epoch_loss = 0.0
    epoch_acc = 0.0
    num_samples = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch_idx, (embedding_batch, label_batch) in enumerate(dataloader):
            embedding_batch = embedding_batch.to(device)
            label_batch = label_batch.to(device)
            batch_size = label_batch.size(0)

            predictions = model(embedding_batch)
            loss = criterion(predictions, label_batch)
            acc = calculate_accuracy(predictions, label_batch)

            epoch_loss += loss.item() * (batch_size if loss_is_batch_mean else 1)
            epoch_acc += acc.item() * batch_size
            num_samples += batch_size

    if num_samples == 0:
        raise ValueError("dataloader yielded no samples")
    return epoch_loss / num_samples, epoch_acc / num_samples


# Training loop: each epoch runs one training pass, then measures loss and
# accuracy on the test split without updating the model.
# NOTE(review): the test split is evaluated every epoch; if these numbers are
# used to decide when to stop, consider holding out a separate validation split.
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_dataloader_st, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_dataloader_st, criterion, device)

    # Accuracies are fractions in [0, 1]; they are printed as percentages
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc*100:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc*100:.2f}%')

## 10. Evaluate on Test Set

In [None]:
# Final evaluation on the test split, using the same `criterion` object that the
# training loop uses (no `criterion_st` is defined anywhere in this notebook).
test_loss, test_acc = evaluate(model, test_dataloader_st, criterion, device)
print(f'Final Test Loss: {test_loss:.4f}, Test Acc: {test_acc*100:.2f}%')

## 11. Make Predictions on New Text

In [None]:
def predict_sentiment(model, sentence_transformer_model, sentence, device, sentiment_map_inv):
    """Predict the sentiment label of a single raw sentence.

    Args:
        model: Trained classifier that maps sentence embeddings to class logits.
            It is switched to evaluation mode and left in that mode.
        sentence_transformer_model: Encoder exposing
            ``encode(list_of_texts, convert_to_tensor=True)``.
        sentence: The raw input text.
        device: Device on which ``model`` lives; the embedding is moved there.
        sentiment_map_inv: Mapping from class index to sentiment label.

    Returns:
        The entry of ``sentiment_map_inv`` for the highest-scoring class.
    """
    model.eval()

    # Apply the same preprocessing that was used on the training texts
    processed_sentence = preprocess_text_st(sentence)

    # Inference only: disable gradient tracking so that no autograd graph is built
    with torch.no_grad():
        # Encode the sentence as a batch of one
        embedding = sentence_transformer_model.encode([processed_sentence], convert_to_tensor=True).to(device)
        prediction = model(embedding)

    _, predicted_class = torch.max(prediction, 1)
    predicted_sentiment_index = predicted_class.item()
    return sentiment_map_inv[predicted_sentiment_index]


# Invert sentiment_map so that a predicted class index can be turned back into its label
sentiment_map_inv = {v: k for k, v in sentiment_map.items()}

# Test for a positive tweet
positive_tweet = "This movie was absolutely fantastic! I loved it."

predicted_sentiment = predict_sentiment(model, sentence_model, positive_tweet, device, sentiment_map_inv)
print(f"Predicted sentiment for: '{positive_tweet}' is: {predicted_sentiment}")

# Test for a negative tweet
tweet_negative = "I'm feeling really down and upset today."

predicted_sentiment_neg = predict_sentiment(model, sentence_model, tweet_negative, device, sentiment_map_inv)
print(f"Predicted sentiment for: '{tweet_negative}' is: {predicted_sentiment_neg}")

# Test for a neutral tweet
tweet_neutral = "I'm feeling normal today."

predicted_sentiment_neu = predict_sentiment(model, sentence_model, tweet_neutral, device, sentiment_map_inv)
print(f"Predicted sentiment for: '{tweet_neutral}' is: {predicted_sentiment_neu}")