# LSTM for Sentiment Classification

In this notebook, we perform sentiment classification using an LSTM model and the `sentiment_classification` dataset.

### Prerequisites:
1. Download the `sentiment_classification` dataset from [Link].
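2. Save the dataset in the following directory structure: `8_Transformer/sentiment_classification/`. The code below reads `data/train.csv`, `data/test.csv`, and the GloVe file `glove.42B.300d.txt` from inside that folder.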

In [None]:
import lightning as L
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


class LSTM_Sentiment_Classification(L.LightningModule):
    """LSTM sequence classifier trained and evaluated with cross-entropy.

    A batch-first LSTM encodes the input sequence, the output at the final
    time step is taken as the sequence summary, and a linear layer maps it to
    one score per class.

    Args:
        embedding_dim: Size of each input feature vector (LSTM ``input_size``).
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of scores produced by the final linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_size, output_size, num_layers):
        super().__init__()

        # batch_first=True: inputs/outputs are laid out [batch, sequence, feature].
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Projects the sequence summary to per-class scores (logits).
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Return class scores of shape [batch_size, output_size] for ``input``."""
        sequence_out, _ = self.lstm(input)

        # Summarize each sequence by the LSTM output at its last position.
        # NOTE(review): if sequences are end-padded, this position is padding
        # for short inputs -- confirm this is intended.
        last_step = sequence_out[:, -1, :]  # [batch_size, hidden_size]

        return self.fc(last_step)

    def configure_optimizers(self):
        """Optimize all parameters with Adam at a learning rate of 1e-3."""
        return Adam(self.parameters(), lr=0.001)

    def training_step(self, batch, batch_idx):
        """Compute, log ("train_loss") and return the cross-entropy loss."""
        features, label_vectors = batch
        logits = self.forward(features)

        # Labels arrive as per-class vectors; cross_entropy is given the
        # index of the largest entry along dim 1 as the target class.
        targets = label_vectors.argmax(dim=1)
        loss = F.cross_entropy(logits, targets)

        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the cross-entropy loss and accuracy for one batch.

        Returns:
            dict with keys ``"test_loss"`` and ``"test_accuracy"``.
        """
        features, label_vectors = batch
        logits = self.forward(features)

        # Same target derivation as in training: argmax over the label vector.
        targets = label_vectors.argmax(dim=1)
        loss = F.cross_entropy(logits, targets)
        self.log("test_loss", loss)

        # Fraction of samples whose highest-scoring class matches the target.
        predicted = logits.argmax(dim=1)
        accuracy = predicted.eq(targets).float().mean()
        self.log("test_accuracy", accuracy)

        return {"test_loss": loss, "test_accuracy": accuracy}


class SentimentDataset(Dataset):
    """Dataset yielding (padded word-embedding sequence, one-hot label) pairs.

    Rows are read from a CSV file with a ``text`` column and a ``sentiment``
    column. Each text is lower-cased, split on whitespace, and every word is
    replaced by its vector from ``glove_embeddings`` (a zero vector when the
    word is unknown). Sequences are truncated or zero-padded at the end to
    exactly ``max_length`` positions.

    Args:
        data_path: Path of the CSV file to load.
        glove_embeddings: Mapping from word to a 1-D tensor of length
            ``embedding_dim``.
        max_length: Number of word positions in every returned sequence.
        embedding_dim: Length of each word vector (defaults to 300).
    """

    # Position of each known sentiment label in the one-hot target vector.
    _LABEL_TO_INDEX = {"negative": 0, "neutral": 1, "positive": 2}

    def __init__(self, data_path, glove_embeddings, max_length=50, embedding_dim=300):
        self.data = pd.read_csv(data_path, encoding="unicode_escape")
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on the `self.data["text"]` selection: that
        # chained in-place call triggers a FutureWarning on pandas 2.x and,
        # with Copy-on-Write enabled, modifies only a temporary so the
        # missing texts would stay NaN.
        self.data["text"] = self.data["text"].fillna("")
        self.texts = self.data.get("text")
        self.labels = self.data.get("sentiment")
        self.glove_embeddings = glove_embeddings
        self.max_length = max_length  # Maximum length for padding
        self.embedding_dim = embedding_dim

    def __len__(self):
        """Return the number of rows loaded from the CSV file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, label)`` for the row at position ``idx``.

        Returns:
            embeddings: float tensor of shape ``[max_length, embedding_dim]``.
            label: float tensor of length 3, one-hot over
                (negative, neutral, positive); all zeros when the label is
                not one of those three strings.
        """
        # Positional access, so lookups do not depend on the frame's index labels.
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        num_classes = len(self._LABEL_TO_INDEX)

        if not isinstance(text, str):
            # Best-effort fallback: report the bad row and return an all-zero sample.
            print(
                f"Warning: Expected string for text at index {idx}, but got {type(text).__name__}: {text}"
            )
            return torch.zeros(self.max_length, self.embedding_dim), torch.zeros(
                num_classes
            )

        label_encoding = torch.zeros(num_classes)
        class_index = self._LABEL_TO_INDEX.get(label)
        if class_index is not None:
            label_encoding[class_index] = 1

        # Start from an all-zero (fully padded) sequence and overwrite the
        # positions of known words. Unknown words and padding stay zero, and
        # words beyond max_length are dropped.
        embeddings_tensor = torch.zeros(self.max_length, self.embedding_dim)
        words = text.lower().split()[: self.max_length]
        for position, word in enumerate(words):
            vector = self.glove_embeddings.get(word)
            if vector is not None:
                embeddings_tensor[position] = vector

        return embeddings_tensor, label_encoding


def load_glove_embeddings(file_path):
    """Load GloVe-style text embeddings into a ``{word: tensor}`` dictionary.

    Every non-blank line is expected to hold a token followed by its vector
    components, separated by whitespace. Blank lines are ignored.

    Args:
        file_path: Path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D float32 ``torch.Tensor``.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. a trailing empty line at the end of the file)
            # has no token; skip it instead of failing on the first field.
            if not values:
                continue
            word, *components = values
            vector = np.array(components, dtype="float32")
            # from_numpy wraps the freshly created array without copying it
            # a second time.
            embeddings[word] = torch.from_numpy(vector)
    return embeddings


# Load the pretrained GloVe word vectors used to embed the text.
glove_embeddings = load_glove_embeddings("sentiment_classification/glove.42B.300d.txt")

# Both splits share the same embedding table.
training_data = SentimentDataset(
    "sentiment_classification/data/train.csv", glove_embeddings
)
test_data = SentimentDataset("sentiment_classification/data/test.csv", glove_embeddings)

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
# Evaluation gains nothing from shuffling, and Lightning warns when a test
# dataloader has it enabled, so the test set is iterated in file order.
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

# embedding_dim matches the 300-dimensional GloVe vectors; output_size=3
# covers the negative / neutral / positive classes.
classifier = LSTM_Sentiment_Classification(
    embedding_dim=300, hidden_size=128, output_size=3, num_layers=1
)
# Explicitly put the module in training mode before fitting.
classifier.train()

# No logger and no checkpoints: metrics are only reported by the Trainer itself.
trainer = L.Trainer(
    max_epochs=20,
    accelerator="auto",
    logger=False,
    enable_checkpointing=False,
)
trainer.fit(classifier, train_dataloader)
trainer.test(classifier, test_dataloader)
