# LSTM for Text Classification with Attention
In this notebook, we will implement an LSTM model with attention for text classification using PyTorch.

## Step 1: Import Libraries

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors

In [None]:
# Seed the NumPy and PyTorch global RNGs so that the random <UNK> vector,
# the model's weight initialisation and the shuffled training batches are
# repeatable after Restart Kernel -> Run All.
# NOTE: seeding alone does not guarantee bit-exact results on GPU.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check if GPU is available and fall back to the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## Step 2: Load and Preprocess the Dataset
Here, we load the preprocessed training, validation, and test splits from CSV files.

In [None]:
# Load the preprocessed data from CSV files
train_data = pd.read_csv("train.csv")
val_data = pd.read_csv("val.csv")
test_data = pd.read_csv("test.csv")

# read_csv parses empty cells as NaN (a float). Later cells call text.split()
# on every row, which would raise AttributeError on such a value, so missing
# texts are replaced with an empty string (it tokenizes to an all-<PAD> sequence).
X_train = train_data['text'].fillna("")
y_train = train_data['label']

X_val = val_data['text'].fillna("")
y_val = val_data['label']

X_test = test_data['text'].fillna("")
y_test = test_data['label']

## Step 3: Load Pre-trained Word2Vec Embeddings
We use pre-trained Word2Vec embeddings to represent words as dense vectors. These embeddings improve the performance of the model by leveraging semantic relationships between words.

In [None]:
# Read the pre-trained Word2Vec vectors from the binary-format archive
word2vec_path = "GoogleNews-vectors-negative300.bin.gz"
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Start the vocabulary with the two special tokens and their embedding rows
embedding_dim = 300
vocab = {"<PAD>": 0, "<UNK>": 1}

pad_vector = np.zeros(embedding_dim)  # row 0: <PAD> is all zeros
unk_vector = np.random.uniform(-0.01, 0.01, embedding_dim)  # row 1: <UNK> is small random noise
embedding_matrix = [pad_vector, unk_vector]

In [None]:
# Extend the vocabulary with every training-set word that Word2Vec knows.
# Row i of embedding_matrix is the vector of the word whose id is i.
for text in X_train:
    for token in text.split():
        # Skip words that already have an id or have no pre-trained vector
        if token in vocab or token not in word2vec:
            continue
        vocab[token] = len(vocab)
        embedding_matrix.append(word2vec[token])

# Stack the collected rows into a single array
embedding_matrix = np.array(embedding_matrix)
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")

## Step 4: Tokenize and Pad Sequences
Convert the text into sequences of integers based on the vocabulary. We also pad (or truncate) sequences to a fixed length so that they all have the same length for batch processing.


In [None]:
# Tokenize and convert text to sequences
def text_to_sequence(text, vocab, max_len=200):
    """Turn a whitespace-tokenized text into a fixed-length list of word ids.

    Args:
        text: String to encode; it is split on whitespace.
        vocab: Mapping from word to integer id. Must contain "<UNK>" (used for
            words missing from the mapping) and "<PAD>" (used as filler).
        max_len: Length of the returned list.

    Returns:
        A list of ids, right-padded with the "<PAD>" id when the text is
        shorter than ``max_len`` and cut off after ``max_len`` ids otherwise.
    """
    token_ids = []
    for token in text.split():
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(token_ids)
    if missing > 0:
        token_ids += [vocab["<PAD>"]] * missing

    return token_ids[:max_len]

In [None]:
# Encode every split with the same vocabulary and sequence length
max_len = 200
X_train_seq, X_val_seq, X_test_seq = (
    [text_to_sequence(text, vocab, max_len) for text in split]
    for split in (X_train, X_val, X_test)
)


## Step 5: Create a Dataset and DataLoader
We define a custom Dataset class to handle our data and create DataLoader objects to efficiently load data during training and validation.

In [None]:
# Custom Dataset Class
class TextDataset(Dataset):
    """Dataset pairing encoded texts with their labels.

    Args:
        texts: Equal-length sequences of word ids (e.g. a list of lists);
            stored as a ``torch.long`` tensor.
        labels: One label per text. May be a pandas Series, a NumPy array or a
            plain list; stored as a ``torch.float32`` tensor.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        # np.asarray accepts a pandas Series as well as lists and arrays,
        # so callers are not forced to pass a pandas object.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)

        # Fail early instead of silently dropping samples or raising an
        # IndexError in the middle of an epoch.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(text_ids, label)`` pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]


In [None]:
# Wrap each split in a TextDataset and expose it through a DataLoader
batch_size = 32

train_dataset = TextDataset(texts=X_train_seq, labels=y_train)
val_dataset = TextDataset(texts=X_val_seq, labels=y_val)
test_dataset = TextDataset(texts=X_test_seq, labels=y_test)

# Only the training batches are shuffled; validation and test keep file order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

## Step 6: Define the LSTM Model
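The model consists of an embedding layer initialized with Word2Vec embeddings, followed by a bidirectional LSTM layer, an optional attention layer that pools the LSTM outputs, and two fully connected layers (with dropout) that produce the sigmoid output.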


In [None]:
# Define LSTM Classifier
class AdvancedLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier with optional attention pooling.

    Pipeline: embedding -> bidirectional LSTM -> sequence summary ->
    dropout -> dense + ReLU -> dropout -> dense -> sigmoid.

    The sequence summary is either an attention-weighted sum of the LSTM
    outputs (``attention=True``) or the concatenation of the final hidden
    states of both LSTM directions (``attention=False``).

    Args:
        embedding_matrix: 2-D NumPy array of pre-trained vectors, one row per
            vocabulary id. The embeddings are fine-tuned (``freeze=False``).
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output units.
        attention: Whether to pool the LSTM outputs with attention.
        padding_idx: Token id used for padding. Positions holding this id are
            excluded from the attention weights. Pass ``None`` to disable
            masking.
    """

    def __init__(self, embedding_matrix, hidden_dim, output_dim, attention=True, padding_idx=0):
        super(AdvancedLSTMClassifier, self).__init__()
        # Embedding layer initialized with pre-trained Word2Vec embeddings
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False
        )

        # Bidirectional LSTM
        self.lstm = nn.LSTM(input_size=embedding_matrix.shape[1], hidden_size=hidden_dim,
                            batch_first=True, bidirectional=True)

        # Attention mechanism (optional)
        self.attention = attention
        self.padding_idx = padding_idx
        if attention:
            self.attention_weights = nn.Linear(2 * hidden_dim, 1)  # Compute attention scores

        # Dropout for regularization
        self.dropout = nn.Dropout(0.5)

        # Fully connected layers
        self.fc1 = nn.Linear(2 * hidden_dim, 128)  # First dense layer
        self.fc2 = nn.Linear(128, output_dim)  # Second dense layer
        self.sigmoid = nn.Sigmoid()  # Output layer for binary classification

    def forward(self, x):
        """Compute sigmoid outputs for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, output_dim] with values in (0, 1).
        """
        # Step 1: Embedding layer
        embedded = self.embedding(x)

        # Step 2: Bidirectional LSTM
        # lstm_out: [batch_size, seq_len, 2 * hidden_dim]
        # h_n:      [2, batch_size, hidden_dim] (final state of each direction)
        lstm_out, (h_n, _) = self.lstm(embedded)

        # Step 3: Summarize the sequence
        if self.attention:
            attention_scores = self.attention_weights(lstm_out).squeeze(-1)  # [batch_size, seq_len]
            if self.padding_idx is not None:
                # Keep padding out of the softmax. A large finite value is
                # used instead of -inf so that an all-padding row produces
                # uniform weights rather than NaN.
                attention_scores = attention_scores.masked_fill(
                    x == self.padding_idx, torch.finfo(attention_scores.dtype).min
                )
            attention_weights = torch.softmax(attention_scores, dim=1)  # Normalize scores
            features = torch.bmm(attention_weights.unsqueeze(1), lstm_out).squeeze(1)  # Weighted sum
        else:
            # lstm_out[:, -1, :] would pair the forward direction's final state
            # with the backward direction's state after a single step. h_n holds
            # the true final state of each direction.
            features = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # Step 4: Fully connected layers
        features = self.dropout(features)  # Apply dropout
        features = torch.relu(self.fc1(features))  # First dense layer with ReLU activation
        features = self.dropout(features)  # Apply dropout
        logits = self.fc2(features)  # Second dense layer

        # Step 5: Output layer
        return self.sigmoid(logits)


In [None]:
# Model hyperparameters: LSTM hidden size per direction and number of outputs
hidden_dim = 128
output_dim = 1

# Build the attention-enabled classifier, then move it to the selected device
model = AdvancedLSTMClassifier(embedding_matrix, hidden_dim, output_dim, attention=True)
model = model.to(device)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimized with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

## Step 7: Train the Model
Train the model for multiple epochs and validate its performance on the validation set.

In [None]:
    
# Training Loop: one pass over train_loader per epoch, reporting the mean batch loss
epochs = 10
for epoch in range(epochs):
    model.train()  # training mode (dropout active)
    total_loss = 0

    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        # Forward pass; drop the output dimension so predictions are [batch_size]
        predictions = model(texts).squeeze(dim=1)
        loss = criterion(predictions, labels)

        # Backward pass: clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss of this batch
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")


In [None]:
    # Validation step: score the validation set without tracking gradients
    model.eval()
    val_predictions, val_labels = [], []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            # Drop the output dimension so predictions have shape [batch_size]
            predictions = model(texts).squeeze(dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Round the probabilities to hard 0/1 predictions before scoring
    val_predictions = np.round(val_predictions)
    accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Validation Accuracy: {accuracy:.4f}")

## Step 8: Test the Model
Evaluate the model's performance on the unseen test set.

In [None]:
# Test the model
model.eval()  # evaluation mode (dropout disabled)
with torch.no_grad():
    test_predictions, test_labels = [], []
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        # squeeze(1) removes only the output dimension. A bare squeeze() would
        # also collapse a final batch of size 1 into a 0-d tensor, and
        # list.extend() cannot iterate over the resulting 0-d array.
        predictions = model(texts).squeeze(1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Round the probabilities to hard 0/1 predictions before scoring
test_predictions = np.round(test_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision = precision_score(test_labels, test_predictions)
test_recall = recall_score(test_labels, test_predictions)
test_f1 = f1_score(test_labels, test_predictions)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")


## Step 9: Save the Model

In [None]:
# Save only the model's state dict
model_state = model.state_dict()
torch.save(model_state, "lstm_model_v1.pth")
