# Implementation of the Advanced Model (Word2Vec Embeddings + LSTM Classifier)

## Step 1: Load the Required Libraries
In this step, we import the necessary libraries, including PyTorch for building the model,
and other utilities for data preprocessing, loading, and splitting.

In [27]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors 

In [28]:
# Select the compute device: CUDA when a GPU is visible, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


## Step 2: Load and Preprocess the Dataset
Here, we load the already-preprocessed train, validation, and test splits from CSV files and separate the input texts from their labels.


In [29]:
# Read the three preprocessed splits (train / validation / test) in one pass
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

In [30]:
# Separate the input texts from their target labels for each split
X_train, y_train = train_data['text'], train_data['label']
X_val, y_val = val_data['text'], val_data['label']
X_test, y_test = test_data['text'], test_data['label']

## Step 3: Load Pre-trained Word2Vec Embeddings
We use pre-trained Word2Vec embeddings to represent words as dense vectors.
These embeddings improve the performance of the model by leveraging semantic relationships between words.

In [32]:
# Read the pre-trained Word2Vec vectors from the binary word2vec-format file
word2vec = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [33]:
# Create a vocabulary
embedding_dim = 300
vocab = {"<PAD>": 0, "<UNK>": 1}  # Special tokens; their ids index the first two embedding rows
# Row 0 (<PAD>) is all zeros. Row 1 (<UNK>) is small uniform noise drawn from a
# seeded generator so the matrix is identical on every Restart & Run All
# (the unseeded global RNG produced a different <UNK> vector on each run).
embedding_matrix = [
    np.zeros(embedding_dim),
    np.random.default_rng(42).uniform(-0.01, 0.01, embedding_dim),
]

In [34]:
# Grow the vocabulary with every training token that has a Word2Vec vector
for text in X_train:
    for word in text.split():
        # Skip tokens already indexed, and tokens Word2Vec has no vector for
        if word in vocab or word not in word2vec:
            continue
        vocab[word] = len(vocab)  # next free id == current vocabulary size
        embedding_matrix.append(word2vec[word])

vocab_size = len(vocab)
# Stack the per-token vectors into a single 2-D array (row i <-> token id i)
embedding_matrix = np.array(embedding_matrix)

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 75781


In [35]:
# Sanity check: matrix dimensions and the vector of the most recently added token
print(embedding_matrix.shape)
print(embedding_matrix[-1])


(75781, 300)
[-0.0703125  -0.05419922  0.41601562  0.11669922  0.20117188 -0.22949219
  0.07275391  0.08154297  0.48828125  0.3359375   0.30859375  0.11376953
  0.31054688  0.17089844  0.16601562 -0.16699219  0.10058594 -0.27539062
 -0.09716797  0.265625    0.03320312  0.32226562 -0.33398438 -0.21777344
 -0.29882812  0.15234375  0.04956055  0.09765625  0.08496094 -0.33398438
  0.23632812  0.20703125 -0.16992188  0.34960938  0.26171875 -0.390625
 -0.16601562 -0.19238281  0.01226807  0.04907227 -0.06982422 -0.35742188
 -0.25195312  0.203125   -0.00415039  0.26367188 -0.08544922 -0.04223633
 -0.39257812  0.22851562  0.04248047  0.39453125 -0.00372314 -0.18066406
 -0.06347656 -0.27539062 -0.15234375 -0.58984375  0.15039062  0.16894531
  0.13183594  0.38671875 -0.53515625 -0.31445312  0.07568359 -0.390625
 -0.24804688  0.31640625 -0.28125     0.34375     0.05810547 -0.02197266
 -0.140625   -0.26757812  0.16992188 -0.12353516  0.16894531  0.16796875
 -0.33203125 -0.12451172  0.13085938  0.09

## Step 4: Tokenize and Pad Sequences
Convert the text into sequences of integers based on the vocabulary.
We also pad sequences to ensure they all have the same length for batch processing.

In [36]:
# Tokenize and convert text to sequences
def text_to_sequence(text, vocab, max_len=200):
    """Convert whitespace-tokenized text into a fixed-length list of token ids.

    Args:
        text: The text to encode. Any non-string value (e.g. ``None`` or the
            NaN that pandas yields for an empty CSV cell) is treated as empty
            text instead of raising ``AttributeError``.
        vocab: Mapping from token to integer id. Must contain ``"<UNK>"``
            (used for tokens missing from the mapping) and ``"<PAD>"``
            (used to fill short sequences).
        max_len: Length of the returned sequence.

    Returns:
        A list of ``max_len`` ids: the first ``max_len`` tokens of ``text``
        mapped through ``vocab``, right-padded with the ``"<PAD>"`` id.
    """
    words = text.split() if isinstance(text, str) else []
    # Truncate before the lookup so tokens beyond max_len are never processed
    sequence = [vocab.get(word, vocab["<UNK>"]) for word in words[:max_len]]
    missing = max_len - len(sequence)
    if missing > 0:
        sequence.extend([vocab["<PAD>"]] * missing)
    return sequence

# Encode every split with the same vocabulary and sequence length
max_len = 200
X_train_seq, X_val_seq, X_test_seq = (
    [text_to_sequence(text, vocab, max_len) for text in split_texts]
    for split_texts in (X_train, X_val, X_test)
)

## Step 5: Create a Dataset and DataLoader
We define a custom Dataset class to handle our data and create DataLoader objects
to efficiently load data during training and validation.

In [37]:
# Custom Dataset Class
class TextDataset(Dataset):
    """Dataset pairing fixed-length token-id sequences with float labels.

    Args:
        texts: Sequence of equal-length lists of integer token ids; stored as
            a ``torch.long`` tensor.
        labels: Array-like of numeric labels (pandas Series, NumPy array, or
            plain list), one per sequence; stored as a ``torch.float32`` tensor.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        # np.asarray accepts a pandas Series (what `.values` used to require)
        # as well as lists and arrays, so callers are not tied to pandas.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        return self.texts[idx], self.labels[idx]

In [38]:
# Wrap each split in a TextDataset and expose it through a batched DataLoader
batch_size = 32

train_dataset = TextDataset(texts=X_train_seq, labels=y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # only training data is shuffled

val_dataset = TextDataset(texts=X_val_seq, labels=y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TextDataset(texts=X_test_seq, labels=y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Step 6: Define the LSTM Model
The model consists of an embedding layer initialized with Word2Vec embeddings,
followed by an LSTM layer, and a fully connected output layer.

In [39]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier on top of a pre-trained embedding table.

    Args:
        embedding_matrix: 2-D array-like of shape ``(vocab_size, embedding_dim)``
            used to initialise the embedding layer. The embeddings are
            fine-tuned during training (``freeze=False``).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super().__init__()
        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        # Take the LSTM input size from the embedding table itself rather than
        # from a notebook-level `embedding_dim` global, so the two can never
        # disagree and the class works on its own.
        self.lstm = nn.LSTM(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token-id sequences to sigmoid outputs.

        Args:
            x: Integer tensor of token ids, batch dimension first.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with values in ``[0, 1]``.
        """
        x = self.embedding(x)
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (hidden, _) = self.lstm(x)
        # hidden[-1] selects the last layer's final state (the LSTM has one layer)
        x = self.fc(hidden[-1])
        return self.sigmoid(x)

In [40]:
# Build the classifier and place its parameters on the selected device
hidden_dim = 128
output_dim = 1  # Binary classification
model = LSTMClassifier(
    embedding_matrix=embedding_matrix,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
model = model.to(device)

In [41]:
# Binary cross-entropy over the model's sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

## Step 7: Train the Model
Train the model for multiple epochs, evaluating accuracy on the validation set after each epoch.

In [42]:
# Training Loop
epochs = 10
for epoch in range(epochs):
    # ---- Optimisation pass over the training batches ----
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        # Drop the trailing output axis so predictions have shape [batch_size]
        predictions = model(texts).squeeze(dim=1)
        loss = criterion(predictions, labels)

        # Clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch training loss for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

    # ---- Validation pass (gradient tracking disabled) ----
    model.eval()
    val_predictions, val_labels = [], []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            predictions = model(texts).squeeze(dim=1)  # shape [batch_size]
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Round the sigmoid outputs to hard 0/1 predictions before scoring
    val_predictions = np.round(val_predictions)
    accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Validation Accuracy: {accuracy:.4f}")

Epoch 1/10, Loss: 0.4622
Validation Accuracy: 0.5391
Epoch 2/10, Loss: 0.2720
Validation Accuracy: 0.9556
Epoch 3/10, Loss: 0.0868
Validation Accuracy: 0.9711
Epoch 4/10, Loss: 0.0569
Validation Accuracy: 0.9773
Epoch 5/10, Loss: 0.0250
Validation Accuracy: 0.9824
Epoch 6/10, Loss: 0.0272
Validation Accuracy: 0.9819
Epoch 7/10, Loss: 0.0085
Validation Accuracy: 0.9835
Epoch 8/10, Loss: 0.0031
Validation Accuracy: 0.9831
Epoch 9/10, Loss: 0.0024
Validation Accuracy: 0.9823
Epoch 10/10, Loss: 0.0034
Validation Accuracy: 0.9843


## Step 8: Test the Model
Evaluate the model's performance on the unseen test set.

In [43]:
# Test the model
model.eval()  # switch the model to evaluation mode
test_predictions, test_labels = [], []
with torch.no_grad():  # no gradients are needed for evaluation
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        # squeeze(1) removes only the output axis. A bare squeeze() would also
        # collapse the batch axis whenever the final batch holds a single
        # sample, producing a 0-d array that extend() below cannot iterate.
        # This matches the training/validation loop.
        predictions = model(texts).squeeze(1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Round the sigmoid outputs to hard 0/1 predictions before scoring
test_predictions = np.round(test_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9864


In [44]:
# Persist the model's state dict (not the whole module object) to disk
torch.save(obj=model.state_dict(), f="lstm_model.pth")