# In [None]:  (cell marker left over from the Jupyter notebook export)
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from sklearn.model_selection import KFold

# Fetch the NLTK resources used below: the English stop-word list and the
# Punkt tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt data from the 'punkt_tab' package instead
# of the pickled 'punkt' one, so fetch both to work across NLTK versions.
nltk.download('punkt_tab')

# Loading the vocabulary
def load_vocabulary(vocab_path):
    """Read a one-token-per-line vocabulary file into a word -> id mapping.

    Ids are assigned in order of first appearance, starting at 0. Each line is
    stripped of surrounding whitespace before use.

    Args:
        vocab_path: Path to a UTF-8 text file with one token per line.

    Returns:
        dict mapping each distinct token to a unique, contiguous integer id.
    """
    vocab = {}
    # Explicit encoding: the platform default may not be able to decode
    # non-ASCII tokens.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            # Keep the first id of a repeated token. Re-assigning len(vocab)
            # would hand the same id to the next new token as well.
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

# Vocabulary file, read one token per line by load_vocabulary
vocab_path = 'glove_model_vocab.txt'
vocab = load_vocabulary(vocab_path)
# Adding the [CLS] token to the vocabulary; it gets the vocabulary size before
# insertion as its id
vocab['[CLS]'] = len(vocab)
# Total number of entries, including [CLS]
vocab_size = len(vocab)

# Loading the embedding matrix
def load_npy_embeddings(embedding_path, vocab):
    """Build an embedding matrix with one row per vocabulary entry.

    Row ``i`` of the saved array is copied to row ``i`` of the result for every
    vocabulary id that exists in the saved array. Ids beyond the saved array
    (for example a token appended to the vocabulary after the embeddings were
    exported) keep an all-zero row.

    Args:
        embedding_path: Path to a ``.npy`` file holding a 2-D array of
            shape (rows, embedding_dim).
        vocab: Mapping of token -> integer id.

    Returns:
        numpy.ndarray of shape (len(vocab), embedding_dim), float64.
    """
    embeddings = np.load(embedding_path)
    # Size the matrix from the argument rather than a module-level global so
    # the function works for any vocabulary passed in.
    num_rows = len(vocab)
    embedding_matrix = np.zeros((num_rows, embeddings.shape[1]))
    row_limit = min(num_rows, embeddings.shape[0])
    shared_ids = np.array(
        [idx for idx in vocab.values() if 0 <= idx < row_limit],
        dtype=np.intp,
    )
    # One vectorized copy instead of a Python-level loop over the vocabulary.
    embedding_matrix[shared_ids] = embeddings[shared_ids]
    return embedding_matrix

# Saved embedding array (.npy); the file name suggests 300-dimensional GloVe
# vectors — TODO confirm
embedding_path = 'glove_model_embeddings_300_d.npy'
# NOTE(review): embedding_matrix is not referenced again in the visible code;
# SentimentAnalysisModel is constructed below without it, so its nn.Embedding
# starts from random weights. Confirm whether that is intended.
embedding_matrix = load_npy_embeddings(embedding_path, vocab)

# Defining the embedding layer class
class EmbeddingLayer(nn.Module):
    """Token-embedding lookup, optionally initialised from pretrained weights.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector.
        pretrained_embeddings: Optional array-like or tensor of weights. When
            given, it replaces the randomly initialised table (converted to
            float32 and copied). When ``None``, the random initialisation of
            ``nn.Embedding`` is kept.
    """

    def __init__(self, vocab_size, embed_size, pretrained_embeddings=None):
        super(EmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The declared default (None) previously crashed in torch.tensor(None).
        if pretrained_embeddings is not None:
            if isinstance(pretrained_embeddings, torch.Tensor):
                # Copy so training never mutates the caller's tensor.
                weights = pretrained_embeddings.detach().clone().to(torch.float32)
            else:
                weights = torch.tensor(pretrained_embeddings, dtype=torch.float32)
            self.embedding.weight = nn.Parameter(weights)

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        return self.embedding(x)

# Defining a class for adding positional encoding to the embeddings
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is stored in ``self.encoding`` with shape
    (max_len, 1, embed_size). It is registered as a non-persistent buffer, so
    it moves with the module on ``.to(device)`` but is not written to
    ``state_dict`` (checkpoint keys are the same as with a plain attribute).

    Args:
        embed_size: Width of the embeddings the encoding is added to.
        max_len: Number of positions precomputed up front. Longer inputs
            trigger a one-off rebuild of the table in ``forward``.
    """

    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.embed_size = embed_size
        self.register_buffer(
            'encoding',
            self.get_positional_encoding(max_len, embed_size),
            persistent=False,
        )

    @staticmethod
    def get_positional_encoding(max_len, embed_size):
        """Return a (max_len, 1, embed_size) table of sin/cos encodings.

        Even columns hold sines and odd columns hold cosines of
        ``position * 10000 ** (-column_pair_index * 2 / embed_size)``.
        """
        encoding = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        angles = position * div_term
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even ones;
        # trim the cosine block so the assignment shapes match.
        encoding[:, 1::2] = torch.cos(angles)[:, :embed_size // 2]
        return encoding.unsqueeze(0).transpose(0, 1)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, seq_len, embed_size): dimension 1 is used
        as the sequence length and the encoding is broadcast over dimension 0.
        """
        seq_len = x.size(1)
        if seq_len > self.encoding.size(0):
            # Input is longer than the precomputed table: rebuild it once.
            self.encoding = self.get_positional_encoding(seq_len, self.embed_size).to(x.device)
        encoding = self.encoding[:seq_len, :].to(x.device)  # Ensure encoding is on the same device as x
        return x + encoding.transpose(0, 1)

# Defining the Encoder block class
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation (normalisation is applied after the residual sum).

    Args:
        embed_size: Width of the input and output features.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout probability, used inside the attention module
            and on both sub-layer outputs.
    """

    def __init__(self, embed_size, num_heads, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_size, num_heads, dropout=dropout_rate)
        widen = nn.Linear(embed_size, ff_dim)
        narrow = nn.Linear(ff_dim, embed_size)
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        self.layernorm1 = nn.LayerNorm(embed_size)
        self.layernorm2 = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        """Run the layer on ``x``; ``mask`` is passed to attention as ``attn_mask``."""
        # Self-attention: x is query, key and value; attention weights are dropped.
        attended = self.attention(x, x, x, attn_mask=mask)[0]
        after_attention = self.layernorm1(x + self.dropout(attended))
        transformed = self.ffn(after_attention)
        return self.layernorm2(after_attention + self.dropout(transformed))

# Defining the Sentiment Analysis model class
class SentimentAnalysisModel(nn.Module):
    """Transformer-encoder classifier over sequences of token ids.

    Pipeline: embedding lookup -> positional encoding -> ``num_layers``
    EncoderBlock layers -> dropout on the representation at sequence
    position 0 -> linear layer producing ``num_classes`` logits.

    Args:
        embed_size: Width of token embeddings and encoder features.
        num_layers: Number of stacked EncoderBlock layers.
        num_heads: Attention heads per encoder layer.
        ff_dim: Hidden width of each layer's feed-forward network.
        vocab_size: Number of rows in the embedding table.
        max_len: Number of positions precomputed by the positional encoding.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability for the encoder layers and the
            classification head.
        pretrained_embeddings: Optional array-like or tensor of shape
            (vocab_size, embed_size) copied into the embedding table. When
            ``None`` (default) the table keeps its random initialisation.

    Raises:
        ValueError: If ``pretrained_embeddings`` does not have shape
            (vocab_size, embed_size).
    """

    def __init__(self, embed_size, num_layers, num_heads, ff_dim, vocab_size, max_len, num_classes, dropout_rate,
                 pretrained_embeddings=None):
        super(SentimentAnalysisModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        if pretrained_embeddings is not None:
            weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
            if weights.shape != self.embedding.weight.shape:
                raise ValueError(
                    f'pretrained_embeddings has shape {tuple(weights.shape)}, '
                    f'expected {(vocab_size, embed_size)}'
                )
            # copy_ keeps the existing Parameter and never aliases the
            # caller's array.
            with torch.no_grad():
                self.embedding.weight.copy_(weights)
        self.positional_encoding = PositionalEncoding(embed_size, max_len)
        self.encoder_layers = nn.ModuleList(
            [EncoderBlock(embed_size, num_heads, ff_dim, dropout_rate) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout_rate)
        self.fc_out = nn.Linear(embed_size, num_classes)

    def forward(self, x, mask):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            x: Integer token ids, indexed as (batch_size, seq_len).
            mask: Forwarded unchanged to every encoder layer (may be ``None``).
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = x.transpose(0, 1)  # Convert to (seq_len, batch_size, embed_size) for attention layers
        for layer in self.encoder_layers:
            x = layer(x, mask)
        x = x.transpose(0, 1)  # Convert back to (batch_size, seq_len, embed_size)
        x = self.dropout(x[:, 0, :])  # Use the encoding of the first token ([CLS] token) for classification
        return self.fc_out(x)

# Data preprocessing
class SentimentPreprocessor:
    """Load a CSV of reviews and turn the 'review' column into token lists.

    Args:
        file_path: Path to a CSV file that contains a 'review' text column.
    """

    def __init__(self, file_path):
        self.data = pd.read_csv(file_path)

    def preprocess(self):
        """Clean and tokenize ``self.data['review']`` in place.

        Per review: lowercase, strip ASCII punctuation, drop English stop
        words, then tokenize with ``word_tokenize``. Missing reviews are
        treated as empty text and yield an empty token list.

        Returns:
            The same DataFrame held in ``self.data``, with 'review' replaced
            by lists of tokens.
        """
        # Build the translation table once instead of once per row.
        strip_punctuation = str.maketrans('', '', string.punctuation)

        stop_words = set(stopwords.words('english'))
        # Punctuation is removed before filtering, so contractions such as
        # "don't" reach the filter as "dont". Add the punctuation-free forms
        # so they are still recognised as stop words.
        stop_words |= {word.translate(strip_punctuation) for word in stop_words}

        def clean(text):
            text = text.lower().translate(strip_punctuation)
            kept = ' '.join(word for word in text.split() if word not in stop_words)
            return word_tokenize(kept)

        # fillna/astype guard against NaN or non-string cells, which would
        # otherwise fail inside str.translate.
        self.data['review'] = self.data['review'].fillna('').astype(str).apply(clean)

        return self.data

def tokenize_and_pad(df, vocab, max_len):
    """Add a 'tokenized' column of fixed-length id lists to ``df``.

    Each review (a list of words in ``df['review']``) becomes
    ``[CLS] + word ids``, truncated to ``max_len`` and right-padded with 0.
    The leading ``[CLS]`` id matters because the classifier reads the encoder
    output at sequence position 0.

    Words missing from ``vocab`` fall back to the ``[CLS]`` id, as there is no
    dedicated unknown-word entry.

    Args:
        df: DataFrame with a 'review' column of token lists; modified in place.
        vocab: Mapping of token -> integer id; must contain '[CLS]'.
        max_len: Exact length of every produced id list.

    Returns:
        The same DataFrame object, with the 'tokenized' column added.

    Raises:
        KeyError: If '[CLS]' is not in ``vocab``.
    """
    cls_id = vocab['[CLS]']

    def tokenize_review(review, vocab):
        return [vocab.get(word, cls_id) for word in review]

    def encode(review):
        ids = ([cls_id] + tokenize_review(review, vocab))[:max_len]
        # A non-positive repeat count yields [], so this also covers
        # sequences that are already max_len long.
        return ids + [0] * (max_len - len(ids))

    df['tokenized'] = df['review'].apply(encode)
    return df

# Read the IMDB CSV and clean/tokenize its 'review' column
preprocessor = SentimentPreprocessor('IMDB Dataset.csv')
df_preprocessed = preprocessor.preprocess()
# Encode the 'sentiment' column as integer class ids in a new 'label' column;
# the fitted encoder is kept so class names can be recovered for reporting
label_encoder = LabelEncoder()
df_preprocessed['label'] = label_encoder.fit_transform(df_preprocessed['sentiment'])
max_len = 250  # Ensure max_len is consistent across preprocessing and model
# Adds the fixed-length 'tokenized' id column; tokenize_and_pad returns the
# DataFrame it was given, so df_tokenized and df_preprocessed are one object
df_tokenized = tokenize_and_pad(df_preprocessed, vocab, max_len)

class IMDBDataset(Dataset):
    """Map-style dataset over a DataFrame with 'tokenized' and 'label' columns.

    Args:
        data: DataFrame whose rows each hold a list of token ids under
            'tokenized' and an integer class id under 'label'.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for positional row ``idx`` as int64 tensors."""
        row = self.data.iloc[idx]
        token_ids = torch.tensor(row['tokenized'], dtype=torch.long)
        target = torch.tensor(row['label'], dtype=torch.long)
        return token_ids, target

# Wrap the tokenized DataFrame so individual (review, label) pairs can be indexed
dataset = IMDBDataset(df_tokenized)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# K-Fold Cross-Validation: shuffled splits with a fixed seed
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Model configuration
embed_size = 300  # width of token embeddings and encoder features
num_layers = 6  # number of stacked EncoderBlock layers
num_heads = 12  # attention heads per encoder layer
ff_dim = 1024  # hidden width of each feed-forward network
vocab_size = len(vocab)  # embedding rows, including the appended [CLS] entry
num_classes = 2  # number of output logits
dropout_rate = 0.2
batch_size = 256
num_epochs = 10  # epochs trained per fold
learning_rate = 3e-5  # initial Adam learning rate

# Initialize lists to store results (one per-epoch history list per fold)
fold_train_losses = []
fold_val_losses = []
fold_val_accuracies = []

# Validation labels and predictions pooled across folds, used for the
# confusion matrix and classification report
all_fold_labels = []
all_fold_preds = []

# Train and evaluate a fresh model on every cross-validation fold.
#
# Per fold: build train/validation loaders, train for num_epochs, record the
# per-epoch mean train loss, mean validation loss and validation accuracy, and
# save the weights of the epoch with the lowest validation loss to
# 'best_model_fold_<n>.pth'. Only that best epoch's validation predictions are
# pooled into all_fold_labels / all_fold_preds, so each validation sample is
# counted once in the final confusion matrix and classification report rather
# than once per epoch.
for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f'Fold {fold + 1}/{k_folds}')

    train_subsampler = Subset(dataset, train_idx)
    val_subsampler = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subsampler, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subsampler, batch_size=batch_size, shuffle=False)

    # Initialize the model, criterion, optimizer, and scheduler
    model = SentimentAnalysisModel(embed_size, num_layers, num_heads, ff_dim, vocab_size, max_len, num_classes, dropout_rate).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # `verbose` is deprecated on recent PyTorch schedulers, so it is not passed;
    # learning-rate reductions are reported explicitly after scheduler.step().
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    # Training and Validation
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    val_accuracies = []
    # Labels/predictions of the best epoch seen so far in this fold
    fold_labels = []
    fold_preds = []
    for epoch in range(num_epochs):
        model.train()
        epoch_train_loss = 0.0
        for reviews, labels in train_loader:
            reviews, labels = reviews.to(device), labels.to(device)
            mask = None  # Assuming no mask for simplicity
            optimizer.zero_grad()
            outputs = model(reviews, mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        train_losses.append(epoch_train_loss / len(train_loader))

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for reviews, labels in val_loader:
                reviews, labels = reviews.to(device), labels.to(device)
                mask = None  # Assuming no mask for simplicity
                outputs = model(reviews, mask)
                val_loss += criterion(outputs, labels).item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                all_labels.extend(labels.cpu().tolist())
                all_preds.extend(predicted.cpu().tolist())

        mean_val_loss = val_loss / len(val_loader)
        val_losses.append(mean_val_loss)
        val_accuracy = 100 * correct / total
        val_accuracies.append(val_accuracy)

        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(mean_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr != previous_lr:
            print(f'Epoch {epoch + 1}: reducing learning rate from {previous_lr:.2e} to {current_lr:.2e}')

        # best_val_loss holds the mean per-batch loss; the batch count is
        # constant within a fold, so the ordering matches comparing sums.
        if mean_val_loss < best_val_loss:
            best_val_loss = mean_val_loss
            torch.save(model.state_dict(), f'best_model_fold_{fold + 1}.pth')
            # Keep predictions consistent with the checkpoint just saved.
            fold_labels = all_labels
            fold_preds = all_preds

    fold_train_losses.append(train_losses)
    fold_val_losses.append(val_losses)
    fold_val_accuracies.append(val_accuracies)

    all_fold_labels.extend(fold_labels)
    all_fold_preds.extend(fold_preds)

def _mean_of_fold_means(per_fold_history):
    """Average each fold's per-epoch values, then average those fold means."""
    return np.mean([np.mean(history) for history in per_fold_history])


# Summary metrics: mean over folds of each fold's mean over epochs
average_train_loss = _mean_of_fold_means(fold_train_losses)
average_val_loss = _mean_of_fold_means(fold_val_losses)
average_val_accuracy = _mean_of_fold_means(fold_val_accuracies)

print(f'Average Train Loss: {average_train_loss:.4f}')
print(f'Average Val Loss: {average_val_loss:.4f}')
print(f'Average Val Accuracy: {average_val_accuracy:.2f}%')

def _plot_epoch_curves(curves, ylabel, title):
    """Draw one figure with a line per (values, label) pair against epochs."""
    plt.figure(figsize=(10, 5))
    for values, label in curves:
        plt.plot(values, label=label)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()


# Loss curves, averaged across folds at each epoch
_plot_epoch_curves(
    [
        (np.mean(fold_train_losses, axis=0), 'Average Training Loss'),
        (np.mean(fold_val_losses, axis=0), 'Average Validation Loss'),
    ],
    'Loss',
    'Training and Validation Loss',
)

# Validation accuracy, averaged across folds at each epoch
_plot_epoch_curves(
    [(np.mean(fold_val_accuracies, axis=0), 'Average Validation Accuracy')],
    'Accuracy',
    'Validation Accuracy',
)

# Confusion Matrix over the validation labels/predictions pooled across folds
conf_matrix = confusion_matrix(all_fold_labels, all_fold_preds)
plt.figure(figsize=(8, 6))
# fmt='d' prints each cell as an integer count
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification Report, labelled with the class names from the fitted LabelEncoder
class_report = classification_report(all_fold_labels, all_fold_preds, target_names=label_encoder.classes_)
print("Classification Report:\n", class_report)
