# **IMPORTS**

In [None]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from collections import Counter
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

def set_seed(seed=42):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to ``np.random.seed`` and ``torch.manual_seed``,
            and to ``torch.cuda.manual_seed_all`` when CUDA is available.
    """
    for seed_fn in (np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the RNGs and report which device will be used.
# NOTE(review): `Config` is defined in a later cell of this notebook (the
# "Configuration Class" section). That cell must be executed first, otherwise
# these two lines raise NameError.
set_seed(Config.SEED)
print(f"Using device: {Config.DEVICE}")

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
# Later cells read the 'text' column from both frames, the emotion label
# columns from train_df, and (if present) the 'id' column from test_df.
train_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/train.csv')
test_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/test.csv')

# **Configuration Class**

This block defines a Config class containing all model, training, and augmentation hyperparameters for easy management.

In [None]:
class Config:
    """Central collection of hyperparameters and runtime settings.

    Every value is a class attribute and is read as ``Config.NAME``; the class
    is never instantiated in this notebook.
    """

    # Label columns read from the training dataframe (one output unit each).
    EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']

    # Model architecture
    VOCAB_SIZE = 30000
    EMBEDDING_DIM = 400
    HIDDEN_DIM = 512
    NUM_LAYERS = 3
    # Derived from the label list so the two can never drift apart.
    NUM_LABELS = len(EMOTION_COLS)
    DROPOUT = 0.4
    BIDIRECTIONAL = True

    # Training settings
    BATCH_SIZE = 32
    EPOCHS = 20
    LEARNING_RATE = 0.0015
    MAX_LENGTH = 150
    WEIGHT_DECAY = 1e-5

    # Augmentation
    USE_AUGMENTATION = True
    AUG_PROBABILITY = 0.3

    SEED = 42
    # Prefer the GPU when one is visible to PyTorch.
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Cell-local imports: neither module is imported at the top of the notebook.
import os

import wandb

# SECURITY: an API key was previously hard-coded on this line. Credentials
# must never live in source; the key is now read from the WANDB_API_KEY
# environment variable. The key that was committed should be revoked.
# NOTE(review): when the variable is unset, key=None is passed and wandb is
# expected to fall back to its own credential lookup / prompt — confirm.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)

# Start the run and record every hyperparameter from Config alongside it.
wandb.init(
    project="23f1001420-t32025",
    name="scratch_emotion_classifier",
    config={
        "vocab_size": Config.VOCAB_SIZE,
        "embedding_dim": Config.EMBEDDING_DIM,
        "hidden_dim": Config.HIDDEN_DIM,
        "num_layers": Config.NUM_LAYERS,
        "num_labels": Config.NUM_LABELS,
        "dropout": Config.DROPOUT,
        "bidirectional": Config.BIDIRECTIONAL,

        "batch_size": Config.BATCH_SIZE,
        "epochs": Config.EPOCHS,
        "learning_rate": Config.LEARNING_RATE,
        "max_length": Config.MAX_LENGTH,
        "weight_decay": Config.WEIGHT_DECAY,

        "augmentation": Config.USE_AUGMENTATION,
        "aug_prob": Config.AUG_PROBABILITY,

        "seed": Config.SEED,
    }
)

config = wandb.config


# **TEXT PREPROCESSING WITH AUGMENTATION**

In [None]:
class EnhancedTextPreprocessor:
    """Clean, tokenize and index raw text.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``; all other
    words are appended to the vocabulary by :meth:`build_vocab`.
    """

    def __init__(self):
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.vocab_size = 2

        # Per-emotion keyword lists. No method of this class reads them.
        self.emotion_keywords = {
            'anger': ['angry', 'mad', 'furious', 'annoyed', 'irritated', 'rage'],
            'fear': ['scared', 'afraid', 'terrified', 'anxious', 'worried', 'nervous'],
            'joy': ['happy', 'joyful', 'excited', 'glad', 'delighted', 'cheerful'],
            'sadness': ['sad', 'depressed', 'unhappy', 'miserable', 'heartbroken', 'crying'],
            'surprise': ['surprised', 'shocked', 'amazed', 'astonished', 'unexpected']
        }

    def clean_text(self, text):
        """Normalize one raw text into a lowercase, whitespace-separated string.

        Args:
            text: Any object; it is converted with ``str()`` first.

        Returns:
            The cleaned string (possibly empty).
        """
        text = str(text).lower()

        # Remove URLs FIRST. If '?'/'!' were padded with spaces beforehand, a
        # URL such as "http://a.com/x?y=1" would be cut at the '?' and its
        # query string would leak into the text. The pattern also requires
        # "://" or "www." so ordinary words like "awwww" are not swallowed.
        text = re.sub(r'(?:https?://|www\.)\S+', '', text)

        # Collapse runs of '!' / '?' into a single stand-alone token.
        text = re.sub(r'!+', ' ! ', text)
        text = re.sub(r'\?+', ' ? ', text)

        # Expand common contractions (note: "can't" becomes "ca not").
        text = re.sub(r"n't", " not", text)
        text = re.sub(r"'m", " am", text)
        text = re.sub(r"'re", " are", text)
        text = re.sub(r"'ve", " have", text)

        # Remove mentions but keep hashtag content
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#', '', text)

        # Keep letters, numbers, and some punctuation
        text = re.sub(r'[^a-z0-9\s.,!?\'-]', '', text)

        # Squeeze any character repeated 3+ times down to two
        # (e.g. 'soooo' -> 'soo').
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize(self, text):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

    def augment_text(self, text):
        """Randomly drop a single word from ``text``.

        The text is returned untouched when it has fewer than three words or
        when a uniform draw exceeds ``Config.AUG_PROBABILITY``. Otherwise one
        randomly chosen word is removed with probability 0.3, and the words
        are re-joined with single spaces.
        """
        words = text.split()
        if len(words) < 3 or np.random.random() > Config.AUG_PROBABILITY:
            return text

        # Simple word dropout
        if np.random.random() < 0.3:
            idx = np.random.randint(0, len(words))
            words.pop(idx)

        return ' '.join(words)

    def build_vocab(self, texts, max_vocab_size=30000):
        """Populate the vocabulary from the most frequent tokens in ``texts``.

        Args:
            texts: Iterable of raw texts.
            max_vocab_size: Upper bound on vocabulary size including the two
                reserved tokens.

        Returns:
            The ``word2idx`` mapping (also stored on the instance).
        """
        print("Building vocabulary...")
        word_freq = Counter()

        for text in tqdm(texts):
            cleaned = self.clean_text(text)
            tokens = self.tokenize(cleaned)
            word_freq.update(tokens)

        # Two slots are already taken by <PAD> and <UNK>.
        most_common = word_freq.most_common(max_vocab_size - 2)

        for word, freq in most_common:
            if word not in self.word2idx:
                self.word2idx[word] = self.vocab_size
                self.idx2word[self.vocab_size] = word
                self.vocab_size += 1

        print(f"Vocabulary size: {self.vocab_size}")
        return self.word2idx

    def text_to_sequence(self, text, max_length=150):
        """Convert one text to a list of exactly ``max_length`` token indices.

        Unknown tokens map to index 1 (``<UNK>``); short sequences are
        right-padded with index 0 (``<PAD>``) and long ones are truncated.
        """
        cleaned = self.clean_text(text)
        tokens = self.tokenize(cleaned)

        # Convert to indices
        sequence = [self.word2idx.get(token, 1) for token in tokens]

        # Pad or truncate
        if len(sequence) < max_length:
            sequence = sequence + [0] * (max_length - len(sequence))
        else:
            sequence = sequence[:max_length]

        return sequence

    def texts_to_sequences(self, texts, max_length=150):
        """Convert an iterable of texts to a NumPy array of index sequences."""
        sequences = []
        for text in tqdm(texts, desc="Converting texts"):
            sequences.append(self.text_to_sequence(text, max_length))
        return np.array(sequences)

# **DATASET CLASS WITH AUGMENTATION**

In [None]:
class EmotionDataset(Dataset):
    """Dataset pairing index sequences with multi-label float targets.

    Args:
        sequences: Token-index sequences, converted with ``torch.LongTensor``.
        labels: Label vectors, converted with ``torch.FloatTensor``.
        augment: When True, items are occasionally returned with some
            positions zeroed out.
    """

    def __init__(self, sequences, labels, augment=False):
        self.sequences = torch.LongTensor(sequences)
        self.labels = torch.FloatTensor(labels)
        self.augment = augment

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``{'sequence': ..., 'labels': ...}`` for item ``idx``."""
        tokens = self.sequences[idx]

        if self.augment:
            # 20% of the time, set roughly 10% of positions to index 0.
            if np.random.random() < 0.2:
                keep = (torch.rand(tokens.shape) > 0.1).long()
                tokens = tokens * keep

        return dict(sequence=tokens, labels=self.labels[idx])

# **ATTENTION MECHANISM**

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a sequence.

    Args:
        hidden_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``hidden_dim``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not evenly
            divide ``hidden_dim``. Without this check the mismatch would only
            surface later as an opaque ``view`` shape error in ``forward``.
    """

    def __init__(self, hidden_dim, num_heads=4):
        super(MultiHeadAttention, self).__init__()
        if num_heads <= 0 or hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.head_dim = hidden_dim // num_heads

        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, hidden_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_len, hidden_dim = x.size()

        # Project, then split the feature axis into heads:
        # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores, normalized over the key positions.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attention_weights = F.softmax(scores, dim=-1)

        # Weighted sum of values, then merge the heads back together.
        attended = torch.matmul(attention_weights, V)
        attended = attended.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_dim)

        output = self.out(attended)
        return output

# **MODEL ARCHITECTURE**

In [None]:
class EnhancedEmotionClassifier(nn.Module):
    """Embedding -> GRU -> attention/max-pool -> MLP classifier.

    The forward pass returns raw logits with one column per label.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
                 num_labels, dropout=0.4, bidirectional=True):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Token embeddings; index 0 is the padding row.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.2)

        # Recurrent encoder. Inter-layer dropout only exists for stacked GRUs.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
        )

        # A bidirectional GRU concatenates both directions on the feature axis.
        gru_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.attention = MultiHeadAttention(gru_output_dim, num_heads=4)

        # Max over the time axis.
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)

        # Attention summary and max-pooled summary are concatenated.
        combined_dim = gru_output_dim * 2
        half_dim = hidden_dim // 2
        quarter_dim = hidden_dim // 4

        # Three Linear -> BatchNorm -> ReLU -> Dropout stages.
        self.fc1 = nn.Linear(combined_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(hidden_dim, half_dim)
        self.bn2 = nn.BatchNorm1d(half_dim)
        self.dropout2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(half_dim, quarter_dim)
        self.bn3 = nn.BatchNorm1d(quarter_dim)
        self.dropout3 = nn.Dropout(dropout)

        self.fc_out = nn.Linear(quarter_dim, num_labels)

    def forward(self, x):
        """Compute logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, batch first.

        Returns:
            Logits produced by ``fc_out`` (one column per label).
        """
        embedded = self.embedding_dropout(self.embedding(x))

        gru_out, _ = self.gru(embedded)

        # Summary 1: mean over time of the self-attended GRU outputs.
        attended_mean = torch.mean(self.attention(gru_out), dim=1)

        # Summary 2: max over time of the raw GRU outputs.
        max_pooled = self.global_max_pool(gru_out.transpose(1, 2)).squeeze(2)

        features = torch.cat([attended_mean, max_pooled], dim=1)

        stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
        )
        for linear, norm, drop in stages:
            features = drop(F.relu(norm(linear(features))))

        return self.fc_out(features)

# **FOCAL LOSS FOR IMBALANCED DATA**

In [None]:
class FocalLoss(nn.Module):
    """Focal variant of binary cross-entropy computed on logits.

    Each element's BCE is scaled by ``alpha * (1 - pt) ** gamma`` where
    ``pt = exp(-BCE)``; the result is averaged over all elements.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for ``inputs`` (logits) vs ``targets``."""
        per_element = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        pt = torch.exp(-per_element)
        modulator = (1 - pt) ** self.gamma
        return torch.mean(self.alpha * modulator * per_element)

# **TRAINING & EVALUATION FUNCTIONS**

In [None]:
def train_epoch(model, data_loader, optimizer, criterion, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(sequences)`` to produce logits.
        data_loader: Iterable of batches with 'sequence' and 'labels' entries.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    losses = []
    # Running sum so the progress-bar average is O(1) per batch instead of
    # re-averaging the whole list every step.
    running_total = 0.0

    progress_bar = tqdm(data_loader, desc='Training')

    for batch in progress_bar:
        sequences = batch['sequence'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(sequences)
        loss = criterion(logits, labels)

        loss.backward()
        # Cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        batch_loss = loss.item()
        losses.append(batch_loss)
        running_total += batch_loss
        progress_bar.set_postfix({'loss': running_total / len(losses)})

    return np.mean(losses)

def eval_model(model, data_loader, criterion, device, threshold=0.5):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Sigmoid probabilities strictly above ``threshold`` count as positive.

    Returns:
        Tuple ``(mean_loss, macro_f1, per_class_f1)``.
    """
    model.eval()
    batch_losses, pred_rows, label_rows = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            sequences = batch['sequence'].to(device)
            labels = batch['labels'].to(device)

            logits = model(sequences)
            batch_losses.append(criterion(logits, labels).item())

            hard_preds = (torch.sigmoid(logits) > threshold).int()

            pred_rows.extend(hard_preds.cpu().numpy())
            label_rows.extend(labels.cpu().numpy())

    y_pred = np.array(pred_rows)
    y_true = np.array(label_rows)

    macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    per_class = f1_score(y_true, y_pred, average=None, zero_division=0)

    return np.mean(batch_losses), macro, per_class

def predict(model, data_loader, device, threshold=0.5):
    """Return thresholded 0/1 predictions for every item in ``data_loader``.

    Sigmoid probabilities strictly above ``threshold`` become 1, others 0.
    The result is a NumPy array built from the per-item prediction rows.
    """
    model.eval()
    rows = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Predicting'):
            logits = model(batch['sequence'].to(device))
            hard_preds = (torch.sigmoid(logits) > threshold).int()
            rows.extend(hard_preds.cpu().numpy())

    return np.array(rows)

# **PREPARE DATA**

In [None]:
# Cell-local import: train_test_split is used below but is not imported at
# the top of the notebook (only StratifiedKFold is).
from sklearn.model_selection import train_test_split

# Initialize preprocessor
preprocessor = EnhancedTextPreprocessor()

# Build vocabulary
# NOTE(review): the vocabulary is built from all of train_df, i.e. before the
# train/validation split below, so validation tokens are part of the vocab.
preprocessor.build_vocab(train_df['text'].tolist(), max_vocab_size=Config.VOCAB_SIZE)

# Convert texts to sequences
print("\nConverting texts to sequences...")
sequences = preprocessor.texts_to_sequences(
    train_df['text'].tolist(),
    max_length=Config.MAX_LENGTH
)

labels = train_df[Config.EMOTION_COLS].values

# Hold out 15% of the rows for validation.
train_seq, val_seq, train_labels, val_labels = train_test_split(
    sequences, labels, test_size=0.15, random_state=Config.SEED
)

# Create datasets (augmentation only on the training split)
train_dataset = EmotionDataset(train_seq, train_labels, augment=Config.USE_AUGMENTATION)
val_dataset = EmotionDataset(val_seq, val_labels, augment=False)

# drop_last=True: the classifier uses BatchNorm1d, which raises in training
# mode when a batch contains a single sample. Dropping the ragged final batch
# avoids a crash whenever len(train_dataset) % BATCH_SIZE == 1.
train_loader = DataLoader(
    train_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)

# INITIALIZE MODEL
# vocab_size comes from the preprocessor (actual vocabulary built), not from
# Config.VOCAB_SIZE, which is only the upper bound passed to build_vocab.
model = EnhancedEmotionClassifier(
    vocab_size=preprocessor.vocab_size,
    embedding_dim=Config.EMBEDDING_DIM,
    hidden_dim=Config.HIDDEN_DIM,
    num_layers=Config.NUM_LAYERS,
    num_labels=Config.NUM_LABELS,
    dropout=Config.DROPOUT,
    bidirectional=Config.BIDIRECTIONAL
)
model.to(Config.DEVICE)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

criterion = FocalLoss(alpha=0.25, gamma=2.0)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=Config.LEARNING_RATE,
    weight_decay=Config.WEIGHT_DECAY
)
# Cosine schedule with warm restarts; stepped once per epoch by the training
# loop (first cycle 5 epochs, each later cycle twice as long).
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=5, T_mult=2
)

# **TRAINING LOOP**

In [None]:
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

# Early-stopping state: stop after `patience` consecutive epochs without a
# new best validation macro-F1.
best_f1 = 0
patience_counter = 0
patience = 5

history = {'train_loss': [], 'val_loss': [], 'val_f1': []}

for epoch in range(Config.EPOCHS):
    print(f'\nEpoch {epoch + 1}/{Config.EPOCHS}')
    print('-' * 60)

    train_loss = train_epoch(model, train_loader, optimizer, criterion, Config.DEVICE)
    val_loss, val_f1, val_f1_per_class = eval_model(model, val_loader, criterion, Config.DEVICE)

    # Read the learning rate that was in effect for this epoch BEFORE the
    # scheduler advances, so the logged value matches the logged losses.
    current_lr = optimizer.param_groups[0]['lr']
    scheduler.step()

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

    # Collect everything for this epoch and log it in ONE wandb.log call.
    # Separate calls each land on their own step, which would put the
    # per-class F1 values on different steps than the losses.
    epoch_metrics = {
        "train_loss": train_loss,
        "val_loss": val_loss,
        "val_f1": val_f1,
        "learning_rate": current_lr,
    }

    print(f'\nTrain Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}')
    print(f'Val Macro F1: {val_f1:.4f}')
    print(f'\nPer-class F1 scores:')
    for emotion, f1 in zip(Config.EMOTION_COLS, val_f1_per_class):
        epoch_metrics[f"F1_{emotion}"] = f1
        print(f'  {emotion}: {f1:.4f}')

    wandb.log(epoch_metrics)

    # Save best model
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        # The preprocessor object and the Config class are pickled alongside
        # the weights, so loading requires torch.load(..., weights_only=False)
        # and both class definitions to be importable.
        torch.save({
            'model_state_dict': model.state_dict(),
            'preprocessor': preprocessor,
            'config': Config
        }, 'best_model.pt')
        print(f'\n✓ New best model saved! (F1: {best_f1:.4f})')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping triggered after {epoch+1} epochs")
            break

print("\n" + "="*60)
print(f"TRAINING COMPLETED - Best F1: {best_f1:.4f}")
print("="*60)


In [None]:
# Close the Weights & Biases run that was opened with wandb.init.
wandb.finish()

# **Saving and Uploading the Trained Model**

In [None]:
# Cell-local imports: `os` is used below but is not imported at the top of
# the notebook.
import os

import kagglehub

user = "somya2611"
emotion_handle = f"{user}/emotion-classifier/pyTorch/v1"

model_dir = "emotion_model_dir"
best_src = "best_model.pt"
best_dst = os.path.join(model_dir, "best_model.pt")
os.makedirs(model_dir, exist_ok=True)

# The training loop already wrote the best-scoring checkpoint to
# best_model.pt. Re-saving the live `model` here would overwrite it with the
# weights from the last epoch (which, after early stopping, are not the best
# ones), so the existing file is moved instead.
if os.path.exists(best_src):
    # os.replace overwrites an existing destination on every platform.
    os.replace(best_src, best_dst)
elif not os.path.exists(best_dst):
    # No checkpoint was ever written (e.g. validation F1 never rose above 0):
    # fall back to the current weights so there is something to upload.
    torch.save({
        'model_state_dict': model.state_dict(),
        'preprocessor': preprocessor,
        'config': Config
    }, best_dst)

kagglehub.model_upload(emotion_handle, model_dir)

# **Downloading the Saved Model from Kaggle**

In [None]:
import kagglehub

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
user = "somya2611"
emotion_handle = f"{user}/emotion-classifier/pyTorch/v1"

model_path = kagglehub.model_download(emotion_handle)

# SECURITY: weights_only=False unpickles arbitrary Python objects (here the
# preprocessor instance and the Config class). Only load checkpoints from a
# source you trust. The EnhancedTextPreprocessor and Config definitions must
# already be available in this session for unpickling to succeed.
checkpoint = torch.load(f"{model_path}/best_model.pt", map_location=device, weights_only=False)

saved_config = checkpoint['config']
preprocessor = checkpoint['preprocessor']

# Recreate the architecture with the exact settings stored in the checkpoint.
model = EnhancedEmotionClassifier(
    vocab_size=preprocessor.vocab_size,
    embedding_dim=saved_config.EMBEDDING_DIM,
    hidden_dim=saved_config.HIDDEN_DIM,
    num_layers=saved_config.NUM_LAYERS,
    num_labels=saved_config.NUM_LABELS,
    dropout=saved_config.DROPOUT,
    bidirectional=saved_config.BIDIRECTIONAL
)

# Load trained weights and switch to inference mode.
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
model.to(device)

# Process and predict
test_sequences = preprocessor.texts_to_sequences(
    test_df['text'].tolist(),
    max_length=saved_config.MAX_LENGTH
)

# EmotionDataset requires labels, so feed zero placeholders. Their width and
# the batch size come from the saved config rather than hard-coded literals;
# a float32 NumPy array matches what EmotionDataset converts during training.
test_labels_dummy = np.zeros(
    (len(test_sequences), saved_config.NUM_LABELS), dtype=np.float32
)
test_dataset = EmotionDataset(test_sequences, test_labels_dummy, augment=False)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=saved_config.BATCH_SIZE
)

# **Running Inference and Generating Submission File**

In [None]:
# Threshold the sigmoid outputs at 0.5 to get one 0/1 column per emotion.
predictions = predict(model, test_loader, device, threshold=0.5)

# Take the column names from the config stored with the checkpoint so they
# stay in the same order the model was trained with, instead of repeating the
# list by hand.
submission = pd.DataFrame(predictions, columns=list(checkpoint['config'].EMOTION_COLS))
if 'id' in test_df.columns:
    # Assign by position (not by index label) so a non-default index on
    # test_df can never misalign or blank out ids.
    submission.insert(0, 'id', test_df['id'].to_numpy())
submission.to_csv('submission.csv', index=False)

print(submission.head())