In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import pickle
import unicodedata

In [28]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA. A torch.device object is used
# instead of the bare string 'cuda', matching the selection made in the
# training-setup cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
# Load the train and test CSV files (in that order) from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

def clean_text(text):
    """Normalise one raw text value into a single-spaced, trimmed string.

    Steps: Unicode NFKC normalisation, replacement of brackets and dashes
    (hyphen, en dash, em dash) with spaces, collapsing of whitespace runs,
    and stripping of both ends.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str()``; missing values (``None``, NaN, ``pd.NA``) become the
            empty string.

    Returns:
        The cleaned string (possibly empty).
    """
    # A missing cell must not be stringified: str(float("nan")) is "nan" and
    # str(None) is "None", which would then be encoded as if it were real text.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ""
    # Normalize Unicode so compatibility variants share one representation
    text = unicodedata.normalize("NFKC", str(text))
    # Remove brackets and dashes
    text = re.sub(r'[\[\]\(\)\{\}<>\-–—]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Replace the raw text column of both frames with its cleaned version.
train_df['text'], test_df['text'] = (
    train_df['text'].apply(clean_text),
    test_df['text'].apply(clean_text),
)

# Plain Python lists of the integer labels and of the cleaned strings.
labels = train_df['label'].astype(int).tolist()
texts = train_df['text'].astype(str).tolist()
test_texts = test_df['text'].astype(str).tolist()

In [30]:
# Character-level vocabulary over every cleaned text (train and test).
# Ids 0 and 1 are reserved for the padding and out-of-vocabulary entries,
# so the characters themselves are numbered from 2 in sorted order.
all_texts = [*train_df['text'].tolist(), *test_df['text'].tolist()]
vocab = {
    char: idx
    for idx, char in enumerate(sorted(set(''.join(all_texts))), start=2)
}
vocab.update({"<PAD>": 0, "<OOV>": 1})

# Encode function
def encode_text(text):
    """Return the list of vocabulary ids for the characters of ``text``.

    Characters missing from the module-level ``vocab`` are encoded as 1.
    """
    ids = []
    for char in text:
        ids.append(vocab.get(char, 1))
    return ids

# Turn every cleaned text of both frames into its list of character ids.
encoded_train = list(map(encode_text, train_df['text']))
encoded_test = list(map(encode_text, test_df['text']))


In [31]:
MAX_LEN = 400  # you can increase if your texts are longer

def pad_sequences_custom(seqs, max_len=MAX_LEN):
    """Pad or truncate integer sequences to one length and stack them.

    Args:
        seqs: Iterable of lists of token ids. The lists are not modified.
        max_len: Target length. Shorter sequences are right-padded with 0
            (the PAD id); longer ones are cut to their first ``max_len`` items.

    Returns:
        A ``torch.long`` tensor of shape ``(number of sequences, max_len)``.
    """
    padded = []
    for seq in seqs:
        # Build a fresh row. The previous ``seq += [...]`` extended the
        # caller's list in place, silently padding the encoded inputs.
        row = list(seq[:max_len])
        row.extend([0] * (max_len - len(row)))
        padded.append(row)
    # Explicit dtype and shape keep an empty input as a (0, max_len) integer
    # tensor instead of an empty float tensor of shape (0,).
    return torch.tensor(padded, dtype=torch.long).reshape(len(padded), max_len)

# Labels as a 1-D tensor; encoded texts as fixed-length id matrices.
y = torch.tensor(labels)
X = pad_sequences_custom(seqs=encoded_train, max_len=MAX_LEN)
X_test = pad_sequences_custom(seqs=encoded_test, max_len=MAX_LEN)


In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. Stratifying on y keeps the
# label proportions the same in both parts; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [33]:
# -----------------------------
# 4. Dataset & Dataloaders
# -----------------------------
class TextDataset(Dataset):
    """Indexable wrapper over encoded inputs and, optionally, their labels.

    With labels, indexing yields an ``(input, label)`` pair; without labels
    (``y`` is ``None``) it yields the input alone.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is defined by the inputs.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        if self.y is None:
            return sample
        return sample, self.y[idx]

# One batch size for all three loaders, so it is changed in a single place.
BATCH_SIZE = 128

train_data = TextDataset(X_train, y_train)
val_data   = TextDataset(X_val, y_val)
test_data  = TextDataset(X_test)  # no labels: items are inputs only

# Only the training loader shuffles; validation and test keep their row order
# so that predictions line up with the source rows.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [34]:
class CNN_BiLSTM(nn.Module):
    """Character-level classifier: embedding -> Conv1d -> max-pool -> BiLSTM -> MLP.

    Args:
        emb_dim: Size of each character embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        n_classes: Number of output logits.
        vocab_size: Number of rows in the embedding table. When omitted it is
            read from the module-level ``vocab`` at construction time (not at
            class-definition time, so a rebuilt vocabulary is picked up).
    """

    def __init__(self, emb_dim, hidden_dim, n_classes, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab)
        # Id 0 is the PAD token: padding_idx keeps its vector at zero and
        # excludes it from gradient updates, so padding is not learned as if
        # it were a real character.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(emb_dim, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.lstm = nn.LSTM(128, hidden_dim, bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim*2, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Map a batch of id sequences ``(B, L)`` to logits ``(B, n_classes)``."""
        x = self.embedding(x)                # (B, L, E)
        x = x.permute(0, 2, 1)               # (B, E, L) - Conv1d wants channels first
        x = torch.relu(self.conv1(x))
        x = self.pool(x)                     # (B, C, L/2)
        x = x.permute(0, 2, 1)               # (B, L/2, C) - back to batch_first for the LSTM
        _, (h, _) = self.lstm(x)
        # The last two entries of h are the final forward and backward states
        # of the top layer; for this single-layer LSTM they are h[0] and h[1].
        h = torch.cat((h[-2], h[-1]), dim=1)   # (B, 2H)
        x = torch.relu(self.fc1(h))
        x = self.dropout(x)
        return self.fc2(x)


In [35]:
# -----------------------------
# 6. Setup Training
# -----------------------------
SEED = 42
N_CLASSES = 15

# Seed NumPy and PyTorch so that weight initialisation and DataLoader
# shuffling are repeatable from run to run.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNN_BiLSTM(vocab_size=len(vocab),
                   emb_dim=100,
                   hidden_dim=128,
                   n_classes=N_CLASSES).to(device)

# Class weights
train_labels = y_train.numpy()
present_classes = np.unique(train_labels)
# CrossEntropyLoss needs exactly one weight per logit. Fail here with a clear
# message rather than with a shape error inside the first training step.
if not np.array_equal(present_classes, np.arange(N_CLASSES)):
    raise ValueError(
        f"Expected training labels 0..{N_CLASSES - 1}, found {present_classes.tolist()}"
    )
# 'balanced' weights are inversely proportional to class frequency.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=present_classes,
                                     y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [36]:
# -----------------------------
# 7. Training Loop
# -----------------------------
EPOCHS = 20
# Start below any attainable score so the first epoch always writes a
# checkpoint. With a start of 0 and a strict '>', a run whose macro-F1 never
# rose above 0 saved nothing and the later load of the checkpoint failed.
best_f1 = float("-inf")

for epoch in range(EPOCHS):
    # ---- one pass over the training data ----
    model.train()
    total_loss = 0.0
    for Xb, yb in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(Xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses.
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")

    # Validation
    model.eval()  # disables dropout
    preds, trues = [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            out = model(Xb)
            pred = torch.argmax(out, dim=1)
            preds.extend(pred.cpu().numpy())
            trues.extend(yb.cpu().numpy())

    f1 = f1_score(trues, preds, average='macro')
    print(f"Validation Macro-F1: {f1:.4f}")

    # Keep only the weights of the best-scoring epoch so far.
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_cnn_bilstm.pth")
        print("✅ Model saved!")


Epoch 1/30: 100%|██████████| 748/748 [00:21<00:00, 34.08it/s]


Train Loss: 2.5129
Validation Macro-F1: 0.1590
✅ Model saved!


Epoch 2/30: 100%|██████████| 748/748 [00:21<00:00, 34.70it/s]


Train Loss: 2.3192
Validation Macro-F1: 0.2019
✅ Model saved!


Epoch 3/30: 100%|██████████| 748/748 [00:21<00:00, 35.31it/s]


Train Loss: 2.2079
Validation Macro-F1: 0.2221
✅ Model saved!


Epoch 4/30: 100%|██████████| 748/748 [00:21<00:00, 35.02it/s]


Train Loss: 2.0989
Validation Macro-F1: 0.2475
✅ Model saved!


Epoch 5/30: 100%|██████████| 748/748 [00:21<00:00, 34.52it/s]


Train Loss: 1.9913
Validation Macro-F1: 0.2494
✅ Model saved!


Epoch 6/30: 100%|██████████| 748/748 [00:21<00:00, 34.72it/s]


Train Loss: 1.9043
Validation Macro-F1: 0.2628
✅ Model saved!


Epoch 7/30: 100%|██████████| 748/748 [00:21<00:00, 35.10it/s]


Train Loss: 1.8173
Validation Macro-F1: 0.2801
✅ Model saved!


Epoch 8/30: 100%|██████████| 748/748 [00:21<00:00, 35.05it/s]


Train Loss: 1.7027
Validation Macro-F1: 0.3001
✅ Model saved!


Epoch 9/30: 100%|██████████| 748/748 [00:21<00:00, 35.03it/s]


Train Loss: 1.6513
Validation Macro-F1: 0.2790


Epoch 10/30: 100%|██████████| 748/748 [00:21<00:00, 35.00it/s]


Train Loss: 1.5874
Validation Macro-F1: 0.2896


Epoch 11/30: 100%|██████████| 748/748 [00:21<00:00, 35.05it/s]


Train Loss: 1.5635
Validation Macro-F1: 0.2659


Epoch 12/30: 100%|██████████| 748/748 [00:21<00:00, 35.04it/s]


Train Loss: 1.5041
Validation Macro-F1: 0.3022
✅ Model saved!


Epoch 13/30: 100%|██████████| 748/748 [00:21<00:00, 35.08it/s]


Train Loss: 1.4532
Validation Macro-F1: 0.3414
✅ Model saved!


Epoch 14/30: 100%|██████████| 748/748 [00:21<00:00, 35.14it/s]


Train Loss: 1.4040
Validation Macro-F1: 0.3259


Epoch 15/30: 100%|██████████| 748/748 [00:21<00:00, 35.09it/s]


Train Loss: 1.3948
Validation Macro-F1: 0.2953


Epoch 16/30: 100%|██████████| 748/748 [00:21<00:00, 35.15it/s]


Train Loss: 1.3517
Validation Macro-F1: 0.3178


Epoch 17/30: 100%|██████████| 748/748 [00:21<00:00, 35.14it/s]


Train Loss: 1.3009
Validation Macro-F1: 0.3216


Epoch 18/30: 100%|██████████| 748/748 [00:21<00:00, 35.17it/s]


Train Loss: 1.2460
Validation Macro-F1: 0.3453
✅ Model saved!


Epoch 19/30: 100%|██████████| 748/748 [00:21<00:00, 35.17it/s]


Train Loss: 1.2210
Validation Macro-F1: 0.3431


Epoch 20/30: 100%|██████████| 748/748 [00:21<00:00, 35.14it/s]


Train Loss: 1.1801
Validation Macro-F1: 0.3502
✅ Model saved!


Epoch 21/30: 100%|██████████| 748/748 [00:21<00:00, 35.12it/s]


Train Loss: 1.1399
Validation Macro-F1: 0.3273


Epoch 22/30: 100%|██████████| 748/748 [00:21<00:00, 35.14it/s]


Train Loss: 1.1919
Validation Macro-F1: 0.3232


Epoch 23/30: 100%|██████████| 748/748 [00:21<00:00, 35.14it/s]


Train Loss: 1.1043
Validation Macro-F1: 0.3305


Epoch 24/30: 100%|██████████| 748/748 [00:21<00:00, 35.12it/s]


Train Loss: 1.0587
Validation Macro-F1: 0.3300


Epoch 25/30: 100%|██████████| 748/748 [00:21<00:00, 35.13it/s]


Train Loss: 1.0632
Validation Macro-F1: 0.3490


Epoch 26/30: 100%|██████████| 748/748 [00:21<00:00, 35.13it/s]


Train Loss: 1.0631
Validation Macro-F1: 0.3304


Epoch 27/30: 100%|██████████| 748/748 [00:21<00:00, 35.13it/s]


Train Loss: 1.0299
Validation Macro-F1: 0.3406


Epoch 28/30: 100%|██████████| 748/748 [00:21<00:00, 35.12it/s]


Train Loss: 0.9756
Validation Macro-F1: 0.3399


Epoch 29/30: 100%|██████████| 748/748 [00:21<00:00, 35.11it/s]


Train Loss: 0.9806
Validation Macro-F1: 0.3346


Epoch 30/30: 100%|██████████| 748/748 [00:21<00:00, 35.09it/s]


Train Loss: 0.9341
Validation Macro-F1: 0.3346


In [37]:
# -----------------------------
# 8. Inference on Test Set
# -----------------------------
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
best_state = torch.load("best_cnn_bilstm.pth", map_location=device)
model.load_state_dict(best_state)
model.eval()

test_preds = []
with torch.no_grad():
    # test_loader does not shuffle, so predictions stay in test-row order.
    for Xb in tqdm(test_loader, desc="Predicting"):
        Xb = Xb.to(device)
        out = model(Xb)
        pred = torch.argmax(out, dim=1)
        test_preds.extend(pred.cpu().numpy())

Predicting: 100%|██████████| 234/234 [00:02<00:00, 96.33it/s]


In [38]:
# Single source for the output file name, so the file written and the message
# printed cannot drift apart.
# NOTE(review): the name says "rnn+bilstm" while the model class is CNN_BiLSTM;
# kept unchanged because downstream steps may expect this exact file name.
SUBMISSION_PATH = "prediction_rnn+bilstm.csv"

# One row per test id with its predicted label.
submission = pd.DataFrame({
    "id": test_df["id"],
    "label": test_preds
})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"✅ Submission saved as {SUBMISSION_PATH}")

✅ Submission saved as prediction_rnn+bilstm.csv


In [39]:
# -----------------------------
# 10. Save Artifacts
# -----------------------------
# Store the character-to-id mapping alongside the weights currently held by
# the model, then confirm.
with open("vocab.pkl", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)
torch.save(model.state_dict(), "cnn_bilstm_final.pth")
print("✅ Model and vocab saved!")

✅ Model and vocab saved!
