In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [None]:
# Seed the global RNGs before anything stochastic runs (e.g. DataLoader
# shuffling and any randomly initialised weights), so a fresh
# Restart & Run All starts from the same random state every time.
# NOTE: this does not by itself guarantee bit-identical results on GPU.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# -------------------------------
# 1. Load the Data
# -------------------------------
# Single place to edit when the dataset is mounted somewhere else.
DATA_DIR = "/kaggle/input/classification-of-math-problems-by-kasut-academy"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

# Fail fast, before tokenization and training, if a column that later cells
# index by name is missing from either file.
for _file_name, _frame, _required in (
    ("train.csv", train_df, ("Question", "label")),
    ("test.csv", test_df, ("Question", "id")),
):
    _missing = [col for col in _required if col not in _frame.columns]
    if _missing:
        raise KeyError(f"{_file_name} is missing required column(s): {_missing}")

print(train_df.head())
print(test_df.head())

In [None]:
# Pull the question text and the targets out of the frames as plain arrays.
X_train, y_train = (train_df[column].values for column in ("Question", "label"))
X_test = test_df["Question"].values

In [None]:
# 2. Encode Labels
# Fit on the raw label column rather than on `y_train`: this cell rebinds
# `y_train` to the encoded integers, so fitting on `y_train` would, on a second
# run of the cell, re-fit the encoder on already-encoded values and make
# `inverse_transform` return integers instead of the original labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["label"].values)

# 3. Tokenization
tokenizer = BertTokenizer.from_pretrained('tbs17/MathBERT')
MAX_LEN = 256  # Math problems are typically short

def tokenize(texts):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        texts: The strings to encode. Accepts anything exposing ``tolist()``
            (NumPy array, pandas Series), any other iterable of strings
            (list, tuple, generator), or a single string, which is treated
            as a batch of one.

    Returns:
        The tokenizer's batch encoding with PyTorch tensors
        (``return_tensors='pt'``); every sequence is padded or truncated to
        ``MAX_LEN`` tokens. Callers in this notebook read its ``input_ids``
        and ``attention_mask`` entries.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Encode both splits up front (train first, then test).
train_encodings, test_encodings = tokenize(X_train), tokenize(X_test)

# 4. Create Datasets
# Full training set as (input_ids, attention_mask, label) triples.
train_dataset = TensorDataset(
    *(train_encodings[field] for field in ('input_ids', 'attention_mask')),
    torch.tensor(y_train),
)



In [None]:
# 5. Initialize Model
# One output logit per encoded class. Only `outputs.loss` and `outputs.logits`
# are consumed in this notebook, so the model is not asked to also return the
# per-layer attentions / hidden states on every forward pass; returning them
# for each batch only costs memory.
model = BertForSequenceClassification.from_pretrained(
    'tbs17/MathBERT',
    num_labels=len(label_encoder.classes_),
).to(device)

# 6. Optimizer (AdamW; weight decay disabled for biases and LayerNorm weights)
def get_optimizer(model, lr=2e-5, weight_decay=0.01):
    """Build an AdamW optimizer with two parameter groups.

    Parameters whose name contains ``'bias'`` or ``'LayerNorm.weight'`` are
    placed in a group with ``weight_decay=0.0``; all remaining parameters are
    decayed with ``weight_decay``.

    Args:
        model: Module whose ``named_parameters()`` are optimized.
        lr: Learning rate shared by both groups.
        weight_decay: Decay applied to the first (decayed) group. The default
            matches AdamW's own default, so the decayed group behaves as it
            did when no value was passed explicitly.

    Returns:
        A ``torch.optim.AdamW`` instance whose ``param_groups[0]`` is the
        decayed group and ``param_groups[1]`` the non-decayed group.
    """
    no_decay = ('bias', 'LayerNorm.weight')

    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    # The per-group value must be set explicitly: a group without a
    # 'weight_decay' key silently inherits the optimizer-wide default.
    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    return AdamW(optimizer_grouped_parameters, lr=lr)

# Build the optimizer for the model created above; the training loop passes
# this same instance to train_epoch() on every epoch.
optimizer = get_optimizer(model)



In [None]:
# 7. Training Loop
def train_epoch(model, dataloader, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch is read positionally as (input_ids, attention_mask, labels),
    moved to ``device``, and used for one forward/backward/step cycle with the
    gradient norm clipped to 1.0.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches in the layout described above.
        optimizer: Optimizer stepped once per batch.

    Returns:
        dict with ``'loss'`` (mean per-batch loss) and ``'f1'`` (weighted F1
        of the predictions made during the pass).
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for raw_batch in tqdm(dataloader):
        on_device = [tensor.to(device) for tensor in raw_batch]
        labels = on_device[2]

        optimizer.zero_grad()
        result = model(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=labels,
        )
        step_loss = result.loss
        step_loss.backward()
        # Clip after backward and before the step so the update uses the
        # clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += step_loss.item()
        epoch_preds.extend(result.logits.argmax(dim=1).cpu().numpy())
        epoch_targets.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader)
    weighted_f1 = f1_score(epoch_targets, epoch_preds, average='weighted')
    return {'loss': mean_loss, 'f1': weighted_f1}

# 8. Evaluation
def evaluate(model, dataloader):
    """Compute the weighted F1 score of ``model`` on ``dataloader``.

    Each batch is read positionally as (input_ids, attention_mask, labels).
    The model is put in eval mode and run without gradient tracking.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches in the layout described above.

    Returns:
        dict with a single key ``'f1'`` holding the weighted F1 score computed
        on the encoded (integer) labels.
    """
    model.eval()
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = tuple(b.to(device) for b in batch)
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1]
            )
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(batch[2].cpu().numpy())

    # F1 is computed directly on the encoded ids, so no conversion back to
    # the original class names is needed here.
    return {
        'f1': f1_score(true_labels, preds, average='weighted'),
    }


In [None]:

# 9. Run Training
BATCH_SIZE = 16
EPOCHS = 10

# Stratified 80/20 split over row positions, so the already-tokenized tensors
# can simply be indexed per subset instead of being re-tokenized.
train_idx, val_idx = train_test_split(
    np.arange(len(y_train)),
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)


def _subset_dataset(row_idx):
    """Wrap the rows picked by ``row_idx`` as (input_ids, attention_mask, labels)."""
    return TensorDataset(
        train_encodings['input_ids'][row_idx],
        train_encodings['attention_mask'][row_idx],
        torch.tensor(y_train[row_idx]),
    )


# Only the training loader is shuffled; validation keeps its row order.
train_loader = DataLoader(_subset_dataset(train_idx), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(_subset_dataset(val_idx), batch_size=BATCH_SIZE)


In [None]:
# Start below any reachable score so that the first epoch always writes a
# checkpoint. With an initial value of 0 and a strict `>` comparison, a
# validation F1 that never rises above 0 would leave no checkpoint file on disk.
best_f1 = float('-inf')
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_metrics = train_epoch(model, train_loader, optimizer)
    val_metrics = evaluate(model, val_loader)

    print(f"Train Loss: {train_metrics['loss']:.4f} | Train F1: {train_metrics['f1']:.4f}")
    print(f"Val F1: {val_metrics['f1']:.4f}")

    # Keep only the weights of the epoch with the best validation F1.
    if val_metrics['f1'] > best_f1:
        best_f1 = val_metrics['f1']
        torch.save(model.state_dict(), 'best_mathbert_model.pth')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, class_names, title='Confusion Matrix'):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        y_true: Ground-truth labels, expressed in the same values as
            ``class_names``.
        y_pred: Predicted labels, expressed in the same values as
            ``class_names``.
        class_names: Labels that fix the row/column order of the matrix and
            are used as tick labels on both axes.
        title: Figure title.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=class_names)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        xticklabels=class_names,
        yticklabels=class_names,
        cmap='Blues',
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    # Slant the column labels; keep the row labels horizontal.
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    plt.show()

In [None]:
def get_predictions(model, dataloader):
    """Collect true and predicted class ids for every sample in ``dataloader``.

    Each batch is read positionally as (input_ids, attention_mask, labels).
    The model is switched to eval mode and called without gradient tracking.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches in the layout described above.

    Returns:
        Tuple ``(true_labels, preds)`` of two lists, in dataloader iteration
        order.
    """
    model.eval()
    y_true, y_pred = [], []

    for raw_batch in dataloader:
        on_device = [tensor.to(device) for tensor in raw_batch]
        with torch.no_grad():
            logits = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
            ).logits
        y_pred.extend(logits.argmax(dim=1).cpu().numpy())
        y_true.extend(on_device[2].cpu().numpy())

    return y_true, y_pred

# Collect true / predicted class ids for both splits.
# NOTE(review): no checkpoint is loaded in this cell, so `model` carries
# whatever weights it currently holds in the kernel (after the training loop:
# the final epoch, not necessarily the best checkpoint).
train_true, train_pred = get_predictions(model, train_loader)
val_true, val_pred = get_predictions(model, val_loader)

# Class names, in encoder order, label both axes of each matrix.
class_names = label_encoder.classes_

# One matrix per split, drawn on the original label names.
for matrix_title, split_true, split_pred in (
    ('Training Confusion Matrix', train_true, train_pred),
    ('Validation Confusion Matrix', val_true, val_pred),
):
    plot_confusion_matrix(
        label_encoder.inverse_transform(split_true),
        label_encoder.inverse_transform(split_pred),
        class_names,
        title=matrix_title,
    )

In [None]:
# 10. Predict on the Test Set and write the submission file
# (no labels are read from test_df, so nothing is scored in this cell)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask']
)

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 1. Load the best saved model
# map_location keeps the load working when the checkpoint was written on a
# different device than the one in use now (e.g. saved on GPU, loaded on CPU).
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('best_mathbert_model.pth', map_location=device))
model.eval()

# 2. Generate predictions (inference only, so no gradient tracking)
test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Set"):
        batch = tuple(b.to(device) for b in batch)
        outputs = model(
            input_ids=batch[0],
            attention_mask=batch[1]
        )
        test_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

# 3. Convert to original labels
test_labels = label_encoder.inverse_transform(test_preds)
assert len(test_labels) == len(test_df), "expected exactly one prediction per test row"

# 4. Save predictions in required format
submission = pd.DataFrame({
    "id": test_df["id"].values,  # requires an 'id' column in test_df
    "label": test_labels
})

# 5. Save to CSV
submission.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

In [None]:
# Bare expression: rendered as this cell's output for a quick visual check of
# the submission frame that was just written to disk.
submission