In [None]:
##############################################
# PART 2: RoBERTa Model
##############################################

!pip install transformers datasets accelerate scikit-learn -q

import pandas as pd
import torch
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from torch.optim import AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt

# Set random seed for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Args:
        seed: Integer passed, in order, to Python's ``random``, NumPy,
            PyTorch (CPU) and PyTorch (all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

set_seed(42)

# Pick the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    gpu_name = torch.cuda.get_device_name(0)
else:
    device = torch.device('cpu')
    gpu_name = ""
print("Using device:", device, gpu_name)

# Upload files (train.jsonl, val.jsonl, test.jsonl) through the Colab widget
from google.colab import files
uploaded = files.upload()

# Load data: each split is a JSON-lines file read into its own DataFrame
train_df, val_df, test_df = (
    pd.read_json(path, lines=True)
    for path in ("train.jsonl", "val.jsonl", "test.jsonl")
)

# Process labels
# Each row's 'tags' is either a bare tag or a list whose first element is the
# tag; it is normalized to a bare tag and then mapped to an integer class id.
label_map = {'phrase': 0, 'passage': 1, 'multi': 2}

for split_name, frame in (("train", train_df), ("val", val_df)):
    # An empty tag list becomes None so it is reported below instead of
    # raising a bare IndexError here.
    frame['tags'] = frame['tags'].apply(
        lambda x: (x[0] if x else None) if isinstance(x, list) else x
    )
    frame['label'] = frame['tags'].map(label_map)

    # Series.map yields NaN for tags missing from label_map; fail loudly
    # rather than let NaN be cast to an arbitrary class id later on.
    unknown = frame.loc[frame['label'].isna(), 'tags']
    if not unknown.empty:
        raise ValueError(
            f"{split_name} split: {len(unknown)} row(s) have tags outside "
            f"{sorted(label_map)}; examples: {unknown.unique().tolist()[:5]}"
        )
    frame['label'] = frame['label'].astype('int64')

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def _field_to_text(value):
    """Return *value* as plain text, joining list/tuple fields with spaces."""
    if isinstance(value, (list, tuple)):
        return " ".join(str(part) for part in value)
    return str(value)


# Combine postText + targetTitle + first 4 paragraphs with RoBERTa separator
def combine_text(row):
    """Build the single input string fed to the tokenizer for one example.

    The post text, the target title and up to the first 4 target paragraphs
    (each cut to its first 100 characters) are joined with `` </s> ``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``postText``, ``targetTitle`` and ``targetParagraphs``.

    Returns:
        The combined text. Paragraphs are only appended when
        ``targetParagraphs`` is a list.
    """
    # List-valued fields are joined into text; interpolating them directly
    # would embed the Python list repr (brackets and quotes) in the input.
    segments = [
        _field_to_text(row['postText']),
        _field_to_text(row['targetTitle']),
    ]
    paragraphs = row['targetParagraphs']
    if isinstance(paragraphs, list):
        # Take first 4 paragraphs, truncate each paragraph to 100 chars
        segments.extend(f"{para[:100]}" for para in paragraphs[:4])
    return " </s> ".join(segments)

# Build the combined model-input string for every split, row by row
for frame in (train_df, val_df, test_df):
    frame['text'] = frame.apply(combine_text, axis=1)

# Custom Dataset
class SpoilerDataset(Dataset):
    """Dataset that tokenizes one example's text each time it is indexed.

    Args:
        df: DataFrame with a ``text`` column and, when ``include_labels`` is
            true, a ``label`` column.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments; it must
            return a mapping of tensors with a leading batch dimension.
        include_labels: Whether items should carry a ``labels`` tensor.
    """

    def __init__(self, df, tokenizer, include_labels=True):
        self.tokenizer = tokenizer
        self.include_labels = include_labels
        self.texts = df['text'].tolist()
        # Labels are only read from the frame when they are requested
        self.labels = None
        if include_labels:
            self.labels = df['label'].tolist()

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example *idx* and return its tensors (unpadded).

        The tokenizer is called with truncation enabled and ``max_length=384``;
        the leading batch dimension of every returned tensor is squeezed away.
        A ``labels`` entry (dtype long) is added when labels are included.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=384,
            return_tensors='pt',
        )
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        if not self.include_labels:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Create datasets and loaders
# Padding is done per batch by the collator, so items stay unpadded.
collator = DataCollatorWithPadding(tokenizer)
loader_options = {'batch_size': 16, 'collate_fn': collator}

train_dataset = SpoilerDataset(train_df, tokenizer, include_labels=True)
val_dataset = SpoilerDataset(val_df, tokenizer, include_labels=True)
# The test split is built without labels
test_dataset = SpoilerDataset(test_df, tokenizer, include_labels=False)

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# Initialize model with dropout
# The dropout probability is passed to from_pretrained so that it is part of
# the config the layers are built from. Assigning
# model.config.hidden_dropout_prob after construction does not change the
# dropout modules that already exist.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=3,
    hidden_dropout_prob=0.3,
)
model.to(device)

# Freeze the embeddings and the first 6 encoder layers.
# The embeddings are never handed to the optimizer, so marking them frozen
# changes no weights; it only stops autograd from computing unused gradients
# through the frozen lower layers.
first_trainable_layer = 6
for name, param in model.named_parameters():
    if ".embeddings." in name:
        param.requires_grad = False
    elif "encoder.layer." in name:
        layer_num = int(name.split("encoder.layer.")[1].split(".")[0])
        if layer_num < first_trainable_layer:
            param.requires_grad = False

# Optimizer with Layer-wise Learning Rate Decay (LLRD)
# Every unfrozen encoder layer gets its own parameter group so that none of
# them is left out of the optimizer. With layer_lr_decay = 1.0 all encoder
# layers use encoder_lr; a value below 1.0 lowers the rate of each layer
# further away from the classification head.
encoder_lr = 1e-5
classifier_lr = 3e-5
layer_lr_decay = 1.0
top_layer = model.config.num_hidden_layers - 1
optimizer_grouped_parameters = [
    {
        # The trailing dot keeps e.g. layer 1 from also matching layers 10/11
        'params': [p for n, p in model.named_parameters() if f"encoder.layer.{layer}." in n],
        'lr': encoder_lr * layer_lr_decay ** (top_layer - layer),
    }
    for layer in range(first_trainable_layer, top_layer + 1)
]
optimizer_grouped_parameters.append(
    {'params': [p for n, p in model.named_parameters() if "classifier" in n], 'lr': classifier_lr}
)
optimizer = AdamW(optimizer_grouped_parameters, weight_decay=0.01)

# Early stopping setup
best_f1 = 0
patience, patience_counter = 3, 0
epochs = 8

# Scheduler with warmup; the horizon is derived from `epochs` so the two
# cannot drift apart.
num_training_steps = len(train_loader) * epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

# Mixed precision training scaler (a disabled scaler is a pass-through, which
# is what is wanted when no GPU is available)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

train_loss_list, val_f1_list = [], []

# Training loop
# Each epoch: one pass over train_loader with mixed precision, then a macro-F1
# evaluation on val_loader. The best-scoring weights are saved to
# "best_roberta.pt"; training stops once `patience` consecutive epochs fail
# to improve on the best score.
use_amp = torch.cuda.is_available()  # autocast is only meaningful on a GPU

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients
        # and then lowers its scale; only advance the LR schedule when the
        # optimizer actually stepped.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_loss_list.append(avg_loss)
    print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    f1 = f1_score(true_labels, preds, average='macro')
    val_f1_list.append(f1)
    print(f"Validation F1: {f1:.4f}")

    # Checkpoint on improvement, otherwise count towards early stopping
    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_roberta.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping!")
            break

print(f"Best Validation F1: {best_f1:.4f}")

# Plot training loss and validation F1 side by side, one point per epoch
fig, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(10, 5))

# (axes, values, legend label, line color, title, y-axis label)
panels = (
    (loss_ax, train_loss_list, 'Train Loss', 'blue', 'Training Loss per Epoch', 'Loss'),
    (f1_ax, val_f1_list, 'Validation F1', 'green', 'Validation F1 per Epoch', 'F1 Score'),
)
for ax, values, legend_label, line_color, title, y_label in panels:
    epoch_axis = range(1, len(values) + 1)  # epochs are numbered from 1
    ax.plot(epoch_axis, values, marker='o', label=legend_label, color=line_color)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.grid()
    ax.legend()

plt.tight_layout()
plt.show()

# Load best model and predict on test set
best_state = torch.load("best_roberta.pt")
model.load_state_dict(best_state)
model.eval()

test_preds = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: tensor.to(device) for key, tensor in batch.items()}
        with torch.cuda.amp.autocast():
            logits = model(**inputs).logits
        # Predicted class id = index of the largest logit in each row
        test_preds.extend(logits.argmax(dim=1).cpu().numpy())

# Translate class ids back to spoiler-type names
inv_label_map = dict(enumerate(('phrase', 'passage', 'multi')))
test_labels = [inv_label_map[class_id] for class_id in test_preds]

# Save final submission CSV
submission = pd.DataFrame({'id': test_df['id'], 'spoilerType': test_labels})
submission.to_csv('prediction_task1.csv', index=False)

# Trigger the browser download from Colab
files.download('prediction_task1.csv')
