In [None]:
# ================== STEP 1: Install and Setup ==================
# NOTE: the leading "!" is notebook (IPython/Colab) shell-escape syntax, so
# these three lines only run inside a notebook cell, not as a plain .py script.
!pip install transformers -q
!pip install scikit-learn -q
!pip install accelerate -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.auto import tqdm
from accelerate import Accelerator
import zipfile
from google.colab import drive

# Mount Google Drive so the CSV files read from /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

# ================== STEP 2: Load and Preprocess Data ==================

train_path = "/content/drive/MyDrive/ground_truth.csv"
df = pd.read_csv(train_path)
# Rows without body text cannot be classified, so drop them up front.
df = df.dropna(subset=["content"]).copy()

# Merge title + content if title exists
if "title" in df.columns:
    df["title"] = df["title"].fillna("")
    df["text"] = (df["title"] + " " + df["content"]).fillna("")
else:
    df["text"] = df["content"].fillna("")

# Normalise the class names, then encode them as 0 (human) / 1 (machine).
df["Class"] = df["Class"].str.strip().str.lower()
df["label"] = df["Class"].map({"human": 0, "machine": 1})
# Any missing or unrecognised class maps to NaN, which silently turns the whole
# column into floats. Drop those rows and cast back to int: np.bincount, the
# class-weight lookup and the classification loss used later in this script
# all require integer labels.
df = df.dropna(subset=["label"]).copy()
df["label"] = df["label"].astype(int)

# Stratified split for balanced train/val
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.1,
    random_state=42,
    stratify=df["label"],
)

# ================== STEP 3: Tokenization ==================
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Both splits are tokenized with the same settings (truncation on, padding on,
# at most 512 tokens), so keep the options in one place.
tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 512}
train_enc = tokenizer(train_texts, **tokenize_kwargs)
val_enc = tokenizer(val_texts, **tokenize_kwargs)

def to_list(encodings):
    """Return a plain dict whose values are all Python lists.

    Values that are already lists are passed through unchanged (same object);
    anything else is converted by calling its ``.tolist()`` method.
    """
    converted = {}
    for key, value in encodings.items():
        if not isinstance(value, list):
            value = value.tolist()
        converted[key] = value
    return converted

# Convert both tokenizer outputs to plain dicts of lists in one pass.
train_enc, val_enc = (to_list(enc) for enc in (train_enc, val_enc))

# ================== STEP 4: Dataset Class ==================
class TextDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            sequence (one entry per example).
        labels: Per-sample class ids; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the class id under "labels"."""
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # Pin the dtype instead of letting torch infer it from the Python
        # value: a float label such as 1.0 would otherwise yield a float
        # tensor, which a class-index loss cannot consume.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

train_dataset = TextDataset(train_enc, train_labels)
val_dataset = TextDataset(val_enc, val_labels)

# ================== STEP 5: Weighted Sampler ==================

# Each class gets a weight equal to the reciprocal of its training count, and
# every training sample inherits the weight of its class.
class_counts = np.bincount(train_labels)
class_weights = np.reciprocal(class_counts.astype(np.float64))
sample_weights = list(map(class_weights.__getitem__, train_labels))

# Draw as many samples per epoch as there are training examples, with replacement.
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

# ================== STEP 6: Model & Training Setup ==================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
model.to(device)

batch_size = 16
# Training batches are drawn through the weighted sampler; validation is read in order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# The loss is additionally weighted per class and uses label smoothing of 0.1.
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=0.1)

optimizer = AdamW(model.parameters(), lr=3e-5)

num_epochs = 10
# Step counts are taken from the loader before accelerator.prepare().
num_training_steps = len(train_loader) * num_epochs
# Warm up over the first 10% of the steps, then decay linearly.
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

accelerator = Accelerator()
# Prepare the scheduler together with the optimizer so that Accelerate keeps
# the learning-rate schedule in step with the optimizer updates it actually
# performs (e.g. across processes or when a step is skipped).
model, optimizer, train_loader, val_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, val_loader, lr_scheduler
)

# ================== STEP 7: Training Loop ==================

# Start below any reachable F1 so the first epoch always writes a checkpoint.
# Starting at 0.0 with a strict ">" comparison means a run whose validation F1
# never rises above zero saves nothing, and the later load of
# /content/best_model.pt then fails (or picks up a stale file).
best_f1 = float("-inf")
patience = 3  # epochs without improvement tolerated before stopping
early_stop_counter = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        outputs = model(**batch)
        # Use the class-weighted, label-smoothed loss instead of the loss the
        # model computes internally from the "labels" entry.
        loss = loss_fn(outputs.logits, batch["labels"])
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} avg training loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(accelerator.gather(preds).cpu().numpy())
            all_labels.extend(accelerator.gather(batch["labels"]).cpu().numpy())

    # F1 is measured on class 1 ("machine") only.
    f1 = f1_score(all_labels, all_preds, pos_label=1)
    print(f"Epoch {epoch+1} Validation F1 (machine class): {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        early_stop_counter = 0
        # Save the bare model's weights so they load into a plain model later.
        unwrapped_model = accelerator.unwrap_model(model)
        torch.save(unwrapped_model.state_dict(), "/content/best_model.pt")
        print(f"✅ Saved best model at epoch {epoch+1} (F1 = {f1:.4f})")
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"Stopping early at epoch {epoch+1} due to no F1 improvement.")
            break

# ================== STEP 8: Inference Function ==================

def predict_unlabeled(csv_path):
    """Predict a class id for every row of an unlabeled CSV file.

    The CSV must have a "content" column; a "title" column is optional and
    treated as empty when absent. Text is built as title + " " + content,
    tokenized with the module-level ``tokenizer`` and scored with the
    module-level ``model`` on ``device``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list with one predicted class id (argmax over the logits) per row.
    """
    frame = pd.read_csv(csv_path)
    blank_titles = pd.Series([""] * len(frame))
    frame["title"] = frame.get("title", blank_titles).fillna("")
    frame["content"] = frame["content"].fillna("")
    frame["text"] = frame["title"] + " " + frame["content"]

    texts = frame["text"].tolist()
    encodings = to_list(tokenizer(texts, truncation=True, padding=True, max_length=512))

    # The labels are placeholders: TextDataset needs them for its length, and
    # they are stripped from every batch before the forward pass.
    placeholder_labels = [0] * len(frame)
    loader = DataLoader(TextDataset(encodings, placeholder_labels), batch_size=batch_size)

    preds = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            inputs = {
                name: tensor.to(device)
                for name, tensor in batch.items()
                if name != "labels"
            }
            logits = model(**inputs).logits
            preds.extend(logits.argmax(dim=-1).cpu().numpy())
    return preds

# ================== STEP 9: Load Best Model and Predict ==================

# Rebuild a plain (non-accelerated) model and restore the best checkpoint.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
# map_location lets a checkpoint written from a GPU load on whatever device is
# available now (e.g. when this step is re-run on a CPU-only runtime).
model.load_state_dict(torch.load("/content/best_model.pt", map_location=device))
model.to(device)

test_path = "/content/drive/MyDrive/test_unlabeled.csv"
test_preds = predict_unlabeled(test_path)
# Map class ids back to their names: 0 -> "human", anything else -> "machine".
labels = ["human" if p == 0 else "machine" for p in test_preds]

submission = pd.DataFrame({"label": labels})
submission_path = "/content/predictions.csv"
submission.to_csv(submission_path, index=False)

# Ship the CSV inside a zip archive, stored under its bare file name.
zip_path = "/content/predictions.zip"
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(submission_path, arcname="predictions.csv")

print("✅ Saved predictions.csv and compressed to predictions.zip")
