In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# ===== 1. Load dataset =====
# Download (or reuse the local cache of) every split of DroidCollection.
dataset = load_dataset("project-droid/DroidCollection")

# ===== 2. Select splits =====
# Pull out the three splits the rest of the script works on.
train_dataset, dev_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "dev", "test")
)

# ===== 3. Remap string labels to 0/1 =====
def remap_labels(example):
    """Convert the string ``Label`` of one example into a binary class id.

    ``"HUMAN_GENERATED"`` becomes ``0``; every other value becomes ``1``.
    The example is updated in place and the same object is returned.
    """
    if example["Label"] == "HUMAN_GENERATED":
        binary_label = 0
    else:
        binary_label = 1
    example["Label"] = binary_label
    return example

# Replace the string labels with 0/1 ids in every split.
train_dataset, dev_dataset, test_dataset = [
    split.map(remap_labels)
    for split in (train_dataset, dev_dataset, test_dataset)
]

# ===== 4. Load tokenizer =====
# Hub id of the checkpoint whose tokenizer is used for all splits.
TOKENIZER_CHECKPOINT = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# ===== 5. Tokenization function =====
def tokenize_fn(examples):
    """Tokenize a batch of examples; intended for ``Dataset.map(batched=True)``.

    Args:
        examples: Batch mapping holding a ``"Code"`` entry (the texts passed
            to the module-level ``tokenizer``) and a ``"Label"`` entry.

    Returns:
        The tokenizer output, with every text truncated or padded to exactly
        512 tokens, plus the batch's ``"Label"`` values under the same key.
    """
    encoded = tokenizer(
        examples["Code"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    # Carry the labels through alongside the encoded inputs.
    encoded["Label"] = examples["Label"]
    return encoded

# ===== 6. Tokenize datasets =====
# Run the tokenizer over each split in batches.
train_dataset, dev_dataset, test_dataset = [
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
]





# ===== 7. Rename label column and set torch format =====
# NOTE(review): "labels" is presumably the key the downstream model/trainer
# expects for the target -- confirm against the training code.
train_dataset, dev_dataset, test_dataset = [
    split.rename_column("Label", "labels")
    for split in (train_dataset, dev_dataset, test_dataset)
]

# Expose only the encoded inputs and the target, returned as torch tensors.
columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, dev_dataset, test_dataset):
    # set_format changes the dataset's output format in place.
    split.set_format(type="torch", columns=columns)
