**Configuration Summary:**
- Model: roberta-base + LoRA
- LoRA Rank: 8, Alpha: 16
- Dropout: 0.1
- Target Modules: query, value
- Epochs: 5
- Learning Rate: 2.5e-5
- Max Length: 128
- Warmup Ratio: 0.1
- Scheduler: linear
- Batch Size: 64 (train), 32 (eval)
- Seed: 42
- Weight Decay: 0.01
- No label smoothing

In [None]:
!pip install -q transformers datasets peft accelerate evaluate

In [None]:
#Load and tokenize dataset
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import RobertaTokenizer

# Download AG News and carve a validation split out of its training portion:
# 10% is held out, with a fixed seed so the split is repeatable across runs.
dataset = load_dataset("ag_news")
split_data = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_data, val_data = split_data["train"], split_data["test"]

# Tokenizer loaded from the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(batch):
    """Tokenize the ``text`` field of *batch* to a fixed length of 128 tokens.

    Longer sequences are truncated and shorter ones are padded up to 128,
    so every encoded example comes back with the same length.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **encoding_options)

def _to_model_inputs(split):
    """Tokenize *split* and reshape its columns into what the Trainer consumes."""
    encoded = split.map(tokenize, batched=True)
    # The target column is renamed from "label" to "labels".
    encoded = encoded.rename_column("label", "labels")
    # The raw text is no longer needed once it has been tokenized.
    encoded = encoded.remove_columns(["text"])
    return encoded.with_format("torch")


train_dataset = _to_model_inputs(train_data)
val_dataset = _to_model_inputs(val_data)

In [None]:
#Setup LoRA model
from transformers import RobertaForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# Sequence-classification model with a 4-way output head, loaded from "roberta-base".
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

# Turn off gradients for every parameter under model.base_model.
for param in model.base_model.parameters():
    param.requires_grad = False

# LoRA settings: rank 8, alpha 16, dropout 0.1, applied to the "query" and
# "value" modules, with no bias terms adapted.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, config)

# Report how many parameters still receive gradients versus the overall count.
trainable_count = 0
total_count = 0
for weight in model.parameters():
    size = weight.numel()
    total_count += size
    if weight.requires_grad:
        trainable_count += size
print("Trainable parameters:", trainable_count)
print("Total parameters:", total_count)

In [None]:
#Training
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, set_seed
import evaluate
import numpy as np

import inspect

import torch

# Seed the random number generators so runs are repeatable.
set_seed(42)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name was later removed. The pip install in this notebook
# is unpinned, so pick whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./final_model",
    learning_rate=2.5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    report_to="none",
    seed=42,
    # Evaluate on the validation set every 250 optimisation steps.
    eval_steps=250,
    **{_eval_strategy_key: "steps"},
)

# Accuracy metric loaded once and reused for every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in *eval_pred*.

    *eval_pred* unpacks into ``(logits, labels)``; the predicted class is the
    index of the largest logit along the last axis.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=labels)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever name the installed
# Trainer accepts so the cell runs on both old and new versions.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # The collator is passed explicitly, so batching does not depend on which
    # keyword carries the tokenizer.
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
#Plot loss and accuracy
import matplotlib.pyplot as plt

# The Trainer's log history is a list of dicts; training-loss records carry a
# "loss" key and evaluation records carry an "eval_accuracy" key.
log = trainer.state.log_history

loss_records = [record for record in log if "loss" in record and "step" in record]
steps = [record["step"] for record in loss_records]
losses = [record["loss"] for record in loss_records]

eval_records = [record for record in log if "eval_accuracy" in record]
accs = [record["eval_accuracy"] for record in eval_records]
eval_steps = [record["step"] for record in eval_records]

plt.figure(figsize=(10, 5))
plt.plot(steps, losses, label="Train Loss")
# Only draw the accuracy curve when at least one evaluation was logged.
if eval_steps:
    plt.plot(eval_steps, accs, label="Eval Accuracy")
plt.xlabel("Step")
plt.ylabel("Metric")
plt.title("Training Metrics")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#Metrics on validation set
# Run one evaluation pass over the validation split and print every metric,
# rounding floats to four decimal places.
results = trainer.evaluate(val_dataset)
print("\n Validation Metrics:")
for metric_name, metric_value in results.items():
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.4f}")
    else:
        print(f"{metric_name}: {metric_value}")

In [None]:
#Generate submission file
from google.colab import files

# Ask the user to upload the pickled, unlabelled dataset through the Colab UI.
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when the upload
# does not contain a .pkl file.
pkl_files = [name for name in uploaded if name.endswith(".pkl")]
if not pkl_files:
    raise ValueError(
        "No .pkl file found in the upload; got: " + (", ".join(uploaded) or "nothing")
    )
pkl_file = pkl_files[0]

# NOTE(security): unpickling executes arbitrary code embedded in the file.
# Only upload pickle files that come from a trusted source.
df_unlabelled = pd.read_pickle(pkl_file)
# The pickle may hold an object exposing `to_pandas` rather than a DataFrame;
# convert it so the column access below works either way.
if hasattr(df_unlabelled, "to_pandas"):
    df_unlabelled = df_unlabelled.to_pandas()

# Tokenize with the same helper used for the training data so the inputs are
# encoded identically. preserve_index=False keeps a non-default DataFrame
# index from being added to the dataset as an extra column.
submission_dataset = Dataset.from_pandas(df_unlabelled[["text"]], preserve_index=False)
submission_dataset = submission_dataset.map(tokenize, batched=True)
submission_dataset = submission_dataset.remove_columns(["text"]).with_format("torch")

# Predicted class = index of the largest logit for each example.
preds = trainer.predict(submission_dataset)
labels = np.argmax(preds.predictions, axis=-1)

# Use the dataset's own "id" column when present, otherwise number rows from 0.
submission = pd.DataFrame({
    "ID": df_unlabelled["id"] if "id" in df_unlabelled.columns else np.arange(len(labels)),
    "Label": labels,
})
submission.to_csv("submission.csv", index=False, encoding="utf-8")
files.download("submission.csv")