# AG News Classification with LoRA Fine-tuning
This notebook implements an optimized approach to fine-tuning RoBERTa with LoRA for news classification

# Install required libraries

In [1]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-non

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import collections
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    get_scheduler
)
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import load_dataset, Dataset
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm
import random
import gc


In [3]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)


## Load and Preprocess Data

In [4]:
# Model configuration
base_model = 'roberta-base'

# Load AG News dataset
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Better preprocessing function with improved tokenization
def preprocess(examples):
    # Tokenize with truncation and padding but no max length yet
    # We'll dynamically pad in the DataCollator
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding=False,  # Dynamic padding in data collator
        max_length=256,  # Increased from default to capture more context
        return_tensors=None
    )
    return tokenized

# Process the dataset
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Extract class information
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"Number of labels: {num_labels}")
print(f"Class labels: {class_names}")

# Create label mappings
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

# Create data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Number of labels: 4
Class labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model

In [5]:
# Load the base model with appropriate configuration
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Check device availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


## Dataset Preparation

In [6]:
# Split dataset with stratification to ensure class balance
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="labels")
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")

# Calculate label distributions using Counter instead of value_counts
train_labels = train_dataset['labels']
eval_labels = eval_dataset['labels']

train_label_counts = collections.Counter(train_labels)
eval_label_counts = collections.Counter(eval_labels)

print("Training label distribution:")
for label_id, count in train_label_counts.items():
    print(f"  Class {label_id} ({id2label[label_id]}): {count} examples")

print("Validation label distribution:")
for label_id, count in eval_label_counts.items():
    print(f"  Class {label_id} ({id2label[label_id]}): {count} examples")


Training examples: 108000
Validation examples: 12000
Training label distribution:
  Class 1 (Sports): 27000 examples
  Class 3 (Sci/Tech): 27000 examples
  Class 2 (Business): 27000 examples
  Class 0 (World): 27000 examples
Validation label distribution:
  Class 2 (Business): 3000 examples
  Class 0 (World): 3000 examples
  Class 3 (Sci/Tech): 3000 examples
  Class 1 (Sports): 3000 examples


## Optimized LoRA Configuration

In [7]:
# Enhanced LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,  # Increased alpha for stronger updates
    lora_dropout=0.1,  # Increased dropout for better regularization
    bias="none",
    target_modules=["query", "value"],
    task_type=TaskType.SEQ_CLS,
)

# Create PEFT model
peft_model = get_peft_model(model, peft_config)
peft_model.to(device)

print('PEFT Model Parameters:')
peft_model.print_trainable_parameters()


PEFT Model Parameters:
trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


## Optimized Training Setup

In [8]:
# Enhanced metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate comprehensive metrics
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    # Add per-class metrics
    class_report = classification_report(labels, preds, target_names=class_names, output_dict=True)

    # Extract per-class F1 scores
    class_f1s = {f"f1_{class_names[i]}": class_report[name]['f1-score']
                for i, name in enumerate(class_names)}

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        **class_f1s
    }

    return metrics


In [9]:
# Setup optimized training arguments
# Using the correct parameter names for the version of transformers library
output_dir = "results"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to="wandb",  # Set to "wandb" if you want to use Weights & Biases
    eval_strategy='steps',  # Instead of evaluation_strategy
    eval_steps=500,  # Evaluate more frequently
    logging_steps=100,
    save_strategy='steps',  # Instead of save_strategy
    save_steps=500,
    save_total_limit=3,  # Keep only the best 3 checkpoints
    learning_rate=2e-4,  # Higher learning rate for LoRA
    num_train_epochs=5,  # Train longer
    max_steps=-1,  # Use epochs instead of steps
    per_device_train_batch_size=32,  # Increased batch size
    per_device_eval_batch_size=64,
    weight_decay=0.01,  # L2 regularization
    warmup_ratio=0.1,  # Gradual warmup
    optim="adamw_torch",  # Better optimizer
    fp16=torch.cuda.is_available(), # Enable fp16 only if cuda is available
    gradient_accumulation_steps=2,  # Simulate larger batches
    load_best_model_at_end=True,  # Load best model after training
    metric_for_best_model="accuracy",
    greater_is_better=True,
    seed=42
)


In [10]:
# Define trainer with early stopping
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Training

In [11]:
# Free up memory
gc.collect()
torch.cuda.empty_cache()

# Start training
print("Starting training...")
train_result = trainer.train()

# Save the final model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mms15532[0m ([33mms15532-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 World,F1 Sports,F1 Business,F1 Sci/tech
500,0.2858,0.267206,0.90875,0.908871,0.90875,0.9084,0.913117,0.975538,0.864828,0.880117
1000,0.2453,0.247861,0.915,0.918517,0.915,0.915408,0.92121,0.977815,0.878434,0.884175
1500,0.2152,0.20833,0.9275,0.928166,0.9275,0.927538,0.93643,0.982415,0.892652,0.898654
2000,0.196,0.20101,0.9305,0.930414,0.9305,0.930376,0.94197,0.981716,0.895962,0.901857
2500,0.198,0.186604,0.93375,0.934124,0.93375,0.93381,0.946746,0.984891,0.898473,0.905129
3000,0.1974,0.184886,0.935583,0.935724,0.935583,0.935587,0.947938,0.983786,0.902078,0.908548
3500,0.1771,0.185424,0.93525,0.936791,0.93525,0.935413,0.947977,0.986169,0.900683,0.906824
4000,0.1763,0.176158,0.937917,0.938221,0.937917,0.937862,0.949967,0.98719,0.904754,0.909536
4500,0.1738,0.174441,0.939167,0.9397,0.939167,0.939202,0.952767,0.98621,0.905488,0.912343
5000,0.174,0.172495,0.939167,0.939552,0.939167,0.939199,0.952093,0.987182,0.904512,0.913008


('results/tokenizer_config.json',
 'results/special_tokens_map.json',
 'results/vocab.json',
 'results/merges.txt',
 'results/added_tokens.json')

In [12]:
# Show training results
print("Training completed!")
print(f"Training loss: {train_result.training_loss}")
print(f"Training time: {train_result.metrics['train_runtime']} seconds")

# Final evaluation
final_metrics = trainer.evaluate()
print("Final evaluation metrics:")
for key, value in final_metrics.items():
    print(f"{key}: {value:.4f}")


Training completed!
Training loss: 0.21225497331150056
Training time: 2631.4519 seconds


Final evaluation metrics:
eval_loss: 0.1687
eval_accuracy: 0.9415
eval_precision: 0.9419
eval_recall: 0.9415
eval_f1: 0.9416
eval_f1_World: 0.9537
eval_f1_Sports: 0.9872
eval_f1_Business: 0.9091
eval_f1_Sci/Tech: 0.9163
eval_runtime: 21.8626
eval_samples_per_second: 548.8820
eval_steps_per_second: 8.5990
epoch: 4.9973


## Model Evaluation

In [13]:
# Reload best model
best_model_path = output_dir
print("Reloading the best model for detailed evaluation...")
model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)
# Load PEFT adapter weights onto the base model
peft_model = PeftModel.from_pretrained(model, best_model_path)
peft_model.to(device)
peft_model.eval()

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """Enhanced evaluation function with additional metrics and optimizations"""
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    all_predictions = []
    all_labels = []

    # Track progress with tqdm
    progress_bar = tqdm(eval_dataloader, desc="Evaluating")

    # Inference loop
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = inference_model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            all_labels.append(batch["labels"].cpu())

    # Concatenate predictions
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        all_labels = torch.cat(all_labels, dim=0)

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_predictions)
        precision = precision_score(all_labels, all_predictions, average='weighted')
        recall = recall_score(all_labels, all_predictions, average='weighted')
        f1 = f1_score(all_labels, all_predictions, average='weighted')

        print(f"Evaluation Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Detailed classification report
        print("\nDetailed Classification Report:")
        print(classification_report(all_labels, all_predictions, target_names=class_names))

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }, all_predictions
    else:
        return all_predictions

# Evaluate on validation set
print("Evaluating model on validation set:")
eval_metrics, _ = evaluate_model(peft_model, eval_dataset, True, 64, data_collator)



Reloading the best model for detailed evaluation...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating model on validation set:


Evaluating: 100%|██████████| 188/188 [01:21<00:00,  2.30it/s]

Evaluation Results:
Accuracy: 0.9416
Precision: 0.9420
Recall: 0.9416
F1 Score: 0.9417

Detailed Classification Report:
              precision    recall  f1-score   support

       World       0.97      0.94      0.95      3000
      Sports       0.99      0.99      0.99      3000
    Business       0.92      0.90      0.91      3000
    Sci/Tech       0.90      0.93      0.92      3000

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000






## Run Inference on Unlabelled Test Dataset

In [14]:
# Load test dataset
print("Loading unlabelled test dataset...")
try:
    # First try to load as pickle file (it might be a pandas DataFrame)
    unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
    # If it's a DataFrame, convert to HF Dataset
    if isinstance(unlabelled_dataset, pd.DataFrame):
        test_dataset = Dataset.from_pandas(unlabelled_dataset)
    else:
        # If it's already a Dataset object, use it directly
        test_dataset = unlabelled_dataset
except Exception as e:
    print(f"Error loading pickle: {e}")
    # Alternative: Try loading as a Dataset directly
    test_dataset = load_dataset('pickle', data_files='test_unlabelled.pkl', split='train')

# Preprocess test data
test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Run inference
print("Running inference on test dataset...")
preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)

# Create submission file
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})

# Save predictions
submission_path = os.path.join(output_dir, "inference_output.csv")
df_output.to_csv(submission_path, index=False)
print(f"Inference complete. Predictions saved to {submission_path}")

# Show sample of predictions
print("\nSample of predictions:")
print(df_output.head(10))

print("\nPrediction counts per class:")
pred_counts = collections.Counter(df_output['Label'])
for class_id, count in sorted(pred_counts.items()):
    print(f"Class {class_id} ({id2label[class_id]}): {count} predictions")


Loading unlabelled test dataset...


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Running inference on test dataset...


Evaluating: 100%|██████████| 250/250 [01:13<00:00,  3.39it/s]

Inference complete. Predictions saved to results/inference_output.csv

Sample of predictions:
   ID  Label
0   0      3
1   1      0
2   2      0
3   3      3
4   4      1
5   5      1
6   6      3
7   7      0
8   8      3
9   9      3

Prediction counts per class:
Class 0 (World): 1571 predictions
Class 1 (Sports): 1994 predictions
Class 2 (Business): 1731 predictions
Class 3 (Sci/Tech): 2704 predictions



