In [1]:
# Install required libraries
!pip install transformers datasets evaluate accelerate peft bitsandbytes trl optuna
!pip install nvidia-ml-py3

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Down

In [2]:
import torch
import pandas as pd
import numpy as np
import pickle
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

2025-04-18 08:49:52.809855: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744966192.993591      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744966193.047551      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load tokenizer and dataset
base_model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(base_model)
dataset = load_dataset('ag_news')
# Preprocess function
def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=256)
# Tokenize dataset
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
# Split dataset
split_datasets = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [5]:
# Compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy}

In [6]:
# Objective function for hyperparameter tuning
def objective(trial):

    # Suggest hyperparameters
    lora_r = trial.suggest_categorical('lora_r', [4, 8, 16])
    lora_alpha = trial.suggest_categorical('lora_alpha', [8, 16, 32])
    lora_dropout = trial.suggest_float('lora_dropout', 0.05, 0.2)
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 2e-5, 3e-5])
    num_epochs = trial.suggest_int('num_epochs', 2,3)

    # Load pre-trained RoBERTa model
    model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=4)

    # LoRA configuration
    peft_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias='none',
        target_modules=['query', 'value'],
        task_type="SEQ_CLS"
    )

    # Get LoRA model
    peft_model = get_peft_model(model, peft_config)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"results_trial_{trial.number}",
        eval_strategy="steps",
        eval_steps=250,
        logging_steps=250,
        learning_rate=learning_rate,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_total_limit=1,
        optim="adamw_torch",
        load_best_model_at_end=True,
        lr_scheduler_type="cosine",
        warmup_steps=500,
        gradient_accumulation_steps=2,
        report_to="none"
    )

    # Trainer setup
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    accuracy = eval_results['eval_accuracy']

    # Log experiment details
    trial.set_user_attr("accuracy", accuracy)

    return accuracy


In [7]:
import optuna
# Run hyperparameter tuning with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

# Save experiment log
experiment_log = pd.DataFrame([{
    **study.best_params,
    "best_accuracy": study.best_value
}])
experiment_log.to_csv("experiment_log.csv", index=False)
print("Experiment log saved: experiment_log.csv")


[I 2025-04-18 08:51:20,448] A new study created in memory with name: no-name-12f2452e-1632-4834-8a98-6ba457bcd737
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
250,1.3831,1.365604,0.50575
500,0.9259,0.325141,0.891
750,0.3191,0.285537,0.903167
1000,0.2956,0.275186,0.905583
1250,0.2831,0.265026,0.90975
1500,0.2786,0.262909,0.90825
1750,0.2704,0.256629,0.913083
2000,0.2611,0.254235,0.911667
2250,0.2632,0.251777,0.913833
2500,0.2617,0.247722,0.914583


[I 2025-04-18 11:15:33,548] Trial 0 finished with value: 0.9166666666666666 and parameters: {'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1525095153494318, 'learning_rate': 2e-05, 'num_epochs': 3}. Best is trial 0 with value: 0.9166666666666666.


Best hyperparameters: {'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1525095153494318, 'learning_rate': 2e-05, 'num_epochs': 3}
Best validation accuracy: 0.9166666666666666
Experiment log saved: experiment_log.csv


In [8]:

# Train final model with best hyperparameters
best_params = study.best_params

final_model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=4)
final_peft_config = LoraConfig(
    r=best_params['lora_r'],
    lora_alpha=best_params['lora_alpha'],
    lora_dropout=best_params['lora_dropout'],
    bias='none',
    target_modules=['query', 'value'],
    task_type="SEQ_CLS"
)
final_peft_model = get_peft_model(final_model, final_peft_config)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
final_training_args = TrainingArguments(
    output_dir="final_results",
    eval_strategy="steps",
    eval_steps=250,
    logging_steps=250,
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=best_params['num_epochs'],
    weight_decay=0.01,
    save_total_limit=1,
    optim="adamw_torch",
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    gradient_accumulation_steps=2,
    report_to="none"
)
final_trainer = Trainer(
    model=final_peft_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

final_trainer.train()

  final_trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
250,1.3865,1.36301,0.566417
500,0.887,0.33103,0.88725
750,0.3241,0.289554,0.90275
1000,0.3002,0.277886,0.905
1250,0.2855,0.266949,0.908917
1500,0.2824,0.263771,0.9085
1750,0.2738,0.258688,0.911917
2000,0.2642,0.255948,0.912167
2250,0.2659,0.253197,0.9145
2500,0.2663,0.249463,0.914


TrainOutput(global_step=5061, training_loss=0.3520867928665303, metrics={'train_runtime': 8561.218, 'train_samples_per_second': 37.845, 'train_steps_per_second': 0.591, 'total_flos': 4.304570301284352e+16, 'train_loss': 0.3520867928665303, 'epoch': 2.9985185185185186})

In [10]:
# Load dataset object from pickle file
with open("/kaggle/input/tests-dataset/test_unlabelled.pkl", "rb") as f:
    test_data = pickle.load(f)

# Convert loaded data into HuggingFace Dataset format
test_dataset = Dataset.from_dict({"text": test_data["text"]})

# Define preprocessing function (consistent with your training preprocessing)
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenizer to the test dataset
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create PyTorch DataLoader for batching during inference
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Set device explicitly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
final_peft_model.to(device)

# Prediction loop
final_peft_model.eval()
all_predictions = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = final_peft_model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(preds.cpu().numpy())

# Save predictions to CSV
submission_df = pd.DataFrame({
    'ID': range(len(all_predictions)),
    'Label': all_predictions
})

submission_df.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

100%|██████████| 125/125 [00:28<00:00,  4.35it/s]

Submission file created: submission.csv



