In [1]:
import os
import re
import numpy as np
import pandas as pd
import pickle
from typing import Literal, Union, List, Any, Dict
import matplotlib.pyplot as plt
from tqdm.auto import tqdm 

import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary

from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, BitsAndBytesConfig, Gemma3ForCausalLM, DataCollatorForLanguageModeling
from peft import PeftModel, get_peft_model, LoraConfig, prepare_model_for_kbit_training, TaskType
import evaluate
from trl import SFTTrainer

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score

In [2]:
if torch.backends.mps.is_available():
    if torch.backends.mps.is_built():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")

torch.manual_seed(42)
if device=="mps":
    torch.mps.manual_seed(42)

print("Device: ", device)

Device:  mps


# Load Data

In [3]:
data = pd.read_csv("datasets/processed/prepared_data.csv", index_col=[0])
data.head(3)

Unnamed: 0,language,domain,topic,text,category,split,processed_text,processed_lemmas
0,tr-TR,wikibooks,"geography, turkey",Türkiye'nin üç tarafı denizlerle çevrilidir: B...,6,train,türkiye'nin üç tarafı denizlerle çevrilidir ba...,türkiye üç taraf deniz çevri batı ege deniz ku...
1,tr-TR,wikibooks,"world war ii, submarines",Savaşın başlangıcında çoğunlukla denizin yüzey...,0,train,savaşın başlangıcında denizin yüzeyinde seyaha...,savaş başlangıç deniz yüzey seyahat etmek rada...
2,tr-TR,wikivoyage,"natural wonders, hill stations in india","Ancak, kış aylarında farklı bir güzelliği ve ç...",5,train,ancak kış aylarında farklı güzelliği çekiciliğ...,kış ay fark güzel çekici var birçok tepe şehir...


In [4]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data[["split","text","category"]], preserve_index=False)

# Split dataset into train, test, validation
dataset_dict = DatasetDict({
    "train": dataset.filter(lambda x: x["split"] == "train"),
    "test": dataset.filter(lambda x: x["split"] == "test"),
    "validation": dataset.filter(lambda x: x["split"] == "validation"),
})

# Remove the 'split' column as it's no longer needed
for split in dataset_dict.keys():
    dataset_dict[split] = dataset_dict[split].remove_columns(column_names=["split"])
    dataset_dict[split] = dataset_dict[split].rename_column("category", "labels")
dataset_dict

Filter:   0%|          | 0/4795 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4795 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4795 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3595
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 857
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 343
    })
})

In [5]:
id2category = {
    0: "science/technology",
    1: "travel",
    2: "politics",
    3: "sports",
    4: "health",
    5: "entertainment",
    6: "geography"
}
category2id = {
    "science/technology": 0,
    "travel": 1,
    "politics": 2,
    "sports": 3,
    "health": 4,
    "entertainment": 5,
    "geography": 6
}

# Retraning All Parameters

In [None]:
# Initialize Teacher Model 
teacher_model_id = 'xlm-roberta-large'
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_id,
    num_labels=7,
    id2label=id2category,
    label2id=category2id
)
tokenizer_teacher = AutoTokenizer.from_pretrained(teacher_model_id)
# tokenizer_teacher.add_special_tokens({'pad_token': '[PAD]'})

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1

In [166]:
# Tokenize the dataset
tokenized_data = dataset_dict.map(lambda x: tokenizer_teacher(x["text"]), batched=True)

# Define Accuracy Metric
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/3595 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [167]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    return {"accuracy": acc}

In [168]:
# Training HyperParameters
lr = 2e-4
weight_decay = 0.01
batch_size = 16
num_epochs = 10

teacher_training_args = TrainingArguments(
    # output_dir="bert-category-classifier-teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer_teacher)
# Use the custom compute_loss function in Trainer
trainer = Trainer(
    model=teacher_model,
    args=teacher_training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer_teacher,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9458,1.900039,0.299883
2,1.9171,1.852447,0.299883
3,1.9125,1.87659,0.299883
4,1.9017,1.879262,0.299883
5,1.9015,1.86976,0.299883
6,1.8937,1.881078,0.155193
7,1.8925,1.864643,0.299883
8,1.8858,1.87018,0.299883
9,1.885,1.863566,0.299883
10,1.8821,1.871811,0.299883


TrainOutput(global_step=2250, training_loss=1.9017613254123265, metrics={'train_runtime': 1660.2411, 'train_samples_per_second': 21.653, 'train_steps_per_second': 1.355, 'total_flos': 3894661567799532.0, 'train_loss': 1.9017613254123265, 'epoch': 10.0})

In [219]:
# Forward pass
tokenized_val_data = tokenizer_teacher(
    dataset_dict["validation"]["text"], 
    padding=True, 
    truncation=True,
    return_tensors="pt",
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_teacher, return_tensors="pt")
batch_inputs = data_collator(tokenized_val_data)

teacher_model.eval()
with torch.no_grad():
    teacher_outputs = teacher_model.to("cpu")(**batch_inputs)
teacher_logits = teacher_outputs.logits
teacher_preds = torch.softmax(teacher_logits, dim=1).argmax(dim=1)

In [224]:
compute_metrics(eval_pred=(teacher_logits.numpy(), dataset_dict["validation"]["labels"]))

{'accuracy': 0.27988338192419826}

In [228]:
save_directory = "./fine_tuned_models/fine_tuned_roberta_teacher_model"

# Save model
teacher_model.save_pretrained(save_directory)
# Save tokenizer
tokenizer_teacher.save_pretrained(save_directory)

('./fine_tuned_models/fine_tuned_roberta_teacher_model/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model/sentencepiece.bpe.model',
 './fine_tuned_models/fine_tuned_roberta_teacher_model/added_tokens.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model/tokenizer.json')

# Transfer Learning

## a. Fine Tuning only classifier layer of Teacher Model 2

In [None]:
# Initialize Teacher Model 
teacher_model_id2 = 'xlm-roberta-large'
teacher_model2 = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_id2,
    num_labels=7,
    id2label=id2category,
    label2id=category2id
)
tokenizer_teacher2 = AutoTokenizer.from_pretrained(teacher_model_id2)
# tokenizer_teacher2.add_special_tokens({'pad_token': '[PAD]'})

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1

In [170]:
teacher_model2

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [171]:
for name, param in teacher_model2.named_parameters():
    print(name)

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [172]:
# Keeping only classifier layer trainable and the rest will be remained frozen
for name, param in teacher_model2.named_parameters():
    if "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [173]:
# Tokenize the dataset
tokenized_data = dataset_dict.map(lambda x: tokenizer_teacher2(x["text"]), batched=True)

# Define Accuracy Metric
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/3595 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [174]:
# Training HyperParameters
lr = 1e-3
weight_decay = 0.01
batch_size = 16
num_epochs = 10

teacher_training_args2 = TrainingArguments(
    # output_dir="bert-category-classifier-teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

data_collator2 = DataCollatorWithPadding(tokenizer=tokenizer_teacher2)
# Use the custom compute_loss function in Trainer
trainer2 = Trainer(
    model=teacher_model2,
    args=teacher_training_args2,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer_teacher2,
    compute_metrics=compute_metrics,
    data_collator=data_collator2
)

trainer2.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.8941,1.694619,0.389732
2,1.6454,1.4474,0.408401
3,1.4758,1.229917,0.627771
4,1.3709,1.194873,0.572929
5,1.288,0.982774,0.694282
6,1.2256,1.036065,0.638273
7,1.1694,0.92805,0.708285
8,1.1062,0.879331,0.726954
9,1.0818,0.868958,0.747958
10,1.0503,0.8385,0.746791


TrainOutput(global_step=2250, training_loss=1.3307618476019965, metrics={'train_runtime': 322.4626, 'train_samples_per_second': 111.486, 'train_steps_per_second': 6.978, 'total_flos': 3894661567799532.0, 'train_loss': 1.3307618476019965, 'epoch': 10.0})

In [216]:
# Forward pass
tokenized_val_data = tokenizer_teacher2(
    dataset_dict["validation"]["text"], 
    padding=True, 
    truncation=True,
    return_tensors="pt",
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_teacher2, return_tensors="pt")
batch_inputs = data_collator(tokenized_val_data)

teacher_model2.eval()
with torch.no_grad():
    teacher_outputs2 = teacher_model2.to("cpu")(**batch_inputs)
teacher_logits2 = teacher_outputs2.logits
teacher_preds2 = torch.softmax(teacher_logits2, dim=1).argmax(dim=1)

In [218]:
compute_metrics(eval_pred=(teacher_logits2.numpy(), dataset_dict["validation"]["labels"]))

{'accuracy': 0.7638483965014577}

In [227]:
save_directory = "./fine_tuned_models/fine_tuned_roberta_teacher_model2"

# Save model
teacher_model2.save_pretrained(save_directory)
# Save tokenizer
tokenizer_teacher2.save_pretrained(save_directory)

('./fine_tuned_models/fine_tuned_roberta_teacher_model2/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model2/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model2/sentencepiece.bpe.model',
 './fine_tuned_models/fine_tuned_roberta_teacher_model2/added_tokens.json',
 './fine_tuned_models/fine_tuned_roberta_teacher_model2/tokenizer.json')

## b. Fine Tuning Student Model with Fine-Tuned Teacher Model 2

In [181]:
# Initialize Student Model
student_model_id = 'bert-base-multilingual-cased'
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_model_id,
    num_labels=7,
    id2label=id2category,
    label2id=category2id
)
tokenizer_student = AutoTokenizer.from_pretrained(student_model_id)
tokenizer_student.add_special_tokens({'pad_token': '[PAD]'})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0

In [182]:
student_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [183]:
for name, param in student_model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [184]:
# Freeze all student model parameters except pooler and classifier layers
for name, param in student_model.named_parameters():
    if ("pooler" in name) | ("classifier" in name):
        param.requires_grad = True
    else:
        param.requires_grad = False

In [185]:
# Tokenize the dataset
tokenized_data = dataset_dict.map(lambda x: tokenizer_student(x["text"]), batched=True)

# Define Accuracy Metric
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/3595 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [189]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.softmax(torch.tensor(logits, dtype=torch.float32), dim=1).argmax(dim=1)
    acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    return {"accuracy": acc}

# Custom Trainer Class for Knowledge Distillation
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Get student model's input features
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        labels = inputs["labels"]
        
        # Get teacher's logits
        with torch.no_grad():
            teacher_logits = self.teacher_model(input_ids).logits

        # Forward pass through student model
        student_outputs = model(input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits

        # Compute the distillation loss (KL Divergence)
        student_probs = torch.softmax(student_logits / 2.0, dim=1)  # Temperature=2
        teacher_probs = torch.softmax(teacher_logits / 2.0, dim=1)
        if num_items_in_batch is not None:
            kl_loss = F.kl_div(student_probs.log(), teacher_probs, reduction='batchmean')
        else:
            kl_loss = F.kl_div(student_probs.log(), teacher_probs, reduction="none")
        
        # Compute Cross-Entropy loss
        ce_loss = torch.nn.functional.cross_entropy(student_logits, labels)
        
        # Compute weighted fina loss
        alpha = 0.5
        final_loss = alpha * kl_loss + (1-alpha) * ce_loss

        return (final_loss, student_outputs) if return_outputs else final_loss

In [None]:
# Training HyperParameters
lr = 1e-3
weight_decay = 0.01
batch_size = 16
num_epochs = 10

training_args_student = TrainingArguments(
    # output_dir="bert-category-classifier-student",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

data_collator_student = DataCollatorWithPadding(tokenizer=tokenizer_student)
# Use the custom compute_loss function in Trainer
trainer_student = DistillationTrainer(
    model=student_model,
    args=training_args_student,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator_student,
    teacher_model=teacher_model2
)

trainer_student.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6619,0.394438,0.786464
2,0.5602,0.371054,0.803967
3,0.5396,0.367315,0.807468
4,0.5242,0.380781,0.786464
5,0.5254,0.360209,0.812135
6,0.5166,0.365204,0.805134
7,0.4991,0.358351,0.815636
8,0.5003,0.360284,0.808635
9,0.4927,0.34969,0.819137
10,0.4805,0.348302,0.822637


TrainOutput(global_step=2250, training_loss=0.5300369160970052, metrics={'train_runtime': 416.1976, 'train_samples_per_second': 86.377, 'train_steps_per_second': 5.406, 'total_flos': 1152083474035200.0, 'train_loss': 0.5300369160970052, 'epoch': 10.0})

In [192]:
# Forward pass
tokenized_val_data = tokenizer_student(
    dataset_dict["validation"]["text"], 
    padding=False, 
    truncation=True,
    # return_tensors="pt",
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_student, return_tensors="pt")
batch_inputs = data_collator(tokenized_val_data)

student_model.eval()
with torch.no_grad():
    student_outputs = student_model.to("cpu")(**batch_inputs)
student_logits = student_outputs.logits
student_preds = torch.softmax(student_logits, dim=1).argmax(dim=1)

In [199]:
compute_metrics(eval_pred=(student_logits.numpy(), dataset_dict["validation"]["labels"]))

{'accuracy': 0.8688046647230321}

In [226]:
save_directory = "./fine_tuned_models/fine_tuned_bert_student_model"

# Save model
student_model.save_pretrained(save_directory)
# Save tokenizer
tokenizer_student.save_pretrained(save_directory)

('./fine_tuned_models/fine_tuned_bert_student_model/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_bert_student_model/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_bert_student_model/vocab.txt',
 './fine_tuned_models/fine_tuned_bert_student_model/added_tokens.json',
 './fine_tuned_models/fine_tuned_bert_student_model/tokenizer.json')

# Parameter Efficient Fine-Tuning (PEFT)

## a. Low-Rank Adaptation (LoRA)

In [310]:
# Initialize Teacher Model 
model_wout_lora_id = 'xlm-roberta-large'
model_wout_lora = AutoModelForSequenceClassification.from_pretrained(
    model_wout_lora_id,
    num_labels=7,
    id2label=id2category,
    label2id=category2id
)
tokenizer_lora = AutoTokenizer.from_pretrained(model_wout_lora_id)
# tokenizer_teacher2.add_special_tokens({'pad_token': '[PAD]'})

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [311]:
for name, param in model_wout_lora.named_parameters():
    print(name)

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [312]:
# Keeping only classifier layer trainable and the rest will be remained frozen
for name, param in model_wout_lora.named_parameters():
    if "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [313]:
for name, module in model_wout_lora.named_modules():
    print(name)


roberta
roberta.embeddings
roberta.embeddings.word_embeddings
roberta.embeddings.position_embeddings
roberta.embeddings.token_type_embeddings
roberta.embeddings.LayerNorm
roberta.embeddings.dropout
roberta.encoder
roberta.encoder.layer
roberta.encoder.layer.0
roberta.encoder.layer.0.attention
roberta.encoder.layer.0.attention.self
roberta.encoder.layer.0.attention.self.query
roberta.encoder.layer.0.attention.self.key
roberta.encoder.layer.0.attention.self.value
roberta.encoder.layer.0.attention.self.dropout
roberta.encoder.layer.0.attention.output
roberta.encoder.layer.0.attention.output.dense
roberta.encoder.layer.0.attention.output.LayerNorm
roberta.encoder.layer.0.attention.output.dropout
roberta.encoder.layer.0.intermediate
roberta.encoder.layer.0.intermediate.dense
roberta.encoder.layer.0.intermediate.intermediate_act_fn
roberta.encoder.layer.0.output
roberta.encoder.layer.0.output.dense
roberta.encoder.layer.0.output.LayerNorm
roberta.encoder.layer.0.output.dropout
roberta.encod

In [314]:
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"]
)

model_lora = get_peft_model(model_wout_lora, lora_config)
model_lora.print_trainable_parameters()

trainable params: 2,629,639 || all params: 562,527,246 || trainable%: 0.4675


In [None]:
for name, param in model_lora.named_parameters():
    print(f"Reuqires grad: {param.requires_grad} --> {name}")

Reuqires grad: False --> base_model.model.roberta.embeddings.word_embeddings.weight
Reuqires grad: False --> base_model.model.roberta.embeddings.position_embeddings.weight
Reuqires grad: False --> base_model.model.roberta.embeddings.token_type_embeddings.weight
Reuqires grad: False --> base_model.model.roberta.embeddings.LayerNorm.weight
Reuqires grad: False --> base_model.model.roberta.embeddings.LayerNorm.bias
Reuqires grad: False --> base_model.model.roberta.encoder.layer.0.attention.self.query.base_layer.weight
Reuqires grad: False --> base_model.model.roberta.encoder.layer.0.attention.self.query.base_layer.bias
Reuqires grad: True --> base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
Reuqires grad: True --> base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
Reuqires grad: False --> base_model.model.roberta.encoder.layer.0.attention.self.key.weight
Reuqires grad: False --> base_model.model.roberta.encoder.layer.0.at

In [316]:
# Tokenize the dataset
tokenized_data = dataset_dict.map(lambda x: tokenizer_lora(x["text"]), batched=True)

# Define Accuracy Metric
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/3595 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [317]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.softmax(torch.tensor(logits, dtype=torch.float32), dim=1).argmax(dim=1)
    return accuracy.compute(predictions=preds, references=labels)

In [318]:
# Training HyperParameters
lr = 1e-3
weight_decay = 0.01
batch_size = 16
num_epochs = 10

training_args_w_lora = TrainingArguments(
    # output_dir="bert-category-classifier-teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

data_collator_w_lora = DataCollatorWithPadding(tokenizer=tokenizer_lora)
# Use the custom compute_loss function in Trainer
trainer_w_lora = Trainer(
    model=model_lora,
    args=training_args_w_lora,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer_lora,
    compute_metrics=compute_metrics,
    data_collator=data_collator_w_lora
)

trainer_w_lora.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.0047,1.939613,0.299883
2,1.9328,1.859379,0.299883
3,1.9243,1.863775,0.299883
4,1.9084,1.903535,0.155193
5,1.9065,1.894046,0.299883
6,1.8994,1.880758,0.155193
7,1.8951,1.863283,0.299883
8,1.8878,1.873966,0.299883
9,1.8789,1.862162,0.299883
10,1.8793,1.869251,0.299883


TrainOutput(global_step=2250, training_loss=1.9117214898003472, metrics={'train_runtime': 766.0434, 'train_samples_per_second': 46.929, 'train_steps_per_second': 2.937, 'total_flos': 3928421052096984.0, 'train_loss': 1.9117214898003472, 'epoch': 10.0})

In [319]:
save_lora_model_directory = "./fine_tuned_models/fine_tuned_roberta_w_lora"

# Save model
model_lora.save_pretrained(save_lora_model_directory)
# Save tokenizer
tokenizer_lora.save_pretrained(save_lora_model_directory)

('./fine_tuned_models/fine_tuned_roberta_w_lora/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_roberta_w_lora/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_roberta_w_lora/sentencepiece.bpe.model',
 './fine_tuned_models/fine_tuned_roberta_w_lora/added_tokens.json',
 './fine_tuned_models/fine_tuned_roberta_w_lora/tokenizer.json')

In [320]:
# Load base model first
base_roberta_model = AutoModelForSequenceClassification.from_pretrained(model_wout_lora_id, 
                                                                        num_labels=7, 
                                                                        id2label=id2category, 
                                                                        label2id=category2id)

# Load LoRA model on top of base model
loaded_lora_model = PeftModel.from_pretrained(base_roberta_model, save_lora_model_directory)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [321]:
# Forward pass
tokenized_val_data = tokenizer_lora(
    dataset_dict["validation"]["text"], 
    padding=True, 
    truncation=True,
    return_tensors="pt",
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_lora, return_tensors="pt")
batch_inputs = data_collator(tokenized_val_data)

model_lora.eval()
with torch.no_grad():
    model_lora_outputs = loaded_lora_model.to("cpu")(**batch_inputs)
model_lora_outputs_logits = model_lora_outputs.logits
model_lora_preds = torch.softmax(model_lora_outputs_logits, dim=1).argmax(dim=1)

In [322]:
compute_metrics(eval_pred=(model_lora_outputs_logits.numpy(), dataset_dict["validation"]["labels"]))

{'accuracy': 0.27988338192419826}

## b. Quantized Low-Rank Adaptation (QLoRA)

In [290]:
# Initialize Teacher Model 
model_wout_qlora_id = 'xlm-roberta-large'
model_wout_qlora = AutoModelForSequenceClassification.from_pretrained(
    model_wout_qlora_id,
    num_labels=7,
    id2label=id2category,
    label2id=category2id
)
tokenizer_qlora = AutoTokenizer.from_pretrained(model_wout_qlora_id)
# tokenizer_teacher2.add_special_tokens({'pad_token': '[PAD]'})

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [291]:
# Keeping only classifier layer trainable and the rest will be remained frozen
for name, param in model_wout_qlora.named_parameters():
    if "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [292]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype= torch.bfloat16
)

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"]
)

model_wout_qlora = prepare_model_for_kbit_training(model_wout_qlora)

model_qlora = get_peft_model(model_wout_qlora, lora_config)
model_qlora.print_trainable_parameters()

trainable params: 2,629,639 || all params: 562,527,246 || trainable%: 0.4675


In [293]:
model_qlora

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): XLMRobertaForSequenceClassification(
      (roberta): XLMRobertaModel(
        (embeddings): XLMRobertaEmbeddings(
          (word_embeddings): Embedding(250002, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): XLMRobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x XLMRobertaLayer(
              (attention): XLMRobertaAttention(
                (self): XLMRobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
     

In [294]:
# Tokenize the dataset
tokenized_data = dataset_dict.map(lambda x: tokenizer_qlora(x["text"]), batched=True)

# Define Accuracy Metric
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/3595 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [295]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.softmax(torch.tensor(logits, dtype=torch.float32), dim=1).argmax(dim=1)
    return accuracy.compute(predictions=preds, references=labels)

In [296]:
# Training HyperParameters
lr = 2e-4
weight_decay = 0.01
batch_size = 24
num_epochs = 10

training_args_w_qlora = TrainingArguments(
    # output_dir="bert-category-classifier-teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

data_collator_w_qlora = DataCollatorWithPadding(tokenizer=tokenizer_qlora)
# Use the custom compute_loss function in Trainer
trainer_w_qlora = Trainer(
    model=model_qlora,
    args=training_args_w_qlora,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer_qlora,
    compute_metrics=compute_metrics,
    data_collator=data_collator_w_qlora
)

trainer_w_qlora.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0884,0.267853,0.905484
2,0.2236,0.319284,0.890315
3,0.1047,0.404527,0.882147
4,0.0443,0.439541,0.885648
5,0.0361,0.515788,0.879813
6,0.0211,0.591615,0.872812
7,0.0117,0.57861,0.885648
8,0.0068,0.589177,0.890315
9,0.0044,0.597647,0.896149
10,0.0036,0.586818,0.890315


TrainOutput(global_step=1500, training_loss=0.154475594162941, metrics={'train_runtime': 891.4032, 'train_samples_per_second': 40.33, 'train_steps_per_second': 1.683, 'total_flos': 4132795784105832.0, 'train_loss': 0.154475594162941, 'epoch': 10.0})

In [297]:
save_qlora_model_directory = "./fine_tuned_models/fine_tuned_roberta_w_qlora"

# Save model
model_qlora.save_pretrained(save_qlora_model_directory)
# Save tokenizer
tokenizer_qlora.save_pretrained(save_qlora_model_directory)

('./fine_tuned_models/fine_tuned_roberta_w_qlora/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_roberta_w_qlora/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_roberta_w_qlora/sentencepiece.bpe.model',
 './fine_tuned_models/fine_tuned_roberta_w_qlora/added_tokens.json',
 './fine_tuned_models/fine_tuned_roberta_w_qlora/tokenizer.json')

In [298]:
# Load base model first
base_roberta_model = AutoModelForSequenceClassification.from_pretrained(model_wout_qlora_id, 
                                                                        num_labels=7, 
                                                                        id2label=id2category, 
                                                                        label2id=category2id)

# Load LoRA model on top of base model
loaded_qlora_model = PeftModel.from_pretrained(base_roberta_model, save_qlora_model_directory)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [299]:
# Forward pass
tokenized_val_data = tokenizer_qlora(
    dataset_dict["validation"]["text"], 
    padding=True, 
    truncation=True,
    return_tensors="pt",
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_qlora, return_tensors="pt")
batch_inputs = data_collator(tokenized_val_data)

model_lora.eval()
with torch.no_grad():
    model_qlora_outputs = loaded_qlora_model.to("cpu")(**batch_inputs)
model_qlora_outputs_logits = model_qlora_outputs.logits
model_qlora_preds = torch.softmax(model_qlora_outputs_logits, dim=1).argmax(dim=1)

In [300]:
compute_metrics(eval_pred=(model_qlora_outputs_logits.numpy(), dataset_dict["validation"]["labels"]))

{'accuracy': 0.9037900874635568}

## c. QLora with Gemma3-1B

In [6]:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype= torch.bfloat16
# )

In [7]:
gemma_model_id = "google/gemma-3-1b-pt"
tokenizer_gemma = AutoTokenizer.from_pretrained(gemma_model_id)
gemma_model_wout_qlora = Gemma3ForCausalLM.from_pretrained(gemma_model_id)

In [8]:
gemma_model_wout_qlora

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [9]:
# finding maximum token no
max(tokenizer_gemma.vocab.values())

262144

In [10]:
# each row in lm_head represent one token, so 262144 is vocab size and each row's index represent token id
gemma_model_wout_qlora.lm_head.weight.shape

torch.Size([262144, 1152])

In [11]:
token_id_0 = tokenizer_gemma.encode("0", add_special_tokens=False)[0]
token_id_1 = tokenizer_gemma.encode("1", add_special_tokens=False)[0]
token_id_2 = tokenizer_gemma.encode("2", add_special_tokens=False)[0]
token_id_3 = tokenizer_gemma.encode("3", add_special_tokens=False)[0]
token_id_4 = tokenizer_gemma.encode("4", add_special_tokens=False)[0]
token_id_5 = tokenizer_gemma.encode("5", add_special_tokens=False)[0]
token_id_6 = tokenizer_gemma.encode("6", add_special_tokens=False)[0]

print([token_id_0,token_id_1,token_id_2,token_id_3,token_id_4,token_id_5,token_id_6])

# keep only the 0,1,2,3,4,5,6 tokens from lm_head
par = torch.nn.Parameter(torch.vstack([gemma_model_wout_qlora.lm_head.weight[token_id_0, :], 
                                       gemma_model_wout_qlora.lm_head.weight[token_id_1, :],
                                       gemma_model_wout_qlora.lm_head.weight[token_id_2, :],
                                       gemma_model_wout_qlora.lm_head.weight[token_id_3, :],
                                       gemma_model_wout_qlora.lm_head.weight[token_id_4, :],
                                       gemma_model_wout_qlora.lm_head.weight[token_id_5, :],
                                       gemma_model_wout_qlora.lm_head.weight[token_id_6, :]
                                    ]))
gemma_model_wout_qlora.lm_head.weight = par

# set vocab size as 7 (number of classes). we set this because while computing logits vocab_size parameter will be used!
gemma_model_wout_qlora.vocab_size = 7
gemma_model_wout_qlora.config.vocab_size = 7
gemma_model_wout_qlora.lm_head.out_features=7

[236771, 236770, 236778, 236800, 236812, 236810, 236825]


In [12]:
# new lm_head will generate only this 7 tokens since we only keep seven of them 
gemma_model_wout_qlora.lm_head.weight.shape

torch.Size([7, 1152])

In [13]:
gemma_model_wout_qlora

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [14]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules = ["q_proj", "v_proj"]
)

gemma_model_wout_qlora = prepare_model_for_kbit_training(gemma_model_wout_qlora)

gemma_model_qlora = get_peft_model(gemma_model_wout_qlora, lora_config)
gemma_model_qlora.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 1,490,944 || all params: 1,001,384,960 || trainable%: 0.1489


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [15]:
gemma_model_qlora

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1152, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1152, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
         

In [16]:
# Original model's parameters in lm_head is freezed automatically
for name, param in gemma_model_qlora.named_parameters():
    print(f"{param.requires_grad} - {name}")

False - base_model.model.model.embed_tokens.weight
False - base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight
True - base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
True - base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
False - base_model.model.model.layers.0.self_attn.k_proj.weight
False - base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight
True - base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
True - base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
False - base_model.model.model.layers.0.self_attn.o_proj.weight
False - base_model.model.model.layers.0.self_attn.q_norm.weight
False - base_model.model.model.layers.0.self_attn.k_norm.weight
False - base_model.model.model.layers.0.mlp.gate_proj.weight
False - base_model.model.model.layers.0.mlp.up_proj.weight
False - base_model.model.model.layers.0.mlp.down_proj.weight
False - base_model.model.model.layers.0.input_laye

In [17]:
# Unfreeze lm_head parameters
for name, param in gemma_model_qlora.named_parameters():
    if "lm_head" in name:
        param.requires_grad=True
    
    print(f"{param.requires_grad} - {name}")

False - base_model.model.model.embed_tokens.weight
False - base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight
True - base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
True - base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
False - base_model.model.model.layers.0.self_attn.k_proj.weight
False - base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight
True - base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
True - base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
False - base_model.model.model.layers.0.self_attn.o_proj.weight
False - base_model.model.model.layers.0.self_attn.q_norm.weight
False - base_model.model.model.layers.0.self_attn.k_norm.weight
False - base_model.model.model.layers.0.mlp.gate_proj.weight
False - base_model.model.model.layers.0.mlp.up_proj.weight
False - base_model.model.model.layers.0.mlp.down_proj.weight
False - base_model.model.model.layers.0.input_laye

In [18]:
prompt = """Here is a text:
{}

Classify it into one of the following class: [0, 1, 2, 3, 4, 5, 6]

The correct class is: {}"""


def formatting_prompts_func(dataset_):
    # this is to fix an issue with the transformers library where the first time this function is called, it is called with a string for some reason
    if isinstance(dataset_['text'], str):
        return [" "]*100
        
    texts = []
    for i in range(len(dataset_['text'])):
        t = dataset_['text'][i]
        label = dataset_['labels'][i]
        text = prompt.format(t, label)
        texts.append(text)

    return texts

In [19]:
# this custom collator is needed to change the sequence labels from yes_token_id and no_token_id to 1 and 0. It also trains only on the last token of the sequence.
# NOT: Format of batch['labels'] is different than 'labels' column of training data!
class DataCollatorForLastTokenLM(DataCollatorForLanguageModeling):
    def __init__(
        self,
        *args,
        mlm: bool = False,
        ignore_index: int = -100, # this will be used in loss calculation in order not to include input tokens, we will try to include only target prediction token
        pad_token_id: int = tokenizer_gemma.pad_token_type_id, # it should be padding token id of the chosen base model
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index
        self.pad_token_id = pad_token_id

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        batch = super().torch_call(examples)

        for i in range(len(examples)):
            # Find the last non-padding token (since tokenizer of gemma3 uses left padding there is no need to use below code!)
            ## last_token_idx = (batch["labels"][i] != self.pad_token_id).nonzero()[-1].item()

            last_token_idx = batch["labels"][i].shape[0]-1
            # Set all labels to ignore_index except for the last token
            batch["labels"][i, :last_token_idx] = self.ignore_index
            # The old labels for the category tokens need to be mapped to their corresponding class numbers
            last_token = batch["labels"][i, last_token_idx]
            if last_token==token_id_0:
                batch["labels"][i, last_token_idx] = 0
            elif last_token==token_id_1:
                batch["labels"][i, last_token_idx] = 1
            elif last_token==token_id_2:
                batch["labels"][i, last_token_idx] = 2
            elif last_token==token_id_3:
                batch["labels"][i, last_token_idx] = 3
            elif last_token==token_id_4:
                batch["labels"][i, last_token_idx] = 4
            elif last_token==token_id_5:
                batch["labels"][i, last_token_idx] = 5
            elif last_token==token_id_6:
                batch["labels"][i, last_token_idx] = 6

        return batch

collator = DataCollatorForLastTokenLM(tokenizer=tokenizer_gemma)

In [20]:
# Training HyperParameters
lr = 2e-3
weight_decay = 0.01
batch_size = 16
num_epochs = 2

sft_training_args = TrainingArguments(
    # output_dir="bert-category-classifier-teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    group_by_length = True
)

sft_trainer = SFTTrainer(
    model = gemma_model_qlora,
    args = sft_training_args,
    processing_class = tokenizer_gemma,
    train_dataset = dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

sft_trainer.train()

Applying formatting function to train dataset:   0%|          | 0/3595 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/3595 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3595 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3595 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3595 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/857 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/857 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/857 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/857 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/857 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Epoch,Training Loss,Validation Loss
1,0.5799,0.419297
2,0.1321,0.45724




TrainOutput(global_step=450, training_loss=0.3559852345784505, metrics={'train_runtime': 353.8368, 'train_samples_per_second': 20.32, 'train_steps_per_second': 1.272, 'total_flos': 2451481839040512.0, 'train_loss': 0.3559852345784505})

In [21]:
save_qlora_gemma_model_directory = "./fine_tuned_models/fine_tuned_gemma_w_qlora"

# Save model
gemma_model_qlora.save_pretrained(save_qlora_gemma_model_directory)
# Save tokenizer
tokenizer_gemma.save_pretrained(save_qlora_gemma_model_directory)



('./fine_tuned_models/fine_tuned_gemma_w_qlora/tokenizer_config.json',
 './fine_tuned_models/fine_tuned_gemma_w_qlora/special_tokens_map.json',
 './fine_tuned_models/fine_tuned_gemma_w_qlora/tokenizer.model',
 './fine_tuned_models/fine_tuned_gemma_w_qlora/added_tokens.json',
 './fine_tuned_models/fine_tuned_gemma_w_qlora/tokenizer.json')

In [None]:
# Load base model first
base_gemma_model = Gemma3ForCausalLM.from_pretrained(gemma_model_id)
token_id_0 = tokenizer_gemma.encode("0", add_special_tokens=False)[0]
token_id_1 = tokenizer_gemma.encode("1", add_special_tokens=False)[0]
token_id_2 = tokenizer_gemma.encode("2", add_special_tokens=False)[0]
token_id_3 = tokenizer_gemma.encode("3", add_special_tokens=False)[0]
token_id_4 = tokenizer_gemma.encode("4", add_special_tokens=False)[0]
token_id_5 = tokenizer_gemma.encode("5", add_special_tokens=False)[0]
token_id_6 = tokenizer_gemma.encode("6", add_special_tokens=False)[0]
# keep only the 0,1,2,3,4,5,6 tokens from lm_head
par = torch.nn.Parameter(torch.vstack([base_gemma_model.lm_head.weight[token_id_0, :], 
                                       base_gemma_model.lm_head.weight[token_id_1, :],
                                       base_gemma_model.lm_head.weight[token_id_2, :],
                                       base_gemma_model.lm_head.weight[token_id_3, :],
                                       base_gemma_model.lm_head.weight[token_id_4, :],
                                       base_gemma_model.lm_head.weight[token_id_5, :],
                                       base_gemma_model.lm_head.weight[token_id_6, :]
                                    ]))
base_gemma_model.lm_head.weight = par
# set vocab size as 7 (number of classes). we set this because while computing logits vocab_size parameter will be used!
base_gemma_model.vocab_size = 7
base_gemma_model.config.vocab_size = 7
base_gemma_model.lm_head.out_features=7

# Load LoRA model on top of base model
loaded_qlora_gemma_model = PeftModel.from_pretrained(base_gemma_model, save_qlora_gemma_model_directory)

In [25]:
from collections import defaultdict
import torch.nn.functional as F
from tqdm import tqdm

# Step 1: Tokenize the inputs and sort them by their tokenized length
tokenized_inputs = []
for i in range(len(dataset_dict["validation"]['text'])):
    text = dataset_dict["validation"]['text'][i]
    test_str = prompt.format(text, "")
    tokenized_input = tokenizer_gemma(test_str, return_tensors="pt", add_special_tokens=False)
    tokenized_inputs.append((tokenized_input, test_str, dataset_dict["validation"]['labels'][i]))

# Sort by tokenized length
tokenized_inputs.sort(key=lambda x: x[0]['input_ids'].shape[1])

# Step 2: Group the inputs by their tokenized length
grouped_inputs = defaultdict(list)
for tokenized_input, test_str, label in tokenized_inputs:
    length = tokenized_input['input_ids'].shape[1]
    grouped_inputs[length].append((tokenized_input, test_str, label))

# Step 3: Process each group in batches of 64
batch_size = 64
all_outputs = []
all_strings = []
all_labels = []

for length, group in tqdm(grouped_inputs.items()):
    for i in range(0, len(group), batch_size):
        batch = group[i:i+batch_size]
        batch_inputs = [item[0] for item in batch]
        batch_strings = [item[1] for item in batch]
        batch_labels = [item[2] for item in batch]

        # Concatenate the batch inputs
        input_ids = torch.cat([item['input_ids'] for item in batch_inputs], dim=0).to("cpu")
        attention_mask = torch.cat([item['attention_mask'] for item in batch_inputs], dim=0).to("cpu")

        # Forward pass
        with torch.no_grad():
            outputs = loaded_qlora_gemma_model.to("cpu")(input_ids=input_ids, attention_mask=attention_mask)
            # print(outputs.logits[:, -1].shape)
        
        # logits are shape (batch_size, sequence_length, num_classes), we want only the last token of each sequence in the batch
        logits = outputs.logits[:, -1, :]
        
        # Apply softmax
        probabilities = F.softmax(logits, dim=-1)
        
        # Get predictions
        predictions = torch.argmax(probabilities, dim=-1)
        
        all_outputs.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels)
        all_strings.extend(batch_strings)

# Step 4: Do the label assignment
correct = 0
total = 0

for i in range(len(all_outputs)):
    pred = str(all_outputs[i])
    label = str(all_labels[i])
    if i > len(all_outputs) - 25:
        print(f"{i}: text: {all_strings[i]}\n pred: {pred} label: {label}\n")

    if pred == label:
        correct += 1
    total += 1

print(f"Correct: {correct} Total: {total} Accuracy: {correct/total}")

100%|██████████| 62/62 [00:33<00:00,  1.88it/s]

319: text: Here is a text:
Le porte-parole de Bush, Gordon Johndroe, a qualifié la promesse de la Corée du Nord « d'étape majeure vers l'objectif de la dénucléarisation vérifiable de la péninsule coréenne ».

Classify it into one of the following class: [0, 1, 2, 3, 4, 5, 6]

The correct class is: 
 pred: 2 label: 2

320: text: Here is a text:
Le médaillé d'or olympique devait nager au 100 m et au 200 m nage libre et dans trois relais aux Jeux du Commonwealth, mais en raison de ses plaintes, sa condition physique a été mise en doute.

Classify it into one of the following class: [0, 1, 2, 3, 4, 5, 6]

The correct class is: 
 pred: 3 label: 3

321: text: Here is a text:
El ganador olímpico de la medalla de oro debía nadar en el estilo libre de 100 metros y 200 metros, y en tres relevos en los Juegos de la Commonwealth, pero su condición física ha sido puesta en dudas a raíz de sus quejas.

Classify it into one of the following class: [0, 1, 2, 3, 4, 5, 6]

The correct class is: 
 pred: 




## d. Lora with Unsloth

In [None]:
# There is an issue while installing unsloth. The issue I got: "ERROR: Could not build wheels for xformers, which is required to install pyproject.toml-based projects"

# I will check & try it later!

# Observations

Fine tuning all parameters of roberta model is inefficient and gives bad perfromance on validation data. Although, while the training both training and test losses mostly decreases accuracy remains the same and also in validation set accuracy is 28%. If model classifies all observations as class 0, the accuracy will also be 28%. So, this model does not learn anything with fine tuning.

Fine tuning with transfer learning teacher model (roberta model) learned the dataset well compared to all parameter fine tuning approach and results in 76% accuracy in validation set. Morover, fine tuning student model (bert model) with fine tuned teacher model improves the accuracy from 76% to 86%. Rather than fien tuning all parameters, fine tuning only classification head results in faster training and better classification performance.

Fine tuning roberta model with LoRA gives similar results with fine tuning all parameters of roberta model. Learning adoptors and classifcaiton head with training hyperparameters may not be enough. However, with QLoRA approach and same training hyperparameters provides superior performances among all fine tuning approach with validation accuracy as 90%.

Fine tuning gemma3-1b model is different than the others. With propoer preprocessing model parameters and custom data collator, the fine tuned model's validation accuracy is 87% which makes it second best fine tuned model.

# Resources

### Transfer Learning & PEFT

* https://www.youtube.com/watch?v=bZcKYiwtw1I&ab_channel=NeuralBreakdownwithAVB

* https://www.youtube.com/watch?v=eC6Hd1hFvos&list=PLz-ep5RbHosUwDlaic4w8u2NdFgQiX_a_&ab_channel=ShawTalebi

* https://www.youtube.com/watch?v=4QHg8Ix8WWQ&list=PLz-ep5RbHosUwDlaic4w8u2NdFgQiX_a_&index=5&ab_channel=ShawTalebi

* https://www.youtube.com/watch?v=YJNbgusTSF0&ab_channel=TradeMamba

* https://www.youtube.com/watch?v=4nNbg4bWDrQ&ab_channel=Rohan-Paul-AI

* https://dassum.medium.com/fine-tune-large-language-model-llm-on-a-custom-dataset-with-qlora-fb60abdeba07

### Unsloth

* https://github.com/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb

* https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B).ipynb#scrollTo=_rD6fl8EUxnG

* https://www.youtube.com/watch?v=Gpyukc6c0w8&ab_channel=MervinPraison

* https://www.youtube.com/watch?v=JJWvYQdOVOY&list=WL&index=108&ab_channel=NodematicTutorials

* https://www.youtube.com/watch?v=YZW3pkIR-YE&t=10s&ab_channel=PromptEngineering

* https://www.youtube.com/watch?v=pxhkDaKzBaY&ab_channel=warpdotdev

* https://www.youtube.com/watch?v=jFl5Fewrieo&ab_channel=AIJason

* https://www.youtube.com/watch?v=qcNmOItRw4U&ab_channel=DataCamp