In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install git+https://github.com/Shannu3766/bi_influence.git

Collecting git+https://github.com/Shannu3766/bi_influence.git
  Cloning https://github.com/Shannu3766/bi_influence.git to /tmp/pip-req-build-t3py0trb
  Running command git clone --filter=blob:none --quiet https://github.com/Shannu3766/bi_influence.git /tmp/pip-req-build-t3py0trb
  Resolved https://github.com/Shannu3766/bi_influence.git to commit e768abe24613868eff413a83dff718b6df358305
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=21.0.0 (from datasets->adalora_bi==0.1.0)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->adalora_bi==0.1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->adalora_bi==0.1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->ad

In [3]:
"""
Example: run dynamic BI-based AdaLoRA fine-tuning on a tiny dummy dataset.

This demo:
 - builds small dataset using HuggingFace datasets or dummy data,
 - computes BI scores each epoch,
 - reallocates LoRA ranks each epoch,
 - trains adapters for that epoch.

Run: python adalora_bi/examples/finetune_bi_demo.py
"""

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from adalora_bi import fine_tune_lora_dynamic

# Simple dummy dataset for demo
class DummyTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx],
                             truncation=True,
                             padding="max_length",
                             max_length=self.max_length,
                             return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def collate_fn(batch):
    # batch already dict tensors
    out = {}
    for k in batch[0].keys():
        out[k] = torch.stack([b[k] for b in batch], dim=0)
    return out

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Dummy texts
    texts = [
        "I love programming in Python",
        "The weather is sunny today",
        "Transformers are great",
        "I had pizza",
        "PyTorch is awesome",
        "OpenAI builds models",
        "I went for a run",
        "I enjoy reading books"
    ]
    labels = [0,1,0,1,0,0,1,1]

    ds = DummyTextDataset(texts, labels, tokenizer)
    train_loader = DataLoader(ds, batch_size=2, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(ds, batch_size=2, shuffle=False, collate_fn=collate_fn)

    # Run dynamic training: BI recomputed at start of each epoch (so 2 epochs -> computed twice)
    fine_tune_lora_dynamic(
    model,
    train_loader,
    val_loader=val_loader,
    device=device,
    total_R=48,
    tau=0.5,
    epochs=4,
    lr=5e-4,
    max_batches_for_bi=2,     # only 2 batches for BI scoring
    recompute_every=2,        # recompute BI every 2 epochs
    fast_mode=False,           # monitors fewer modules
)

if __name__ == "__main__":
    main()

cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-10-20 18:22:25.400015: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760984545.557250      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760984545.611380      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Epoch 1/4 ===
Computing BI importance...
BI scores collected for 20 modules in 0.44s
Allocated ranks (first few):
  distilbert.transformer.layer.0.attention.q_lin -> r=3
  distilbert.transformer.layer.0.attention.k_lin -> r=3
  distilbert.transformer.layer.0.attention.v_lin -> r=2
  distilbert.transformer.layer.0.attention.out_lin -> r=7
  distilbert.transformer.layer.0.ffn.lin1 -> r=1
  distilbert.transformer.layer.0.ffn.lin2 -> r=2
  distilbert.transformer.layer.1.attention.q_lin -> r=1
  distilbert.transformer.layer.1.attention.k_lin -> r=1
  distilbert.transformer.layer.1.attention.v_lin -> r=6
  distilbert.transformer.layer.1.attention.out_lin -> r=1
Patched 20 modules with LoRA adapters.


Training epoch 1: 100%|██████████| 4/4 [00:00<00:00, 11.93it/s]


Epoch 1 step 4 avg loss 0.2769
After epoch 1: val loss=0.6868, acc=0.5000

=== Epoch 2/4 ===


Training epoch 2: 100%|██████████| 4/4 [00:00<00:00, 54.60it/s]


Epoch 2 step 4 avg loss 0.2708
After epoch 2: val loss=0.6752, acc=0.6250

=== Epoch 3/4 ===
Computing BI importance...
BI scores collected for 20 modules in 0.06s
Allocated ranks (first few):
  distilbert.transformer.layer.0.attention.q_lin.orig -> r=3
  distilbert.transformer.layer.0.attention.k_lin.orig -> r=3
  distilbert.transformer.layer.0.attention.v_lin.orig -> r=2
  distilbert.transformer.layer.0.attention.out_lin.orig -> r=7
  distilbert.transformer.layer.0.ffn.lin1.orig -> r=1
  distilbert.transformer.layer.0.ffn.lin2.orig -> r=2
  distilbert.transformer.layer.1.attention.q_lin.orig -> r=1
  distilbert.transformer.layer.1.attention.k_lin.orig -> r=2
  distilbert.transformer.layer.1.attention.v_lin.orig -> r=6
  distilbert.transformer.layer.1.attention.out_lin.orig -> r=1
Patched 20 modules with LoRA adapters.


Training epoch 3: 100%|██████████| 4/4 [00:00<00:00, 37.81it/s]


Epoch 3 step 4 avg loss 0.2811
After epoch 3: val loss=0.6521, acc=1.0000

=== Epoch 4/4 ===


Training epoch 4: 100%|██████████| 4/4 [00:00<00:00, 29.63it/s]


Epoch 4 step 4 avg loss 0.2531
After epoch 4: val loss=0.6044, acc=1.0000


In [4]:
!pip install -q --upgrade pyarrow datasets pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m96.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
dask-cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0

In [5]:
!pip install --upgrade --force-reinstall "pyarrow>=14.0.1,<17"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyarrow<17,>=14.0.1
  Downloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting numpy>=1.16.6 (from pyarrow<17,>=14.0.1)
  Downloading numpy-2.3.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading numpy-2.3.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, pyarrow
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1

In [6]:
!pip install --upgrade --force-reinstall "numpy<2.0" "scipy>=1.10,<1.14" "scikit-learn>=1.3,<1.5"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14,>=1.10
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m974.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scikit-learn<1.5,>=1.3
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn<1.5,>=1.3)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn<1.5,>=1.3)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [7]:
"""
Fine-tune DistilBERT on SST-2 using BI-based Adaptive LoRA.
This demonstrates a full run on a public dataset with saving.

Run:
    python -m adalora_bi.examples.finetune_sst2_bi
"""

import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, default_data_collator
from adalora_bi import fine_tune_lora_dynamic


def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # ------------------------------------------------------------------
    # 1️⃣  Load public dataset (GLUE SST-2)
    # ------------------------------------------------------------------
    dataset = load_dataset("glue", "sst2")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_fn(batch):
        return tokenizer(
            batch["sentence"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    tokenized = dataset.map(tokenize_fn, batched=True)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    train_ds = tokenized["train"].shuffle(seed=42).select(range(2000))  # small subset for demo
    val_ds = tokenized["validation"].select(range(500))

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=default_data_collator)
    val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, collate_fn=default_data_collator)

    # ------------------------------------------------------------------
    # 2️⃣  Load model
    # ------------------------------------------------------------------
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2
    )

    # ------------------------------------------------------------------
    # 3️⃣  Fine-tune with BI-based adaptive LoRA
    # ------------------------------------------------------------------
    fine_tune_lora_dynamic(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        total_R=32,          # total LoRA rank budget
        tau=0.5,             # temperature
        epochs=2,            # recompute BI every epoch
        lr=2e-5,
        weight_decay=0.01,
        max_batches_for_bi=8,
        recompute_every=1,
        fast_mode=False,     # collect all Linear layers for real benchmark
    )

    # ------------------------------------------------------------------
    # 4️⃣  Save the fine-tuned model
    # ------------------------------------------------------------------
    save_dir = "./saved_models"
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, "adalora_bi_sst2.pt")

    torch.save(model.state_dict(), save_path)
    print(f"\n✅ Model saved successfully to: {save_path}\n")

    # Optional: evaluate final model
    model.eval()
    print("Final evaluation on validation set...")
    total, correct = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = outputs.logits.argmax(dim=-1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)
    print(f"Validation Accuracy: {correct/total:.4f}")

if __name__ == "__main__":
    main()




Using device: cuda


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Epoch 1/2 ===
Computing BI importance...
BI scores collected for 20 modules in 1.57s
Allocated ranks (first few):
  distilbert.transformer.layer.0.attention.q_lin -> r=1
  distilbert.transformer.layer.0.attention.k_lin -> r=1
  distilbert.transformer.layer.0.attention.v_lin -> r=1
  distilbert.transformer.layer.0.attention.out_lin -> r=5
  distilbert.transformer.layer.0.ffn.lin1 -> r=2
  distilbert.transformer.layer.0.ffn.lin2 -> r=1
  distilbert.transformer.layer.1.attention.q_lin -> r=2
  distilbert.transformer.layer.1.attention.k_lin -> r=1
  distilbert.transformer.layer.1.attention.v_lin -> r=4
  distilbert.transformer.layer.1.attention.out_lin -> r=2
Patched 20 modules with LoRA adapters.


Training epoch 1: 100%|██████████| 250/250 [00:14<00:00, 17.47it/s]


Epoch 1 step 250 avg loss 17.6416
After epoch 1: val loss=0.6920, acc=0.5000

=== Epoch 2/2 ===
Computing BI importance...
BI scores collected for 20 modules in 1.38s
Allocated ranks (first few):
  distilbert.transformer.layer.0.attention.q_lin.orig -> r=1
  distilbert.transformer.layer.0.attention.k_lin.orig -> r=1
  distilbert.transformer.layer.0.attention.v_lin.orig -> r=1
  distilbert.transformer.layer.0.attention.out_lin.orig -> r=4
  distilbert.transformer.layer.0.ffn.lin1.orig -> r=2
  distilbert.transformer.layer.0.ffn.lin2.orig -> r=1
  distilbert.transformer.layer.1.attention.q_lin.orig -> r=2
  distilbert.transformer.layer.1.attention.k_lin.orig -> r=1
  distilbert.transformer.layer.1.attention.v_lin.orig -> r=4
  distilbert.transformer.layer.1.attention.out_lin.orig -> r=1
Patched 20 modules with LoRA adapters.


Training epoch 2: 100%|██████████| 250/250 [00:16<00:00, 15.62it/s]


Epoch 2 step 250 avg loss 17.0408
After epoch 2: val loss=0.6787, acc=0.5700

✅ Model saved successfully to: ./saved_models/adalora_bi_sst2.pt

Final evaluation on validation set...
Validation Accuracy: 0.5700


In [8]:
import pyarrow
print(pyarrow.__version__)


19.0.1
