In [1]:
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

np.random.seed(42)

def load_imdb_dataset():
    imdb_dataset = load_dataset("imdb")
    N = 1000
    rand_idx = np.random.randint(24999, size=N)
    
    x_train = imdb_dataset['train'][rand_idx]['text']
    y_train = imdb_dataset['train'][rand_idx]['label']
    
    x_test = imdb_dataset['test'][rand_idx]['text']
    y_test = imdb_dataset['test'][rand_idx]['label']
    
    return DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})    

dataset = load_imdb_dataset()

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [3]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

np.float64(0.504)

## QLoRA Configuration

In [4]:
import torch
from transformers.utils.quantization_config import BitsAndBytesConfig

qlora_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

## Base Model : Pythia (2.8B)

In [5]:
from transformers import AutoModelForSequenceClassification

model_1 = AutoModelForSequenceClassification.from_pretrained(
    "EleutherAI/pythia-2.8b",
    num_labels = 2,
    quantization_config = qlora_config,
    output_attentions=False,
    output_hidden_states=False
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-2.8b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
model_1.to(device)

GPTNeoXForSequenceClassification(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2560)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm)

In [8]:
from transformers import AutoTokenizer

model_1_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")

if model_1_tokenizer.pad_token is None:
    model_1_tokenizer.pad_token = model_1_tokenizer.eos_token
    model_1.config.pad_token_id = model_1.config.eos_token_id

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
print(model_1_tokenizer.eos_token)

<|endoftext|>


In [10]:
test = "Hi! How are you?"

In [11]:
encoded_text = model_1_tokenizer(test)["input_ids"]

In [12]:
encoded_text

[12764, 2, 1359, 403, 368, 32]

## LoRA Configuration

In [13]:
model_1.gradient_checkpointing_enable()

In [14]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

model_1 = prepare_model_for_kbit_training(model_1)

In [15]:
lora_config_model_1 = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

In [16]:
model_1 = get_peft_model(model_1, lora_config_model_1)

In [17]:
model_1.print_trainable_parameters()

trainable params: 10,490,880 || all params: 2,656,926,720 || trainable%: 0.3949


In [18]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

import re

for i in range(len(train_texts)):
    train_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", train_texts[i])
    train_texts[i] = re.sub(r"\s+", " ", train_texts[i]).strip()
for i in range(len(test_texts)):
    test_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", test_texts[i])
    test_texts[i] = re.sub(r"\s+", " ", test_texts[i]).strip()

In [19]:
def tokenize_model_1(texts):
    return model_1_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize_model_1(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = torch.tensor(train_labels)
    
# Tokenize test data
test_encodings = tokenize_model_1(test_texts)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = torch.tensor(test_labels)

In [20]:
test_labels.shape

torch.Size([1000])

In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE
)

In [22]:
train_dataset.__sizeof__()

16

In [23]:
len(train_dataloader)

500

In [24]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import bitsandbytes.optim as bnb_optim


optimizer = bnb_optim.PagedAdamW8bit(model_1.parameters(), lr=5e-5)

epochs = 1
total_steps = len(train_dataloader) * epochs // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [25]:
from tqdm.notebook import tqdm
progress_bar = tqdm(range(total_steps))

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    
    model_1.train()
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # We don't call `model_1.zero_grad()` here yet because we are accumulating.
        
        # The model calculates loss internally when `labels` are provided.
        outputs = model_1(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        
        loss = outputs.loss
        
        # --- Gradient Accumulation Step ---
        # Normalize the loss
        loss = loss / GRADIENT_ACCUMULATION_STEPS
        
        # Backpropagate the normalized loss
        loss.backward()
        
        # Accumulate loss for tracking
        total_train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        
        # --- Optimizer Step ---
        # Only perform an optimizer step after accumulating for the specified number of steps.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model_1.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad() # Reset gradients after the step
            progress_bar.update(1)

            avg_train_loss = total_train_loss / (step + 1)
            print(f"\n  Step {progress_bar.n}/{total_steps} | Training loss: {avg_train_loss:.4f}")

print("\nTraining complete.")

  0%|          | 0/125 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.





  return fn(*args, **kwargs)



  Step 1/125 | Training loss: 2.7787

  Step 2/125 | Training loss: 1.9701

  Step 3/125 | Training loss: 1.8852

  Step 4/125 | Training loss: 1.8791

  Step 5/125 | Training loss: 1.8648

  Step 6/125 | Training loss: 1.7341

  Step 7/125 | Training loss: 1.6727

  Step 8/125 | Training loss: 1.5379

  Step 9/125 | Training loss: 1.4705

  Step 10/125 | Training loss: 1.3939

  Step 11/125 | Training loss: 1.3483

  Step 12/125 | Training loss: 1.3561

  Step 13/125 | Training loss: 1.3730

  Step 14/125 | Training loss: 1.3524

  Step 15/125 | Training loss: 1.3287

  Step 16/125 | Training loss: 1.3153

  Step 17/125 | Training loss: 1.2912

  Step 18/125 | Training loss: 1.3087

  Step 19/125 | Training loss: 1.2616

  Step 20/125 | Training loss: 1.2368

  Step 21/125 | Training loss: 1.2483

  Step 22/125 | Training loss: 1.2293

  Step 23/125 | Training loss: 1.2024

  Step 24/125 | Training loss: 1.1763

  Step 25/125 | Training loss: 1.1741

  Step 26/125 | Training loss: 1.

In [26]:
model_1.save_pretrained("./qlora-finetuned-pythia")

In [27]:
torch.cuda.empty_cache()

## Model 2 : StableLM (3B)

In [28]:
model_2 = AutoModelForSequenceClassification.from_pretrained(
    "stabilityai/stablelm-3b-4e1t",
    quantization_config=qlora_config,
    trust_remote_code=True,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/5.59G [00:00<?, ?B/s]

Some weights of StableLmForSequenceClassification were not initialized from the model checkpoint at stabilityai/stablelm-3b-4e1t and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model_2.to(device)

StableLmForSequenceClassification(
  (model): StableLmModel(
    (embed_tokens): Embedding(50304, 2560)
    (layers): ModuleList(
      (0-31): 32 x StableLmDecoderLayer(
        (self_attn): StableLmSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): StableLmRotaryEmbedding()
        )
        (mlp): StableLmMLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Layer

In [30]:
model_2_tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")

if model_2_tokenizer.pad_token is None:
    model_2_tokenizer.pad_token = model_2_tokenizer.eos_token
    model_2.config.pad_token_id = model_2.config.eos_token_id

tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [31]:
print(model_2_tokenizer.eos_token)

<|endoftext|>


In [32]:
encoded_text = model_2_tokenizer(test)["input_ids"]

In [33]:
encoded_text # test = "Hi! How are you?"

[12764, 2, 1359, 403, 368, 32]

In [34]:
model_2.gradient_checkpointing_enable()
model_2 = prepare_model_for_kbit_training(model_2)

In [35]:
lora_config_model_2 = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

In [36]:
model_2 = get_peft_model(model_2, lora_config_model_2)

In [37]:
model_2.print_trainable_parameters()

trainable params: 12,522,496 || all params: 2,679,192,576 || trainable%: 0.4674


In [38]:
def tokenize_model_2(texts):
    return model_2_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize_model_2(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = torch.tensor(train_labels)
    
# Tokenize test data
test_encodings = tokenize_model_2(test_texts)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = torch.tensor(test_labels)

  train_labels = torch.tensor(train_labels)
  test_labels = torch.tensor(test_labels)


In [39]:
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels) 
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE
)

In [40]:
len(train_dataloader)

500

In [41]:
optimizer = bnb_optim.PagedAdamW8bit(model_2.parameters(), lr=5e-5)

epochs = 1
total_steps = len(train_dataloader) * epochs // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [42]:
progress_bar = tqdm(range(total_steps))
model_2.train()

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Forward pass
        outputs = model_2(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels 
        )
        
        loss = outputs.loss
        
        # --- Gradient Accumulation ---
        loss = loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        
        total_train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        
        # --- Optimizer Step ---
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model_2.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            avg_train_loss = total_train_loss / (step + 1)
            print(f"\n  Step {progress_bar.n}/{total_steps} | Training loss: {avg_train_loss:.4f}")

print("\nTraining complete.")

  0%|          | 0/125 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...




  Step 1/125 | Training loss: 6.4083

  Step 2/125 | Training loss: 3.7711

  Step 3/125 | Training loss: 3.3151

  Step 4/125 | Training loss: 2.9306

  Step 5/125 | Training loss: 2.5893

  Step 6/125 | Training loss: 2.4233

  Step 7/125 | Training loss: 2.1239

  Step 8/125 | Training loss: 1.9037

  Step 9/125 | Training loss: 1.8333

  Step 10/125 | Training loss: 1.6766

  Step 11/125 | Training loss: 1.5871

  Step 12/125 | Training loss: 1.4966

  Step 13/125 | Training loss: 1.4737

  Step 14/125 | Training loss: 1.4333

  Step 15/125 | Training loss: 1.4193

  Step 16/125 | Training loss: 1.4255

  Step 17/125 | Training loss: 1.3750

  Step 18/125 | Training loss: 1.3255

  Step 19/125 | Training loss: 1.3321

  Step 20/125 | Training loss: 1.3274

  Step 21/125 | Training loss: 1.3384

  Step 22/125 | Training loss: 1.3585

  Step 23/125 | Training loss: 1.3158

  Step 24/125 | Training loss: 1.2830

  Step 25/125 | Training loss: 1.2520

  Step 26/125 | Training loss: 1

In [43]:
model_2.save_pretrained("./qlora-finetuned-stableLM")