In [1]:
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

np.random.seed(42)

def load_imdb_dataset():
    imdb_dataset = load_dataset("imdb")
    N = 1000
    rand_idx = np.random.randint(24999, size=N)
    
    x_train = imdb_dataset['train'][rand_idx]['text']
    y_train = imdb_dataset['train'][rand_idx]['label']
    
    x_test = imdb_dataset['test'][rand_idx]['text']
    y_test = imdb_dataset['test'][rand_idx]['label']
    
    return DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})    

dataset = load_imdb_dataset()

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [3]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

np.float64(0.504)

## QLoRA Configuration

In [4]:
import torch
from transformers.utils.quantization_config import BitsAndBytesConfig

qlora_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

## Base Model : Pythia (2.8B)

In [5]:
from transformers import AutoModelForCausalLM

model_1 = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-2.8b",
    num_labels = 2,
    quantization_config = qlora_config,
    output_attentions=False,
    output_hidden_states=False
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
model_1.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2560)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2

In [8]:
from transformers import AutoTokenizer

model_1_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")

if model_1_tokenizer.pad_token is None:
    model_1_tokenizer.pad_token = model_1_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
print(model_1_tokenizer.eos_token)

<|endoftext|>


In [10]:
test = "Hi! How are you?"

In [11]:
encoded_text = model_1_tokenizer(test)["input_ids"]

In [12]:
encoded_text

[12764, 2, 1359, 403, 368, 32]

## LoRA Configuration

In [13]:
model_1.gradient_checkpointing_enable()

In [14]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

model_1 = prepare_model_for_kbit_training(model_1)

In [15]:
lora_config_model_1 = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [16]:
model_1 = get_peft_model(model_1, lora_config_model_1)

In [17]:
model_1.print_trainable_parameters()

trainable params: 10,485,760 || all params: 2,785,694,720 || trainable%: 0.3764


In [18]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

import re

for i in range(len(train_texts)):
    train_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", train_texts[i])
    train_texts[i] = re.sub(r"\s+", " ", train_texts[i]).strip()
for i in range(len(test_texts)):
    test_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", test_texts[i])
    test_texts[i] = re.sub(r"\s+", " ", test_texts[i]).strip()

In [19]:
def tokenize_model_1(texts):
    return model_1_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize_model_1(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = torch.tensor(train_labels)
    
# Tokenize test data
test_encodings = tokenize_model_1(test_texts)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = torch.tensor(test_labels)

In [20]:
test_labels.shape

torch.Size([1000])

In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE
)

In [22]:
train_dataset.__sizeof__()

16

In [23]:
len(train_dataloader)

500

In [24]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import bitsandbytes.optim as bnb_optim


optimizer = bnb_optim.PagedAdamW8bit(model_1.parameters(), lr=5e-5)

epochs = 1
total_steps = len(train_dataloader) * epochs // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [25]:
from tqdm.notebook import tqdm
progress_bar = tqdm(range(total_steps))

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    
    model_1.train()
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # We don't call `model_1.zero_grad()` here yet because we are accumulating.
        
        # The model calculates loss internally when `labels` are provided.
        # The `labels` argument needs to be the same as `input_ids` for Causal LM loss.
        outputs = model_1(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_input_ids # For a Causal LM, labels are typically the input_ids themselves
        )
        
        loss = outputs.loss
        
        # --- Gradient Accumulation Step ---
        # Normalize the loss
        loss = loss / GRADIENT_ACCUMULATION_STEPS
        
        # Backpropagate the normalized loss
        loss.backward()
        
        # Accumulate loss for tracking
        total_train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        
        # --- Optimizer Step ---
        # Only perform an optimizer step after accumulating for the specified number of steps.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model_1.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad() # Reset gradients after the step
            progress_bar.update(1)

            avg_train_loss = total_train_loss / (step + 1)
            print(f"\n  Step {progress_bar.n}/{total_steps} | Training loss: {avg_train_loss:.4f}")

print("\nTraining complete.")

  0%|          | 0/125 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.





  return fn(*args, **kwargs)



  Step 1/125 | Training loss: 7.5293

  Step 2/125 | Training loss: 7.4759

  Step 3/125 | Training loss: 7.4824

  Step 4/125 | Training loss: 7.4637

  Step 5/125 | Training loss: 7.3973

  Step 6/125 | Training loss: 7.3623

  Step 7/125 | Training loss: 7.2320

  Step 8/125 | Training loss: 7.0608

  Step 9/125 | Training loss: 6.9088

  Step 10/125 | Training loss: 6.7390

  Step 11/125 | Training loss: 6.5581

  Step 12/125 | Training loss: 6.3675

  Step 13/125 | Training loss: 6.1813

  Step 14/125 | Training loss: 5.9827

  Step 15/125 | Training loss: 5.7704

  Step 16/125 | Training loss: 5.5616

  Step 17/125 | Training loss: 5.3579

  Step 18/125 | Training loss: 5.1764

  Step 19/125 | Training loss: 5.0025

  Step 20/125 | Training loss: 4.8678

  Step 21/125 | Training loss: 4.7218

  Step 22/125 | Training loss: 4.5852

  Step 23/125 | Training loss: 4.4617

  Step 24/125 | Training loss: 4.3489

  Step 25/125 | Training loss: 4.2290

  Step 26/125 | Training loss: 4.

In [26]:
model_1.save_pretrained("./qlora-finetuned-pythia")

In [27]:
torch.cuda.empty_cache()
del model_1
del model_1_tokenizer

## Model 2 : StableLM (3B)

In [28]:
model_2 = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-3b-4e1t",
    quantization_config=qlora_config,
    device_map="cuda",
    trust_remote_code=True,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/5.59G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [29]:
model_2.to(device)

StableLmForCausalLM(
  (model): StableLmModel(
    (embed_tokens): Embedding(50304, 2560)
    (layers): ModuleList(
      (0-31): 32 x StableLmDecoderLayer(
        (self_attn): StableLmSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): StableLmRotaryEmbedding()
        )
        (mlp): StableLmMLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LayerNorm((2560,), 

In [30]:
model_2_tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")

if model_2_tokenizer.pad_token is None:
    model_2_tokenizer.pad_token = model_2_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [31]:
print(model_2_tokenizer.eos_token)

<|endoftext|>


In [32]:
encoded_text = model_2_tokenizer(test)["input_ids"]

In [33]:
encoded_text # test = "Hi! How are you?"

[12764, 2, 1359, 403, 368, 32]

In [34]:
model_2.gradient_checkpointing_enable()
model_2 = prepare_model_for_kbit_training(model_2)

In [35]:
lora_config_model_2 = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [36]:
model_2 = get_peft_model(model_2, lora_config_model_2)

In [37]:
model_2.print_trainable_parameters()

trainable params: 12,517,376 || all params: 2,807,960,576 || trainable%: 0.4458


In [43]:
def tokenize_model_2(texts):
    return model_2_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize_model_2(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = train_input_ids.clone()

In [44]:
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels) 
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE
)

In [45]:
len(train_dataloader)

500

In [46]:
optimizer = bnb_optim.PagedAdamW8bit(model_2.parameters(), lr=5e-5)

epochs = 1
total_steps = len(train_dataloader) * epochs // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [47]:
progress_bar = tqdm(range(total_steps))
model_2.train()

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Forward pass
        outputs = model_2(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels 
        )
        
        loss = outputs.loss
        
        # --- Gradient Accumulation ---
        loss = loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        
        total_train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        
        # --- Optimizer Step ---
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model_2.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            avg_train_loss = total_train_loss / (step + 1)
            print(f"\n  Step {progress_bar.n}/{total_steps} | Training loss: {avg_train_loss:.4f}")

print("\nTraining complete.")

  0%|          | 0/125 [00:00<?, ?it/s]




  return fn(*args, **kwargs)



  Step 1/125 | Training loss: 5.4718

  Step 2/125 | Training loss: 4.8463

  Step 3/125 | Training loss: 4.3565

  Step 4/125 | Training loss: 3.8744

  Step 5/125 | Training loss: 3.4299

  Step 6/125 | Training loss: 3.2003

  Step 7/125 | Training loss: 2.8569

  Step 8/125 | Training loss: 2.6543

  Step 9/125 | Training loss: 2.4970

  Step 10/125 | Training loss: 2.3785

  Step 11/125 | Training loss: 2.2581

  Step 12/125 | Training loss: 2.1530

  Step 13/125 | Training loss: 2.1133

  Step 14/125 | Training loss: 2.0667

  Step 15/125 | Training loss: 2.0464

  Step 16/125 | Training loss: 2.0216

  Step 17/125 | Training loss: 2.0162

  Step 18/125 | Training loss: 2.0143

  Step 19/125 | Training loss: 1.9978

  Step 20/125 | Training loss: 1.9963

  Step 21/125 | Training loss: 1.9774

  Step 22/125 | Training loss: 1.9747

  Step 23/125 | Training loss: 1.9595

  Step 24/125 | Training loss: 1.9529

  Step 25/125 | Training loss: 1.9168

  Step 26/125 | Training loss: 1.

In [48]:
model_2.save_pretrained("./qlora-finetuned-stableLM")