In [None]:
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

np.random.seed(42)

def load_imdb_dataset():
    imdb_dataset = load_dataset("imdb")
    N = 1000
    rand_idx = np.random.randint(24999, size=N)
    
    x_train = imdb_dataset['train'][rand_idx]['text']
    y_train = imdb_dataset['train'][rand_idx]['label']
    
    x_test = imdb_dataset['test'][rand_idx]['text']
    y_test = imdb_dataset['test'][rand_idx]['label']
    
    return DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})    

dataset = load_imdb_dataset()

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [3]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

np.float64(0.504)

## QLoRA Configuration

In [5]:
import torch
from transformers.utils.quantization_config import BitsAndBytesConfig

qlora_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

## Model : TinyLlama-1.1B-Chatv1.0

In [None]:
from transformers import AutoModelForCausalLM

model_1 = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    num_labels = 2,
    quantization_config = qlora_config,
    output_attentions=False,
    output_hidden_states=False
)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
model_1.to(device)

In [None]:
from transformers import AutoTokenizer

model_1_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

if model_1_tokenizer.pad_token is None:
    model_1_tokenizer.pad_token = model_1_tokenizer.eos_token

In [10]:
print(model_1_tokenizer.eos_token)

</s>


In [11]:
test = "Hi! How are you?"

In [12]:
encoded_text = model_1_tokenizer(test)["input_ids"]

In [13]:
encoded_text

[1, 6324, 29991, 1128, 526, 366, 29973]

## LoRA Configuration

In [14]:
model_1.gradient_checkpointing_enable()

In [15]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

model_1 = prepare_model_for_kbit_training(model_1)

In [16]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [17]:
model_1 = get_peft_model(model_1, lora_config)

In [18]:
model_1.print_trainable_parameters()

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701


In [19]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

import re

for i in range(len(train_texts)):
    train_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", train_texts[i])
    train_texts[i] = re.sub(r"\s+", " ", train_texts[i]).strip()
for i in range(len(test_texts)):
    test_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", test_texts[i])
    test_texts[i] = re.sub(r"\s+", " ", test_texts[i]).strip()

In [20]:
def tokenize(texts):
    return model_1_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = torch.tensor(train_labels)
    
# Tokenize test data
test_encodings = tokenize(test_texts)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = torch.tensor(test_labels)

In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE
)

In [22]:
len(train_dataloader)

500

In [23]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import bitsandbytes.optim as bnb_optim


optimizer = bnb_optim.PagedAdamW8bit(model_1.parameters(), lr=5e-5)

epochs = 1
total_steps = len(train_dataloader) * epochs // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [24]:
from tqdm.notebook import tqdm
progress_bar = tqdm(range(total_steps))

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    
    model_1.train()
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # We don't call `model_1.zero_grad()` here yet because we are accumulating.
        
        # The model calculates loss internally when `labels` are provided.
        # The `labels` argument needs to be the same as `input_ids` for Causal LM loss.
        outputs = model_1(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_input_ids # For a Causal LM, labels are typically the input_ids themselves
        )
        
        loss = outputs.loss
        
        # --- Gradient Accumulation Step ---
        # Normalize the loss
        loss = loss / GRADIENT_ACCUMULATION_STEPS
        
        # Backpropagate the normalized loss
        loss.backward()
        
        # Accumulate loss for tracking
        total_train_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        
        # --- Optimizer Step ---
        # Only perform an optimizer step after accumulating for the specified number of steps.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model_1.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad() # Reset gradients after the step
            progress_bar.update(1)

            avg_train_loss = total_train_loss / (step + 1)
            print(f"\n  Step {progress_bar.n}/{total_steps} | Training loss: {avg_train_loss:.4f}")

print("\nTraining complete.")

  0%|          | 0/125 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.





  return fn(*args, **kwargs)



  Step 1/125 | Training loss: 6.7019

  Step 2/125 | Training loss: 6.5796

  Step 3/125 | Training loss: 6.1343

  Step 4/125 | Training loss: 5.8453

  Step 5/125 | Training loss: 5.4075

  Step 6/125 | Training loss: 5.0608

  Step 7/125 | Training loss: 4.6996

  Step 8/125 | Training loss: 4.3935

  Step 9/125 | Training loss: 4.1953

  Step 10/125 | Training loss: 4.0272

  Step 11/125 | Training loss: 3.8350

  Step 12/125 | Training loss: 3.6478

  Step 13/125 | Training loss: 3.4740

  Step 14/125 | Training loss: 3.3412

  Step 15/125 | Training loss: 3.2929

  Step 16/125 | Training loss: 3.1967

  Step 17/125 | Training loss: 3.1356

  Step 18/125 | Training loss: 3.0441

  Step 19/125 | Training loss: 2.9901

  Step 20/125 | Training loss: 2.9497

  Step 21/125 | Training loss: 2.8749

  Step 22/125 | Training loss: 2.8294

  Step 23/125 | Training loss: 2.7675

  Step 24/125 | Training loss: 2.7085

  Step 25/125 | Training loss: 2.6503

  Step 26/125 | Training loss: 2.

In [25]:
model_1.save_pretrained("./qlora-finetuned-imdb-final")