In [None]:
! pip install tensorflow peft bitsandbytes transformers accelerate datasets trl

In [2]:
# import tensorflow
import torch
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline,logging
from trl import SFTTrainer
from peft import LoraConfig,get_peft_model

In [3]:
### bitsandbytes parameters ###
# The bitsandbytes library is a lightweight Python wrapper around CUDA custom functions, particularly designed for 8-bit optimizers,
# matrix multiplication (LLM.int8()), and 8-bit and 4-bit quantization functions

bnb4bit_compute_dtype = 'bfloat16'

# Quantization type (fp4 or nf4)
bnb4bit_quant_type = 'nf4'

use_nested_quant = False

In [4]:
compute_dtype = getattr(torch,bnb4bit_compute_dtype)
# fetch the value of bnb4bit_compute_dtype from the torch module.

# getattr is a built-in Python function that retrieves an attribute from an object.

In [5]:
bitsAndbytes_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=compute_dtype,
                                         bnb_4bit_quant_type=bnb4bit_quant_type,
                                         bnb_4bit_use_double_quant=True,
                                        )

In [6]:
from google.colab import userdata

access_token = userdata.get('HF_TOKEN')

In [7]:
model_name = 'google/gemma-2-2b'

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,max_length=150)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [8]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bitsAndbytes_config,
                                             device_map='auto',
                                             attn_implementation = 'eager',
                                             token=access_token)
# device_map is where to load the entire model
# 0 which means to the GPU.
# for CPU we have to use 'cpu' instead of 0

# Disables the use of caching during model inference.
model.config.use_cache = False
# Caching stores intermediate results to speed up future computations. Turning it off might be necessary if caching leads to high memory consumption
# or isn't beneficial for our task.

# Sets the degree of tensor parallelism for pretraining.
model.config.pretraining_tp = 1
# Tensor parallelism splits the model tensors across multiple devices (e.g., GPUs) to speed up training. A value of 1 means no tensor splitting

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
print(f"{model.get_memory_footprint()/1e9:,.1f} GB")

2.2 GB


In [None]:
help(AutoModelForCausalLM)

***Generating before fine-tuning***

In [11]:
question = 'When did Virgin Australia start operating'
input_ = tokenizer.encode(question,return_tensors='pt').to('cuda')
response = model.generate(input_).to('cuda')

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


In [12]:
response = tokenizer.decode(response[0],skip_special_tokens=True)
print(response)

When did Virgin Australia start operating?

Virgin Australia started operating in 1999.

What is the Virgin Australia Group


***Tuning Phase***

In [13]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm):

In [14]:
Target_modules = ['q_proj','k_proj','v_proj','o_proj']

In [25]:
### QLORA hyperparameters ###

lora_learning_rate = 1e-4
lora_rank = 8
lora_dropout = 0.2
lora_alpha = 16               # double of lora rank

# even using QLORA lora config is required because LORA low rank optimization is applied after quantization
# alpha should be double the rank

In [16]:
peft_config = LoraConfig(r=lora_rank,
           lora_alpha=lora_alpha,
           lora_dropout=lora_dropout,
           bias='none',
           task_type='CAUSAL_LM',
           target_modules=Target_modules)

***Data Preparation***

In [17]:
!wget -O databricks-dolly-15k.jsonl https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
# this dataset is from huggingface and converted to jsonl

--2024-12-28 01:17:39--  https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
Resolving huggingface.co (huggingface.co)... 18.164.174.23, 18.164.174.17, 18.164.174.118, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.23|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/34/ac/34ac588cc580830664f592597bb6d19d61639eca33dc2d6bb0b6d833f7bfd552/2df9083338b4abd6bceb5635764dab5d833b393b55759dffb0959b6fcbf794ec?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27databricks-dolly-15k.jsonl%3B+filename%3D%22databricks-dolly-15k.jsonl%22%3B&Expires=1735607859&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNTYwNzg1OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy8zNC9hYy8zNGFjNTg4Y2M1ODA4MzA2NjRmNTkyNTk3YmI2ZDE5ZDYxNjM5ZWNhMzNkYzJkNmJiMGI2ZDgzM2Y3YmZkNTUyLzJkZjkwODMzMzhiNGFiZDZiY2ViNTYzNTc2NGRhYjVkODMzYjM5M2I1NTc1OWRm

In [18]:
import json

with open('databricks-dolly-15k.jsonl','r') as f:
  for line in f:
    k = json.loads(line)
    print(k)
    print(k['context'])
    break

{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}
Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It 

In [19]:
import json

def convert_data():
  data = []
  with open('databricks-dolly-15k.jsonl','r') as f:
    for line in f:
      line = json.loads(line)
      string_ = "instructions:{instruction},context:{context},response:{response}"
      data.append(string_.format(**line))

  return data

In [20]:
data = convert_data()

In [21]:
# we are only training on 1000 for quick training
data_for_training = data[:1000]

In [22]:
from datasets import Dataset
import pandas as pd

pd_data = pd.DataFrame(data_for_training,columns=['text'])
hf_dataset = Dataset.from_pandas(pd_data)

***Training Phase***

In [23]:
### training configuration ###

output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Batch size per GPU for training
train_batch_size_perGPU = 1

# Batch size per GPU for evaluation
eval_batch_size_perGPU = 1

# Number of update steps to accumulate the gradients for
# if our setup can manage it, keeping it simple with 1 works fine
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# learning rate (AdamW optimizer)
# Lower learning rates tend to provide more stable and gradual learning.
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optimizer_ = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

In [24]:
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=num_train_epochs,
                                  per_device_train_batch_size=train_batch_size_perGPU,
                                  per_device_eval_batch_size=eval_batch_size_perGPU,
                                  gradient_accumulation_steps=gradient_accumulation_steps,
                                  optim=optimizer_,
                                  save_steps=0,
                                  logging_steps=25,
                                  learning_rate=learning_rate,
                                  weight_decay=weight_decay,
                                  fp16=False,
                                  bf16=True,
                                  max_grad_norm=max_grad_norm,
                                  max_steps=max_steps,
                                  warmup_ratio=warmup_ratio,
                                  group_by_length=True,                     # Group sequences into batches with same length
                                  lr_scheduler_type=lr_scheduler_type,
                                  report_to="tensorboard"
                                  )

In [26]:
trainer = SFTTrainer(model=model,
                     args=training_args,
                     peft_config=peft_config,
                     train_dataset=hf_dataset,
                     tokenizer=tokenizer,
                    )

  trainer = SFTTrainer(model=model,


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [27]:
torch.cuda.empty_cache()

In [29]:
trainer.train()

Step,Training Loss
25,2.3374
50,2.4184
75,2.0282
100,1.6882
125,1.8503
150,1.7668
175,1.617
200,1.8102
225,1.9258
250,1.8447


TrainOutput(global_step=1000, training_loss=1.8509483871459962, metrics={'train_runtime': 1245.9477, 'train_samples_per_second': 0.803, 'train_steps_per_second': 0.803, 'total_flos': 2125253862991872.0, 'train_loss': 1.8509483871459962, 'epoch': 1.0})

In [30]:
trainer.model.save_pretrained('finetuned_model')

In [61]:
del model
del trainer
import gc
gc.collect()
gc.collect()

47

***Merging Weights of Lora Config with Base model***

In [62]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [63]:
model = PeftModel.from_pretrained(base_model,'finetuned_model')
model = model.merge_and_unload()

# reloading tokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

***Pushing to huggingface models***

In [64]:
import locale

locale.preferred_encoding = lambda: "UTF-8"

In [65]:
name = "shiv-am-04/gemma2-2b-finetuned"

!huggingface-cli login

model.push_to_hub(name, check_pr=True)

tokenizer.push_to_hub(name,check_pr=True)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/shiv-am-04/gemma2-2b-finetuned/commit/a48366595483dd4baf535ad627c957a5cc3e1abb', commit_message='Upload tokenizer', commit_description='', oid='a48366595483dd4baf535ad627c957a5cc3e1abb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shiv-am-04/gemma2-2b-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='shiv-am-04/gemma2-2b-finetuned'), pr_revision=None, pr_num=None)

***Generation After FineTuning***

In [66]:
from transformers import pipeline

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)

Device set to use cuda:0


In [68]:
response = pipe('When did Virgin Australia start operating',truncation=True)

In [69]:
print(response[0]['generated_text'])

When did Virgin Australia start operating?

Virgin Australia started operating in 2000.

What is the Virgin Australia fleet size?

Virgin Australia has 110 aircraft in its fleet.

What is the Virgin Australia hub airport?

Virgin Australia has its hub at Melbourne Airport.

What is the Virgin Australia loyalty program?

Virgin Australia has its loyalty program called Velocity.

What is the Virgin Australia website?

Virgin Australia's website is www.virginaustralia.com.

What is the Virgin Australia app?

Virgin Australia has its app called Virgin Australia.

What is the Virgin Australia phone number?

Virgin Australia's phone number is 13 67 89.
