In [None]:
! pip install tensorflow peft bitsandbytes transformers

In [2]:
! pip install accelerate datasets trl

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m26.6 MB/s[0

In [3]:
# Interactive Hugging Face Hub login: prompts for an access token and stores it
# in the local Hugging Face cache so later Hub downloads can authenticate.
# NOTE(review): presumably needed because the model loaded below is gated — confirm.
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `token2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `token2`


In [4]:
# import tensorflow
import torch
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline,logging
from trl import SFTTrainer
from peft import LoraConfig,get_peft_model

In [5]:
### QLoRA hyperparameters ###
# A LoRA config is still required when using QLoRA: the low-rank adaptation is
# applied on top of the already-quantized base model.

lora_rank = 64              # passed to LoraConfig as `r`
lora_alpha = 16             # passed to LoraConfig as `lora_alpha`
lora_dropout = 0.2          # passed to LoraConfig as `lora_dropout`

# NOTE(review): not referenced by any cell visible here (training uses
# `learning_rate`) — confirm whether this value is still needed.
lora_learning_rate = 1e-4

In [34]:
# LoRA adapter configuration for causal language modelling, built from the
# hyperparameters defined in the QLoRA section; bias terms are not trained.
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    bias='none',
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
)

In [7]:
### bitsandbytes parameters ###
# bitsandbytes is a lightweight Python wrapper around custom CUDA functions:
# 8-bit optimizers, LLM.int8() matrix multiplication, and 8-bit / 4-bit
# quantization routines.

# Flag for nested quantization
use_nested_quant = False

# 4-bit quantization type: 'fp4' or 'nf4'
bnb4bit_quant_type = 'nf4'

# Name of the torch dtype to use as the 4-bit compute dtype
bnb4bit_compute_dtype = 'float16'

In [8]:
# Resolve the dtype name held in bnb4bit_compute_dtype (a string) to the
# attribute of that name on the torch module.
# getattr is the Python built-in that looks up an attribute on an object by name.
compute_dtype = getattr(torch,bnb4bit_compute_dtype)

In [9]:
# 4-bit quantization settings that are handed to from_pretrained when the model is loaded.
bitsAndbytes_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=compute_dtype,
                                         bnb_4bit_quant_type=bnb4bit_quant_type,
                                         # driven by the use_nested_quant flag instead of a hard-coded False,
                                         # so changing the flag in the parameters cell actually takes effect
                                         bnb_4bit_use_double_quant=use_nested_quant,
                                        )

In [10]:
# Query the CUDA compute capability of the GPU as a (major, minor) pair.
# bfloat16 needs a major version of at least 8; the device used for the recorded
# run reports major version 7, so it is not compatible with bfloat16.
torch.cuda.get_device_capability()

(7, 5)

In [11]:
### training configuration ###

# --- output and run length ---
output_dir = "./results"
num_train_epochs = 1              # number of training epochs
max_steps = -1                    # number of training steps (overrides num_train_epochs)

# --- batching ---
train_batch_size_perGPU = 4       # batch size per GPU for training
eval_batch_size_perGPU = 4        # batch size per GPU for evaluation
# Number of update steps over which gradients are accumulated;
# if the setup can manage it, keeping this at 1 works fine.
gradient_accumulation_steps = 1
gradient_checkpointing = True     # enable gradient checkpointing

# --- optimisation ---
optimizer_ = "paged_adamw_32bit"  # optimizer to use
# Learning rate for the AdamW optimizer; lower learning rates tend to give
# more stable and gradual learning.
learning_rate = 2e-4
weight_decay = 0.001              # applied to all layers except bias/LayerNorm weights
max_grad_norm = 0.3               # maximum gradient norm (gradient clipping)

# --- learning-rate schedule ---
lr_scheduler_type = "cosine"
warmup_ratio = 0.03               # ratio of steps for a linear warmup (from 0 to learning rate)

In [None]:
model_name = 'google/gemma-2-9b-it'

# Load the base model with the 4-bit quantization config defined earlier.
# device_map={'': 0} loads the entire model onto GPU 0; to load on the CPU
# use 'cpu' in place of 0.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bitsAndbytes_config,
                                             device_map={'':0},
                                             attn_implementation='eager')

# trust_remote_code is deliberately left at its default (False): enabling it lets
# code shipped in the Hub repository run locally, and it is not needed for a
# tokenizer that the transformers library supports natively.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Print the loaded model's configuration for inspection.
print(model.config)

In [17]:
# Disable the model's caching of intermediate results. Caching speeds up
# generation but costs memory and brings no benefit while training.
model.config.use_cache = False

# Degree of tensor parallelism used in pretraining; 1 means no tensor splitting
# across devices.
# NOTE(review): presumably only consulted by some architectures — confirm it has
# any effect for this model.
model.config.pretraining_tp = 1

# Fall back to EOS as the padding token only when the tokenizer does not already
# define one. Overwriting an existing pad token with EOS makes padding
# indistinguishable from the real end-of-sequence token.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [18]:
# Download the databricks-dolly-15k dataset from the Hugging Face Hub in JSONL
# form and save it as databricks-dolly-15k.jsonl in the working directory.
!wget -O databricks-dolly-15k.jsonl https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl

--2024-11-27 15:08:10--  https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
Resolving huggingface.co (huggingface.co)... 18.164.174.55, 18.164.174.118, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.55|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/34/ac/34ac588cc580830664f592597bb6d19d61639eca33dc2d6bb0b6d833f7bfd552/2df9083338b4abd6bceb5635764dab5d833b393b55759dffb0959b6fcbf794ec?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27databricks-dolly-15k.jsonl%3B+filename%3D%22databricks-dolly-15k.jsonl%22%3B&Expires=1732979290&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjk3OTI5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy8zNC9hYy8zNGFjNTg4Y2M1ODA4MzA2NjRmNTkyNTk3YmI2ZDE5ZDYxNjM5ZWNhMzNkYzJkNmJiMGI2ZDgzM2Y3YmZkNTUyLzJkZjkwODMzMzhiNGFiZDZiY2ViNTYzNTc2NGRhYjVkODMzYjM5M2I1NTc1OWRm

In [19]:
import json

# Peek at the first record (and its 'context' field) to confirm the JSONL schema.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale default.
with open('databricks-dolly-15k.jsonl','r',encoding='utf-8') as f:
  for line in f:
    k = json.loads(line)
    print(k)
    print(k['context'])
    break  # only the first line is needed

{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}
Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It 

In [20]:
import json

def convert_data(path='databricks-dolly-15k.jsonl'):
  """Turn a Dolly-style JSONL file into a list of flat training strings.

  Each non-empty line of the file is parsed as a JSON object and rendered as
  "instructions:<instruction>,context:<context>,response:<response>".

  Args:
    path: Location of the JSONL file. Defaults to the file downloaded earlier
      in this notebook, so existing calls without arguments keep working.

  Returns:
    A list with one formatted string per record, in file order.

  Raises:
    json.JSONDecodeError: if a non-empty line is not valid JSON.
    KeyError: if a record lacks 'instruction', 'context' or 'response'.
  """
  template = "instructions:{instruction},context:{context},response:{response}"
  data = []
  # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
  with open(path,'r',encoding='utf-8') as f:
    for line in f:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
      record = json.loads(line)
      # Extra keys in the record (such as 'category') are ignored by format().
      data.append(template.format(**record))

  return data

In [21]:
# Build the list of formatted training strings from the downloaded JSONL file.
data = convert_data()

In [22]:
# Keep only the first 1000 formatted examples so that training finishes quickly.
num_training_examples = 1000
data_for_training = data[:num_training_examples]

In [23]:
from datasets import Dataset
import pandas as pd

# Wrap the training strings in a single-column ('input') DataFrame and convert
# it to a Hugging Face Dataset for the trainer.
pd_data = pd.DataFrame({'input': data_for_training})
hf_dataset = Dataset.from_pandas(pd_data)

In [24]:
# Trainer settings assembled from the values in the training-configuration cell.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=num_train_epochs,
                                  per_device_train_batch_size=train_batch_size_perGPU,
                                  per_device_eval_batch_size=eval_batch_size_perGPU,
                                  gradient_accumulation_steps=gradient_accumulation_steps,
                                  # honour the gradient_checkpointing switch from the configuration
                                  # cell; it was previously defined but never handed to the trainer
                                  gradient_checkpointing=gradient_checkpointing,
                                  optim=optimizer_,
                                  save_steps=0,
                                  logging_steps=25,
                                  learning_rate=learning_rate,
                                  weight_decay=weight_decay,
                                  fp16=False,
                                  bf16=False,
                                  max_grad_norm=max_grad_norm,
                                  max_steps=max_steps,
                                  warmup_ratio=warmup_ratio,
                                  group_by_length=True,                     # Group sequences into batches with same length
                                  lr_scheduler_type=lr_scheduler_type,
                                  report_to="tensorboard"
                                  )

In [35]:
# Supervised fine-tuning trainer: the quantized base model plus the LoRA config,
# trained on the 'input' text column of hf_dataset without sequence packing.
# NOTE(review): the recorded run reports that arguments used here are deprecated
# in favour of SFTConfig — migrate when the trl version is next bumped.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,
    train_dataset=hf_dataset,
    dataset_text_field='input',
    max_seq_length=None,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Sanity check that a CUDA device is visible to PyTorch.
torch.cuda.is_available()

True

In [37]:
# Start fine-tuning with the trainer configured above.
trainer.train()