In [None]:
!pip3 install transformers==4.38.2
!pip3 install -U torch datasets sentence_transformers
!pip3 install accelerate==0.27.2 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7
!pip3 install wandb




In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget so later Hub calls in this session are authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Base checkpoint that gets quantized and fine-tuned.
model_name = "google/gemma-2b"

# ---------------------------------------------------------------
# LoRA adapter: rank-8 updates injected into the listed projection modules
# ---------------------------------------------------------------
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # torch dtype name used for 4-bit compute
bnb_4bit_quant_type = "nf4"         # quantization type (fp4 or nf4)
use_nested_quant = True             # nested (double) quantization

# Resolve the dtype name to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# When running 4-bit with float16 compute, tell the user if the GPU
# (compute capability major version >= 8) could use bfloat16 instead.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Tokenizer and quantized base model (everything placed on device 0)
# ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [None]:
from datasets import load_dataset

# Download the Orca math word-problem dataset and, in batches, add the
# tokenizer's output for each "question" as extra columns.
data = load_dataset("microsoft/orca-math-word-problems-200k").map(
    lambda batch: tokenizer(batch["question"]),
    batched=True,
)

Map:   0%|          | 0/200035 [00:00<?, ? examples/s]

In [None]:
# Peek at the first training record to check its columns.
data['train'][0]

{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.',
 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.',
 'input_ids': [2,
  176666,
  603,
  573,
  235248,
  235308,
  489,
  2040,
  235265,
  8682,
  573,
  1758,
  576,
  1461,
  1064,
  24757,
  573,
  9442,
  2017,
  13042,
  1178,
  124337,
  235265],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
import transformers
from trl import SFTTrainer
from transformers.generation.utils import top_k_top_p_filtering

def formatting_func(example):
    """Render a batch of question/answer rows into training prompts.

    Args:
        example: Batched mapping of column name to a list of values. It must
            contain "question" and "answer" columns of equal length; any
            other columns are ignored.

    Returns:
        A list with one string per row, made of a "Question: " line followed
        by an "Answer: " line.
    """
    # Iterate over the rows of the batch. `len(example)` is the number of
    # *columns* in the mapping, so looping over `range(len(example))` kept
    # only that many rows per batch and silently dropped the rest (and raised
    # IndexError for batches with fewer rows than columns).
    return [
        f"Question: {question}\nAnswer: {answer}"
        for question, answer in zip(example["question"], example["answer"])
    ]

# Optimisation settings: batches of 1 with 4 accumulation steps, 500 optimizer
# steps in total, fp16 mixed precision and the paged 8-bit AdamW optimizer.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=500,
    warmup_steps=2,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
)

# Supervised fine-tuning trainer: wraps the quantized model with the LoRA
# config and builds its training text through `formatting_func`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Map:   0%|          | 0/200035 [00:00<?, ? examples/s]

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a
    three-line string (trainable count, total count, trainable percentage
    with two decimals).

    Args:
        model: Any object exposing `named_parameters()` that yields
            `(name, param)` pairs where `param` has `numel()` and
            `requires_grad`.

    Returns:
        str: The formatted summary.

    Raises:
        ZeroDivisionError: If the model has no parameters at all.
    """
    # NOTE(review): for quantized weights `numel()` may count packed storage
    # elements rather than logical parameters — confirm before quoting totals.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, is_trainable in sizes if is_trainable)
    percentage = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper returns the summary string rather than printing it, hence the explicit print().
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 9805824
all model parameters: 1525073920
percentage of trainable model parameters: 0.64%


In [None]:
# Run the fine-tuning loop with the arguments configured on `trainer`.
trainer.train()


You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.93
20,0.7737
30,0.7722
40,0.6492
50,0.7011
60,0.6822
70,0.6398
80,0.6902
90,0.6513
100,0.6423


TrainOutput(global_step=500, training_loss=0.5343336153030396, metrics={'train_runtime': 1498.4886, 'train_samples_per_second': 1.335, 'train_steps_per_second': 0.334, 'total_flos': 8946927494430720.0, 'train_loss': 0.5343336153030396, 'epoch': 2.49})

In [None]:
# Save the trained model under `new_model`; the merge cell later loads it from
# here with PeftModel.from_pretrained.
# NOTE(review): this is written as a local path but looks like a Hub repo id
# under the "google" namespace — confirm the name is intended.
new_model='google/gemma-2b-finetuned'
trainer.model.save_pretrained(new_model)

In [None]:
# Smoke test: generate up to 500 new tokens for one word problem and print
# the decoded text.
# NOTE(review): the prompt is the bare question; training text was formatted
# as "Question: ...\nAnswer: ..." — confirm whether the same template should
# be used here.
text = "If twice a number is increased by 8, the result is 28. Find the number."
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=500)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


If twice a number is increased by 8, the result is 28. Find the number.
Let's call the number we're looking for "x". According to the problem, "twice a number" can be written as 2x. The problem also states that "twice a number is increased by 8", which can be written as 2x + 8.

According to the problem, "the result is 28", which can be written as 28 = 2x + 8.

Now we can solve for x by first subtracting 8 from both sides of the equation:

28 - 8 = 2x + 8 - 8
20 = 2x

Now, we divide both sides by 2 to solve for x:

20 / 2 = 2x / 2
10 = x

Therefore, the number we're looking for is 10. To check our answer, we can set up the equation again:

Twice the number (2x) is increased by 8 (2x + 8) should equal 28:

2x + 8 = 28

Now we can solve for x again:

2x + 8 = 28
2x = 28 - 8
2x = 20
x = 20 / 2
x = 10

The result confirms that the number is indeed 10. So, the number is 10.

Let's confirm that the number is indeed 10:

Twice the number (2x) is increased by 8 (2x + 8) should equal 28:

2x + 

In [None]:
# Rebuild the model for export: load the base checkpoint in FP16 on device 0,
# attach the adapter saved under `new_model`, and merge it into the base weights.
device_map = {"": 0}
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer to save alongside the merged model: right-side padding,
# with the EOS token reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import locale
# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably a workaround so the `!` shell command below runs in
# this environment — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"


# Interactive CLI login; the token must have write permission for the pushes below.
!huggingface-cli login

# Upload the merged model to the Hub repository.
# NOTE(review): `check_pr` is not a documented `push_to_hub` argument
# (`create_pr` is) — confirm the intent; the recorded CommitInfo shows no PR
# was created.
model.push_to_hub("tushar-r-pawar/gemma-2b-finetuned", check_pr=True)

# Upload the tokenizer to the same repository so the checkpoint is self-contained.
tokenizer.push_to_hub("tushar-r-pawar/gemma-2b-finetuned",check_pr=True)





    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/tushar-r-pawar/gemma-2b-finetuned/commit/8956f518015a23fdcf101910a87e20f331187bfd', commit_message='Upload tokenizer', commit_description='', oid='8956f518015a23fdcf101910a87e20f331187bfd', pr_url=None, pr_revision=None, pr_num=None)