# Installing Dependencies

In [1]:
! pip install accelerate transformers einops datasets peft bitsandbytes

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Importing Dependencies

In [3]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import os

# Finetuning

In [4]:
## Load tokenzier
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

### Explan :
tokenizer.pad_token = tokenizer.eos_token is used to set the padding token of the tokenizer to be the same as the end-of-sequence (EOS) token.

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

## Load the model
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [6]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_laye

In [7]:
config = LoraConfig(
    r=16,                   ## if we increses the r valu, the number of trainable param will be incr
    lora_alpha=16,
    # target_modules=["Wqkv", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 11,010,048 || all params: 1,429,280,768 || trainable%: 0.7703


###   r=16, if we increses the value, the number of trainable param will be incr

### Explan:
the target_modules parameter specifies which layers or modules of the neural network will be adapted using the low-rank matrices.

In [8]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2048)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleD

In [9]:
def tokenize(sample):
    model_inps =  tokenizer(sample["text"], padding=True, truncation=True, max_length=512)
    return model_inps

## Dataset load and data prepration

In [10]:
data = load_dataset("gsm8k", "main", split="train")
data

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

### data_df = data.to_pandas()
#### Converting Dataset object into pandas dataframe

In [11]:
data_df = data.to_pandas()
data_df

Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...
...,...,...
7468,"Very early this morning, Elise left home in a ...","For the distance she traveled, Elise paid 23 -..."
7469,Josh is saving up for a box of cookies. To rai...,He makes $.5 profit on each bracelet because 1...
7470,Colin can skip at six times the speed that Bra...,Tony can skip at twice the speed that Bruce ca...
7471,"Janet, a third grade teacher, is picking up th...",Janet needs 35 lunches for the kids + 5 for th...


In [12]:

data_df["text"] = data_df[["question", "answer"]].apply(lambda x: "question: " + x["question"] + " answer: " + x["answer"], axis=1)
data_df

Unnamed: 0,question,answer,text
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...,question: Natalia sold clips to 48 of her frie...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...,question: Weng earns $12 an hour for babysitti...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<...",question: Betty is saving money for a new wall...
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....,question: Julie is reading a 120-page book. Ye...
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...,question: James writes a 3-page letter to 2 di...
...,...,...,...
7468,"Very early this morning, Elise left home in a ...","For the distance she traveled, Elise paid 23 -...","question: Very early this morning, Elise left ..."
7469,Josh is saving up for a box of cookies. To rai...,He makes $.5 profit on each bracelet because 1...,question: Josh is saving up for a box of cooki...
7470,Colin can skip at six times the speed that Bra...,Tony can skip at twice the speed that Bruce ca...,question: Colin can skip at six times the spee...
7471,"Janet, a third grade teacher, is picking up th...",Janet needs 35 lunches for the kids + 5 for th...,"question: Janet, a third grade teacher, is pic..."


In [13]:
data_df['text'][0]

'question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'

### data = Dataset.from_pandas(data_df)
#### is used to create a dataset object from a pandas DataFrame using the datasets library, which is part of the Hugging Face ecosystem. Here’s a detailed explanation of what this doe

In [14]:
data = Dataset.from_pandas(data_df)
data

Dataset({
    features: ['question', 'answer', 'text'],
    num_rows: 7473
})

In [15]:
data.column_names

['question', 'answer', 'text']

In [16]:
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)
tokenized_data

Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7473
})

In [17]:
training_arguments = TrainingArguments(
        output_dir="phi-1_5-finetuned-shrawan",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=100,
        max_steps=1000,
        num_train_epochs=1,
        push_to_hub=True
    )

### Explan
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False): The data collator is responsible for batching the data. In this case, DataCollatorForLanguageModeling is used with mlm=False, indicating that the model is being trained for causal language modeling (i.e., predicting the next token).

In [18]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()


max_steps is given, it will override any value given in num_train_epochs
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
100,1.1301
200,1.0601
300,1.0277
400,1.0363
500,1.0389
600,1.0153
700,0.9701
800,1.0002
900,1.0137
1000,1.015


TrainOutput(global_step=1000, training_loss=1.0307448501586913, metrics={'train_runtime': 923.037, 'train_samples_per_second': 4.334, 'train_steps_per_second': 1.083, 'total_flos': 1.330425453797376e+16, 'train_loss': 1.0307448501586913, 'epoch': 0.5350454788657036})

In [19]:
torch.cuda.empty_cache()

### Push the Trained Model to the Hugging Face Hub:
After training, the model can be pushed to the Hugging Face Hub for sharing and collaboration.

In [None]:
# trainer.push_to_hub()


CommitInfo(commit_url='https://huggingface.co/AISAG/phi-1_5-finetuned-shrawan/commit/34379d8993895f4c2bd2a4634d1ef74fb6bb22f7', commit_message='End of training', commit_description='', oid='34379d8993895f4c2bd2a4634d1ef74fb6bb22f7', pr_url=None, pr_revision=None, pr_num=None)

# Saving

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch

# Loads a pre-trained causal language model from the Hugging Face Model Hub.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype=torch.float32)

# Loads a fine-tuned model on top of the base model using the PEFT library
peft_model = PeftModel.from_pretrained(model, "phi-1_5-finetuned-shrawan", from_transformers=True)

# Merges the fine-tuned parameters with the base model and unloads the PEFT wrapper.
model = peft_model.merge_and_unload()
model

# Inference

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("phi-1_5-finetuned-shrawan", trust_remote_code=True, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
inputs = tokenizer('''question: I have 2 apples. My friend gave me another two apples. I ate 1 apple. Totally how many I have now? answer: ''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=512)
text = tokenizer.batch_decode(outputs)[0]
print(text)
