<a href="https://colab.research.google.com/github/ShacharYonai/DicatLM-FineTune/blob/main/fine_tune_llm_eedi_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bitsandbytes flash-attn accelerate trl peft



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Callable, Any, Tuple, List
import torch
from torch.utils.data import Dataset
from dataclasses import dataclass
import os
from pathlib import Path

In [3]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [4]:
from transformers import (AutoTokenizer,
                          AutoModel,
                          DataCollatorWithPadding,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          DataCollatorForLanguageModeling,
                          TrainingArguments,
                          AutoModelForCausalLM,
                          Trainer,)

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [6]:
misconseption = pd.read_csv("/content/data/misconception_mapping.csv")
sample_data = pd.read_csv("/content/data/sample_submission.csv")
train = pd.read_csv("/content/data/train.csv")
test = pd.read_csv("/content/data/test.csv")

In [7]:
def reshape_wide_to_long(df: pd.DataFrame, id_vars: list[str], value_vars: list[str], var_name: str, value_name: str, sorted_vars: list[str]) -> pd.DataFrame:
    df_reshape = df.melt(id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name)
    return df_reshape.sort_values(by=sorted_vars).reset_index(drop=True)

In [8]:
# select the variables for reshaping the misconception
train_tmp_mis = train.loc[:, ["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText",
                              *[var for var in train.columns if var.startswith('Mis')]]]

In [9]:
# select the variables for reshaping the answer
train_tmp_answer = train.loc[:, ["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText",
                              *[var for var in train.columns if var.startswith('Answer')]]]

In [10]:
train_misconception_reshaped = reshape_wide_to_long(df=train_tmp_mis,
                                                    id_vars=["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText"],
                                                    value_vars=[var for var in train.columns if var.startswith('Mis')],
                                                    var_name="misconception",
                                                    value_name="misconception_id",
                                                    sorted_vars=["QuestionId","misconception"])

In [11]:
train_answer_reshaped = reshape_wide_to_long(df=train_tmp_answer,
                                             id_vars=["QuestionId","ConstructName","SubjectName","CorrectAnswer","QuestionText"],
                                             value_vars=[var for var in train.columns if var.startswith('Answer')],
                                             var_name="answer",
                                             value_name="answer_id",
                                             sorted_vars=["QuestionId","answer"])

In [12]:
# concat the two data frames
train_reshaped = pd.concat([train_misconception_reshaped, train_answer_reshaped.loc[:, ["answer","answer_id"]]], axis=1)

In [13]:
# add the misconceptions texts to the reshaped train data
train_reshaped = train_reshaped.merge(misconseption, how="left", left_on="misconception_id", right_on="MisconceptionId")
train_reshaped.drop("MisconceptionId", axis=1, inplace=True)

In [14]:
# drop all the cases where misconception were not assigned
train_reshaped = train_reshaped[~train_reshaped["misconception_id"].isna()].reset_index(drop=True)

In [15]:
train_reshaped.drop(["CorrectAnswer","misconception","misconception_id","answer"], axis=1, inplace=True)

In [16]:
train_reshaped = train_reshaped.assign(txt=lambda x:
                                       "<name>\n" +
                                       x["ConstructName"] +
                                       "\n</name>\n" +
                                       "\n<subject>\n" +
                                       x["SubjectName"] +
                                       "\n</subject>\n" +
                                       "\n<question>\n" +
                                       x["QuestionText"] +
                                       "\n</question>\n" +
                                      "\n<answer>\n" +
                                      x["answer_id"] +
                                       "\n</answer>\n" +
                                      "\n<misconception>\n" +
                                      x["MisconceptionName"] +
                                      "\n</misconception>")

In [17]:
print(train_reshaped["txt"][0])

<name>
Use the order of operations to carry out calculations involving powers
</name>

<subject>
BIDMAS
</subject>

<question>
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?
</question>

<answer>
Does not need brackets
</answer>

<misconception>
Confuses the order of operations, believes addition comes before multiplication 
</misconception>


In [18]:
# shufle the data
train_reshaped = train_reshaped.sample(n=train_reshaped.shape[0], replace=False, random_state=42).reset_index(drop=True)
train, valid = train_test_split(train_reshaped, test_size=0.1, random_state=42)

In [19]:
train_prompts = train["txt"].to_list()
valid_prompts = valid["txt"].to_list()

In [20]:
model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [21]:
# create the Dataset templete
class PromptTemplet(Dataset):
  def __init__(self,
               prompts: list[str],
               tokenizer: AutoTokenizer):
    super().__init__()
    self.prompts = prompts
    self.tokenizer = tokenizer

  def __len__(self) -> int:
    """return the total number of samples"""
    return len(self.prompts)

  def __getitem__(self, index: int):
    """return one sample of image data and label"""
    prompt = self.prompts[index]
    score = tokenizer(prompt, return_tensors='pt')  #  max_length=512, padding='max_length',
    return {
        'prompt_text': prompt,
        'input_ids': score['input_ids'].squeeze(),
        'attention_mask': score['attention_mask'].squeeze()
    }

In [22]:
train_dataset = PromptTemplet(prompts=train_prompts, tokenizer=tokenizer)
valid_dataset = PromptTemplet(prompts=valid_prompts, tokenizer=tokenizer)

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [24]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )

In [25]:
lora_config = LoraConfig(
        r=16,
        lora_alpha=8,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

In [26]:
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             #torch_dtype=torch.bfloat16,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [28]:
model = get_peft_model(model, lora_config)

In [29]:
model.config.pad_token_id = tokenizer.eos_token_id

In [30]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 3145728 | total: 3824225280 | Percentage: 0.0823%


In [31]:
@dataclass
class TrainModel:
    output_dir: str
    run_name: str
    trainer_type: Callable
    model: Any
    tokenizer: Any
    train_data: Dataset
    valid_data: Dataset
    overwrite_output_dir: bool = True
    learning_rate: float = 0.000015
    auto_find_batch_size: bool = True
    per_device_train_batch_size: int = 8
    per_device_eval_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    gradient_checkpointing: bool = True
    eval_accumulation_steps = 10
    warmup_steps: int = 20
    save_total_limit: int = 5
    num_train_epochs: int = 5
    weight_decay: float = 0.0001
    optim: str = "adamw_torch"  # "adamw_torch"  # "adamw_torch" 'lion_32bit'
    logging_steps = 1
    fp16: bool = True
    bf16: bool = True
    tf32: bool = True
    max_grad_norm: float = 0.3
    warmup_ratio: float = 0.03
    lr_scheduler_type: str = "cosine"
    eval_strategy: str = "steps"  # "epoch"
    save_strategy: str = "steps"  # "epoch"
    load_best_model_at_end: bool = True
    dataset_text_field: str = "prompt_text"
    metric_for_best_model: str = 'loss'
    push_to_hub: bool = False
    report_to = "wandb"

    @property
    def data_collator(self):
        return DataCollatorForLanguageModeling(self.tokenizer, mlm=False)

    @property
    def training_args(self):
        return TrainingArguments(
            output_dir=self.output_dir,
            run_name=self.run_name,
            overwrite_output_dir=self.overwrite_output_dir,
            learning_rate=self.learning_rate,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            auto_find_batch_size=self.auto_find_batch_size,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            eval_accumulation_steps=self.eval_accumulation_steps,
            # gradient_checkpointing=self.gradient_checkpointing,
            warmup_steps=self.warmup_steps,
            save_total_limit=self.save_total_limit,
            num_train_epochs=self.num_train_epochs,
            weight_decay=self.weight_decay,
            optim=self.optim,
            logging_steps=self.logging_steps,
            fp16=self.fp16,
            # bf16=self.bf16,
            # tf32=self.tf32,
            # max_grad_norm=self.max_grad_norm,
            # warmup_ratio=self.warmup_ratio,
            lr_scheduler_type=self.lr_scheduler_type,
            eval_strategy=self.eval_strategy,
            save_strategy=self.save_strategy,
            metric_for_best_model=self.metric_for_best_model,
            load_best_model_at_end=self.load_best_model_at_end,
            push_to_hub=self.push_to_hub,
            #report_to=self.report_to
        )

    @property
    def train(self):
        trainer_obj = SFTTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_data,
            eval_dataset=self.valid_data,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
        )
        return trainer_obj.train()

In [32]:
torch.cuda.empty_cache()

In [33]:
trainer = TrainModel(output_dir="fine_tune_phi_3.5",
                     run_name="fine_tunning_phi_3.5_eedi",
                     trainer_type=Trainer,
                     model=model,
                     tokenizer=tokenizer,
                     train_data=train_dataset,
                     valid_data=valid_dataset
                    )

In [34]:
trainer.train

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshachar-yonai[0m ([33mshachar-yonai-st[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
1,2.0646,2.023294
2,2.0467,2.023271
3,2.0939,2.023219
4,2.0318,2.023377
5,2.041,2.02328
6,2.0567,2.023184
7,2.0216,2.023164
8,2.033,2.023067
9,2.0775,2.02311
10,1.9735,2.023


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=615, training_loss=1.3940291846670756, metrics={'train_runtime': 12499.1926, 'train_samples_per_second': 1.573, 'train_steps_per_second': 0.049, 'total_flos': 9.415823519558246e+16, 'train_loss': 1.3940291846670756, 'epoch': 5.0})

In [36]:
trainer.model.save_pretrained("fine-tune-phi-3.5-mini-instruct-eedi")

In [184]:
# import torch
from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer
path_to_model = "/content/fine_tune_phi_3.5/checkpoint-615"

peft_model_id = path_to_model # "/content/fine_tune_phi_3.5/checkpoint-61"  # "/content/fine-tune-phi-3.5-mini-instruct-eedi"
config = PeftConfig.from_pretrained(peft_model_id)

In [185]:
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_4bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [186]:
# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

In [187]:
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnit

In [41]:
import re

In [188]:
number=17
print(valid_prompts[number])

<name>
Given the area of a parallelogram, calculate a missing dimension
</name>

<subject>
Area of Simple Shapes
</subject>

<question>
The area of this parallelogram is \( 24 \mathrm{~cm}^{2} \)

What measurement should replace the star? ![Parallelogram with base length 6cm and the perpendicular height has a star symbol.]()
</question>

<answer>
\( 2 \mathrm{~cm} \)
</answer>

<misconception>
When finding a missing side of a parallelogram given the area completes an extra step of dividing by 2
</misconception>


In [163]:
valid_prompts[number]

'<name>\nIdentify cross sections of 3D shapes\n</name>\n\n<subject>\nVolume of Prisms\n</subject>\n\n<question>\nThis is the uniform cross-section of solid \\( Q \\).\n\nWhich of the following could shape \\( Q \\) be? ![A rectangle.]()\n</question>\n\n<answer>\nCube\n</answer>\n\n<misconception>\nMixes up cubes and cuboids\n</misconception>'

In [189]:
txt_to_delete = re.findall("<misconception>\n(.*\n</misconception>)", valid_prompts[number])[0]

In [190]:
p=re.sub(txt_to_delete, "", valid_prompts[number])

In [191]:
print(p)

<name>
Given the area of a parallelogram, calculate a missing dimension
</name>

<subject>
Area of Simple Shapes
</subject>

<question>
The area of this parallelogram is \( 24 \mathrm{~cm}^{2} \)

What measurement should replace the star? ![Parallelogram with base length 6cm and the perpendicular height has a star symbol.]()
</question>

<answer>
\( 2 \mathrm{~cm} \)
</answer>

<misconception>



In [46]:
from transformers import TextStreamer

In [111]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [192]:
inputs = tokenizer(p, return_tensors="pt")

In [193]:
generated_ids = model.generate(**inputs, streamer=streamer, max_new_tokens=30, do_sample=True, pad_token_id=tokenizer.eos_token_id)



Did not understand the formula to calculate the area of parallelograms
</misconception>

<misconceptual-question


In [194]:
decoded = tokenizer.batch_decode(generated_ids)

In [195]:
print(decoded[0])

<name>
Given the area of a parallelogram, calculate a missing dimension
</name>

<subject>
Area of Simple Shapes
</subject>

<question>
The area of this parallelogram is \( 24 \mathrm{~cm}^{2} \)

What measurement should replace the star? ![Parallelogram with base length 6cm and the perpendicular height has a star symbol.]()
</question>

<answer>
\( 2 \mathrm{~cm} \)
</answer>

<misconception>
Did not understand the formula to calculate the area of parallelograms
</misconception>

<misconceptual-question


In [196]:
print(valid_prompts[number])

<name>
Given the area of a parallelogram, calculate a missing dimension
</name>

<subject>
Area of Simple Shapes
</subject>

<question>
The area of this parallelogram is \( 24 \mathrm{~cm}^{2} \)

What measurement should replace the star? ![Parallelogram with base length 6cm and the perpendicular height has a star symbol.]()
</question>

<answer>
\( 2 \mathrm{~cm} \)
</answer>

<misconception>
When finding a missing side of a parallelogram given the area completes an extra step of dividing by 2
</misconception>


In [183]:
from google.colab import files
files.download("/content/fine-tune-phi-3.5-mini-instruct-eedi")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [99]:
# Clear the memory footprint
del model, trainer
torch.cuda.empty_cache()

In [100]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map= {"": 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [104]:
new_model = AutoModelForCausalLM.from_pretrained(peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [105]:
model = PeftModel.from_pretrained(base_model, new_model)

TypeError: expected str, bytes or os.PathLike object, not Phi3ForCausalLM

In [None]:
model = model.merge_and_unload()

In [None]:
# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"