# Fine Tuning with MATH dataset - llama 2 chat model

### Set up the GPU

In [2]:
!nvidia-smi

Mon Apr  1 12:04:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.06              Driver Version: 545.23.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:1A:00.0 Off |                    0 |
| N/A   32C    P0              43W / 300W |      3MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           Off | 00000000:1B:00.0 Off |  

In [3]:
import os
gpu = os.environ["CUDA_VISIBLE_DEVICES"]="0"

### Hugging Face Login 

In [4]:
# !huggingface-cli login

### Import and load dataset

In [5]:
# Import packages
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

# import accelerate
# from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm
2024-04-01 12:04:15.015548: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-01 12:04:15.066488: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
import gc

#Force garbage collection
gc.collect()

20

In [10]:
# Get the dataset from hugging_face
math_dataset = load_dataset("hendrycks/competition_math",trust_remote_code=True, split= "train")

In [11]:
math_dataset

Dataset({
    features: ['problem', 'level', 'type', 'solution'],
    num_rows: 7500
})

In [13]:
# See how one sample looks like
print(math_dataset[0])

{'problem': 'Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \\Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3=\\boxed{0}$.'}


### Format the dataset

In [14]:
## What happens when we don't give this system prompt
# TODO 2: Tweak the system prompt
DEFAULT_SYSTEM_PROMPT = """You are a fine-tuned AI model who is a math genious. 
You can solve simple to moderate level mathematics problems. 
Follow a chain of thought approach while answering and answer in brief."""

## Options
sys_prompt = """You are a math genious. Answer the following question on basic and advanced mathematics. """

In [15]:
# TODO 3: Check the question answer format
def format_conversation_llama2(dataset):
    '''
    Formats a conversation in LLAMA2 style.

    This function takes a dataset containing a problem and its solution, and formats it into a LLAMA2-style 
    conversation.

    Args:
    - dataset (dict): A dictionary containing the problem and solution of the conversation.

    Returns:
    dict: A dictionary containing the formatted conversation.

    Example:
    >>> dataset = {'problem': 'How can I improve my coding skills?', 'solution': 'You can improve your coding skills 
    by practicing regularly and working on challenging projects.'}
    >>> formatted_conversation = format_conversation_llama2(dataset)
    >>> print(formatted_conversation)
    {'text': '<s>[INST] <<SYS>> How can I improve my coding skills? <</SYS>> You can improve your coding skills by 
    practicing regularly and working on challenging projects. </s>'}

    '''
    

    template = """<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{question}[/INST] {answer}</s>"""
    conversation = template.format(
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        question=dataset['problem'],
        answer=dataset['solution'],
    )

    return {"text": conversation}


In [16]:
math_format_dataset = math_dataset.map(
    format_conversation_llama2,
    remove_columns=math_dataset.column_names, # remove all columns; only "text" will be left
    num_proc=os.cpu_count()  # multithreaded
)

In [17]:
math_format_dataset[2]

{'text': "<s>[INST] <<SYS>>\nYou are a fine-tuned AI model who is a math genious. \nYou can solve simple to moderate level mathematics problems. \nFollow a chain of thought approach while answering and answer in brief.\n<</SYS>>\n\nWhat is the degree of the polynomial $(4 +5x^3 +100 +2\\pi x^4 + \\sqrt{10}x^4 +9)$?[/INST] This polynomial is not written in standard form.  However, we don't need to write it in standard form, nor do we need to pay attention to the coefficients.  We just look for the exponents on $x$.  We have an $x^4$ term and no other term of higher degree, so $\\boxed{4}$ is the degree of the polynomial.</s>"}

### Define the Model Parameters

In [18]:
# Get the model
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned math model name (date-month-hour-minutes)
new_model = "llama-2-7b-chat-math-01-04-12-10"

In [19]:

# Output directory where the model predictions,configuration files and checkpoints will be stored
output_dir = f"./results/{new_model}"

In [20]:
# TODO 3: Check the padding
# Load LLaMA tokenizer
# set_special_tokens = True, so the bos (beginning of sequence </s>) and eos (end of sequence </s>) token is added
tokenizer = AutoTokenizer.from_pretrained(model_name, set_special_tokens = True,  trust_remote_code=True)

# not adding mask token for now - can be used to give attention to some part of the sequence
# manually add the padding token and set it to the eos token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.save_pretrained(output_dir)

('./results/llama-2-7b-chat-math-01-04-12-10/tokenizer_config.json',
 './results/llama-2-7b-chat-math-01-04-12-10/special_tokens_map.json',
 './results/llama-2-7b-chat-math-01-04-12-10/tokenizer.json')

In [21]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [22]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [23]:
################################################################################
# TrainingArguments parameters
################################################################################

# use_flash_attention = False

# Number of training epochs
num_train_epochs = 5

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

# Logging Directory
logging_dir = f"./logs/{new_model}/"

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0 (or the only visible GPU)
device_map = {"": 0}
# TODO : Try setting it to a number instead of auto
# device_map = "auto"

In [24]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [25]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    cache_dir="/projects/barman/cache",
#     use_flash_attention_2=use_flash_attention,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["q_proj", "v_proj", "k_proj"],
    modules_to_save= ["embed_tokens", "lm_head"]
)

# View the model layers
# print(model)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:05<00:00,  2.61s/it]


In [26]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    #logging_dir= logging_dir,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
#     load_best_model_at_end=True, # This option only saves the best model and no checkpoints
    #save_strategy="no", # use this when using the load_best_model_at_end (requires save_strategy and eval_strategy to be same)
    #evaluation_strategy="epoch", 
    #logging_strategy="epoch",
    
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=math_format_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

In [27]:
model.config.to_json_file(output_dir + "/config.json")

In [28]:
# Check if needed
#torch.cuda.empty_cache()

In [None]:
# Train model
trainer.train()

# Save trained model - default directory output directory 
trainer.model.save_pretrained(output_dir)

Step,Training Loss


In [None]:
import pandas as pd
loss_outputs = pd.DataFrame(trainer.state.log_history)

# Select desired columns
desired_columns = ["loss", "learning_rate", "epoch", "step"]

# Keep only those columns in the DataFrame
loss_outputs = loss_outputs[desired_columns]

# Save the loss results
filename = f"loss-results"
filepath = os.path.join(f"results/{new_model}", filename)  # Replace with your desired path

# Save DataFrame to CSV
loss_outputs.to_csv(filepath, index=False)  # Avoid saving index column

In [None]:
# TODO : Use use_reentrant= True or False explicitly

In [None]:
log_dir = f"./results/{new_model}/runs/"
log_dir

In [None]:
from tensorboard import notebook
# log_dir = f"./results/{new_model}/runs/"
# notebook.start("--log_dir {} --port 4000".format(log_dir))

In [None]:
# !tensorboard --logdir ./results/llama-2-7b-chat-math-27-3-16-20/runs/

In [None]:
# logging.set_verbosity(logging.CRITICAL)

# prompt = "Do you know piecewise linear function?"
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# result = pipe(f"<s>[INST]  <<SYS>> {DEFAULT_SYSTEM_PROMPT} <</SYS>> {prompt} [/INST]")
# print(result[0]['generated_text'])