## Inference Code

In [3]:
!nvidia-smi

Mon Apr  1 10:41:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.06              Driver Version: 545.23.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:1A:00.0 Off |                    0 |
| N/A   32C    P0              43W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           Off | 00000000:1B:00.0 Off |  

In [4]:
import os
gpu = os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [44]:
import warnings
warnings.filterwarnings("ignore")
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer,pipeline, logging, BitsAndBytesConfig, AutoModelForCausalLM
from datasets import load_dataset
from random import randrange

In [45]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

In [74]:
# Get the model
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned math model name (date-month-hour-minutes)
new_model = "llama-2-7b-chat-math-27-3-16-20"

In [75]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = f"./results/{new_model}/"

In [76]:
output_dir

'./results/llama-2-7b-chat-math-27-3-16-20/'

In [77]:
device_map = {"": 0}

In [78]:
# Load finetuned LLM model and tokenizer
ft_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    cache_dir="/projects/barman/cache",
)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:05<00:00,  2.53s/it]


In [79]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [80]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [81]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [82]:
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    cache_dir="/projects/barman/cache",
#     use_flash_attention_2=use_flash_attention,
)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:05<00:00,  2.51s/it]


In [83]:
# Get the dataset from hugging_face
math_dataset = load_dataset("hendrycks/competition_math",trust_remote_code=True, split="train")

In [84]:
filtered_dataset = [data for data in math_dataset if data['level']=="Level 1" or data['level']=="Level 2"]
sample =  filtered_dataset[randrange(len(filtered_dataset))]

In [85]:
sample

{'problem': 'A scale drawing of a park shows that one inch represents 800 feet. A line segment in the drawing that is 4.75 inches long represents how many feet?',
 'level': 'Level 1',
 'type': 'Algebra',
 'solution': 'Each inch of the 4.75-inch line segment represents 800 feet, so the whole line segment represents $4.75\\times800=\\frac{19}{4}\\cdot800=19\\cdot200=\\boxed{3800}$ feet.'}

In [86]:
DEFAULT_SYSTEM_PROMPT = """You are a fine-tuned AI model who is a math genious. 
You can solve simple to moderate level mathematics problems. 
Follow a chain of thought approach while answering. Answer in brief. """

In [87]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)
# Run text generation pipeline with our next model
# prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=ft_model, tokenizer=tokenizer, max_length=512)
result = pipe(f"<s>[INST] <<SYS>> {DEFAULT_SYSTEM_PROMPT} <</SYS>> {sample['problem']} [/INST]")

pipe2 = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=2048)
result2 = pipe2(f"<s>[INST] <<SYS>> {DEFAULT_SYSTEM_PROMPT} <</SYS>> {sample['problem']} [/INST]")

print(f"Instruction:\n{DEFAULT_SYSTEM_PROMPT}\n")
print(f"Input:\n{sample['problem']}\n")
print(f"Generated Response with fine tuned model:\n {result[0]['generated_text'].split("[/INST]")[-1]}\n")
print(f"Generated Response with the base model:\n {result2[0]['generated_text'].split("[/INST]")[-1]}\n")
print(f"Ground Truth:\n{sample['solution']}")

Instruction:
You are a fine-tuned AI model who is a math genious. 
You can solve simple to moderate level mathematics problems. 
Follow a chain of thought approach while answering. Answer in brief. 

Input:
A scale drawing of a park shows that one inch represents 800 feet. A line segment in the drawing that is 4.75 inches long represents how many feet?

Generated Response with fine tuned model:
   Great, let's get started!

Given that one inch in the drawing represents 800 feet, and the line segment is 4.75 inches long, we can use the conversion factor to find the corresponding length in feet.

So, if one inch represents 800 feet, then the length of the line segment in feet can be calculated as:

4.75 inches × 800 feet/inch = 3600 feet

Therefore, the line segment in the drawing represents 3600 feet.

Generated Response with the base model:
   Great, let's solve this problem together! 🤔

So, we know that in this scale drawing, one inch represents 800 feet. That means that if we want to