In [1]:
import sys
import site
import os

# Install the required packages
!{sys.executable} -m pip install peft==0.10.0 datasets ipywidgets transformers==4.38.2 wandb trl==0.8.3 accelerate==0.27.2 bitsandbytes==0.43.0 scipy==1.12.0


# Get the site-packages directory
site_packages_dir = site.getsitepackages()[0]

# add the site pkg directory where these pkgs are insalled to the top of sys.path
if not os.access(site_packages_dir, os.W_OK):
    user_site_packages_dir = site.getusersitepackages()
    if user_site_packages_dir in sys.path:
        sys.path.remove(user_site_packages_dir)
    sys.path.insert(0, user_site_packages_dir)
else:
    if site_packages_dir in sys.path:
        sys.path.remove(site_packages_dir)
    sys.path.insert(0, site_packages_dir)

Defaulting to user installation because normal site-packages is not writeable
[0m

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import psutil

num_physical_cores = psutil.cpu_count(logical=False)
num_cores_per_socket = num_physical_cores // 2

os.environ["TOKENIZERS_PARALLELISM"] = "0"
#HF_TOKEN = os.environ["HF_TOKEN"]

# Set the LD_PRELOAD environment variable
ld_preload = os.environ.get("LD_PRELOAD", "")
conda_prefix = os.environ.get("CONDA_PREFIX", "")
# Improve memory allocation performance, if tcmalloc is not available, please comment this line out
os.environ["LD_PRELOAD"] = f"{ld_preload}:{conda_prefix}/lib/libtcmalloc.so"
# Reduce the overhead of submitting commands to the GPU
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
# reducing memory accesses by fusing SDP ops
os.environ["ENABLE_SDP_FUSION"] = "1"
# set openMP threads to number of physical cores
os.environ["OMP_NUM_THREADS"] = str(num_physical_cores)
# Set the thread affinity policy
os.environ["OMP_PROC_BIND"] = "close"
# Set the places for thread pinning
os.environ["OMP_PLACES"] = "cores"

print(f"Number of physical cores: {num_physical_cores}")
print(f"Number of cores per socket: {num_cores_per_socket}")
print(f"OpenMP environment variables:")
print(f"  - OMP_NUM_THREADS: {os.environ['OMP_NUM_THREADS']}")
print(f"  - OMP_PROC_BIND: {os.environ['OMP_PROC_BIND']}")
print(f"  - OMP_PLACES: {os.environ['OMP_PLACES']}")

Number of physical cores: 96
Number of cores per socket: 48
OpenMP environment variables:
  - OMP_NUM_THREADS: 96
  - OMP_PROC_BIND: close
  - OMP_PLACES: cores


In [3]:
import asyncio
import threading
import torch
from IPython.display import display, HTML

import torch
import intel_extension_for_pytorch as ipex

if torch.xpu.is_available():
    torch.xpu.empty_cache()
    
    def get_memory_usage():
        memory_reserved = round(torch.xpu.memory_reserved() / 1024**3, 3)
        memory_allocated = round(torch.xpu.memory_allocated() / 1024**3, 3)
        max_memory_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 3)
        max_memory_allocated = round(torch.xpu.max_memory_allocated() / 1024**3, 3)
        return memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated
   
    def print_memory_usage():
        device_name = torch.xpu.get_device_name()
        print(f"XPU Name: {device_name}")
        memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
        memory_usage_text = f"XPU Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
        print(f"\r{memory_usage_text}", end="", flush=True)
    
    async def display_memory_usage(output):
        device_name = torch.xpu.get_device_name()
        output.update(HTML(f"<p>XPU Name: {device_name}</p>"))
        while True:
            memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
            memory_usage_text = f"XPU ({device_name}) :: Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
            output.update(HTML(f"<p>{memory_usage_text}</p>"))
            await asyncio.sleep(5)
    
    def start_memory_monitor(output):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.create_task(display_memory_usage(output))
        thread = threading.Thread(target=loop.run_forever)
        thread.start()    
    output = display(display_id=True)
    start_memory_monitor(output)
else:
    print("XPU device not available.")

  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
    registered at /build/pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /build/pytorch/build/aten/src/ATen/RegisterCPU.cpp:30476
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:2971 (function operator())
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


In [4]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # could use q, v and 0 projections as well and comment out the rest
    target_modules=["q_proj", "o_proj", 
                    "v_proj", "k_proj", 
                    "gate_proj", "up_proj",
                    "down_proj"],
    task_type="CAUSAL_LM")

/home/u36e87926e39e265cdd3b4f969ee677d/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

USE_CPU = False
device = "xpu:0" if torch.xpu.is_available() else "cpu"
if USE_CPU:
    device = "cpu"
print(f"using device: {device}")

model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
# Set padding side to the right to ensure proper attention masking during fine-tuning
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# Disable caching mechanism to reduce memory usage during fine-tuning
model.config.use_cache = False
# Configure the model's pre-training tensor parallelism degree to match the fine-tuning setup
model.config.pretraining_tp = 1 

using device: xpu:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
def generate_response(model, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)    
    outputs = model.generate(input_ids, max_new_tokens=100,
                             eos_token_id=tokenizer.eos_token_id)    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def test_model(model, test_inputs):
    """quickly test the model using queries."""
    for input_text in test_inputs:
        print("__"*25)
        generated_response = generate_response(model, input_text)
        print(f"{input_text}")
        print(f"Generated Answer: {generated_response}\n")
        print("__"*25)

test_inputs = [
    "What are the main differences between a vegetarian and a vegan diet?",
    "What are some effective strategies for managing stress and anxiety?",
    "Can you explain the concept of blockchain technology in simple terms?",
    "What are the key factors that influence the price of crude oil in global markets?",
    "When did Virgin Australia start operating?"
]

print("Testing the model before fine-tuning:")
test_model(model, test_inputs)

Testing the model before fine-tuning:
__________________________________________________
What are the main differences between a vegetarian and a vegan diet?
Generated Answer: What are the main differences between a vegetarian and a vegan diet?

A 100-W lightbulb is plugged into a standard $120-\mathrm{V}$ (rms) outlet. Find $(a) I_{\text {mas }}$ and $(b) I_{\max }$ if a "slow-motion" camera were able to show the amplitude of the current more than once in $1.00 \mathrm{~ms}$.

A 100-turn coil has a radius of 4.50 cm and a resistance

__________________________________________________
__________________________________________________
What are some effective strategies for managing stress and anxiety?
Generated Answer: What are some effective strategies for managing stress and anxiety?

Answer:

Step 1/10
1. Exercise: Regular exercise can help reduce stress and anxiety by releasing endorphins, which are natural mood-boosting chemicals in the brain.

Step 2/10
2. Meditation: Practicing 

In [7]:
from datasets import load_dataset

dataset_name = "lavita/ChatDoctor-HealthCareMagic-100k"
dataset = load_dataset(dataset_name, split="train")

filtered_dataset = dataset.shuffle(seed=1).select(range(int(len(dataset) * 0.02)))
dataset = filtered_dataset

print(f"Input is: {dataset[0]['input']}")
print(f"Output is: {dataset[0]['output']}")

# Remove unwanted fields from the filtered dataset
dataset = filtered_dataset.remove_columns(["instruction"])
print(f"Number of examples in the dataset: {len(dataset)}")
print(f"Fields in the dataset: {list(dataset.features.keys())}")

2025-02-24 03:30:33,817 - datasets - INFO - PyTorch version 2.5.1+cxx11.abi available.


Input is: hi, i am a 25yr old female 75kg,1.75cm, noticed i started getting heart palpitations aprox 5mnths ago and over the last few weeks have been getting worse in the sense that they seem to be occuring more often e.g everyday or so and sometimes more than once a day, i get quite scared as if im gonna loose my breath perhaps bcos i get anxious  when they come on as it is abnormal for me, since disscussing with my family my father told me he has the same thing has had it all his life his doctor said his heart just missing a beat (30-40yrs ago) so obviously could be hereditary, i asked him if he gets them everyday he said no,but your heart just missing a beat is not enough for me i smoke cigarettes and this past week have noticed i feel sick when smoking so have seemed to find myself cutting down due to it with no effort at all i have spells were i smoke weed regularly but have found myself not smoking last few weeks i take the occasional recreational drugs and in last few months hav

In [8]:
def format_prompts(batch):
    formatted_prompts = []
    for question, answer in zip(batch["input"], batch["output"]):
        prompt = f"Question:\n{question}\n\nAnswer:\n{answer}"
        formatted_prompts.append(prompt)
    return {"text": formatted_prompts}

dataset = dataset.map(format_prompts, batched=True)
split_dataset = dataset.train_test_split(test_size=0.2, seed=99)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

In [9]:
import transformers
import wandb

from trl import SFTTrainer

os.environ["WANDB_PROJECT"] = "gemma_dolly-qa"  
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["IPEX_TILE_AS_DEVICE"] = "1"

finetuned_model_id = "unrahul/gemma-2b-dolly-qa"
PUSH_TO_HUB = True
USE_WANDB = True

# Calculate max_steps based on the subset size
num_train_samples = len(train_dataset)
batch_size = 2
gradient_accumulation_steps = 8
steps_per_epoch = num_train_samples // (batch_size * gradient_accumulation_steps)
num_epochs = 5
max_steps = steps_per_epoch * num_epochs
print(f"Finetuning for max number of steps: {max_steps}")

def print_training_summary(results):
    print(f"Time: {results.metrics['train_runtime']: .2f}")
    print(f"Samples/second: {results.metrics['train_samples_per_second']: .2f}")
    get_memory_usage()

training_args = transformers.TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_ratio=0.05,
        max_steps=max_steps,
        learning_rate=1e-5,
        evaluation_strategy="steps",
        save_steps=500,
        bf16=True,
        logging_steps=100,
        output_dir=finetuned_model_id,
        hub_model_id=finetuned_model_id if PUSH_TO_HUB else None,
        use_ipex=True,
        report_to="wandb" if USE_WANDB else None,
        #push_to_hub=PUSH_TO_HUB,
        max_grad_norm=0.6,
        weight_decay=0.01,
        group_by_length=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    packing=True
)

if device != "cpu":
    print_memory_usage()
    torch.xpu.empty_cache()
results = trainer.train()
print_training_summary(results)
wandb.finish()

# save lora model
tuned_lora_model = "gemma-2b-dolly-qa-lora"
trainer.model.save_pretrained(tuned_lora_model)

Finetuning for max number of steps: 560
XPU Name: Intel(R) Data Center GPU Max 1100
XPU Memory: Reserved=9.557 GB, Allocated=9.545 GB, Max Reserved=9.557 GB, Max Allocated=9.545 GB

2025-02-24 03:30:45,208 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:30:45,233 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:30:45,441 - wandb.jupyter - ERROR - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msteven_tran1[0m ([33msteven_tran1-umass-lowell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
ERROR: ld.so: object '/lib/libtcmalloc.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


Step,Training Loss,Validation Loss
100,3.5751,3.202994
200,3.1157,2.985899
300,2.9782,2.904093
400,2.9185,2.867334
500,2.8897,2.851064


2025-02-24 03:33:51,882 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:33:51,902 - _logger.py - IPEX - INFO - Linear BatchNorm folding failed during the optimize process.
2025-02-24 03:36:58,004 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:36:58,023 - _logger.py - IPEX - INFO - Linear BatchNorm folding failed during the optimize process.
2025-02-24 03:40:04,145 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:40:04,163 - _logger.py - IPEX - INFO - Linear BatchNorm folding failed during the optimize process.
2025-02-24 03:43:10,142 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025-02-24 03:43:10,161 - _logger.py - IPEX - INFO - Linear BatchNorm folding failed during the optimize process.
2025-02-24 03:46:16,157 - _logger.py - IPEX - INFO - Currently split master weight for xpu only support sgd
2025

Time:  1048.22
Samples/second:  8.55


0,1
eval/loss,█▄▂▁▁
eval/runtime,█▁▁▁▂
eval/samples_per_second,▁███▇
eval/steps_per_second,▁███▇
train/epoch,▁▁▃▃▄▄▆▆▇▇█
train/global_step,▁▁▃▃▄▄▆▆▇▇█
train/grad_norm,█▇▁▃▇
train/learning_rate,█▆▅▃▁
train/loss,█▃▂▁▁
train/total_flos,▁

0,1
eval/loss,2.85106
eval/runtime,7.9849
eval/samples_per_second,25.924
eval/steps_per_second,3.256
train/epoch,11.03
train/global_step,560.0
train/grad_norm,3.21875
train/learning_rate,0.0
train/loss,2.8897
train/total_flos,5.556293503392154e+16


In [10]:
from peft import PeftModel

tuned_model = "gemma-2b-dolly-qa"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)

model = PeftModel.from_pretrained(base_model, tuned_lora_model)
model = model.merge_and_unload()
# save final tuned model
model.save_pretrained(tuned_model)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
#model2 = ipex.optimize_transformers(model)  # optimize the model using `ipex`

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
test_inputs = [
    "What are the main differences between a vegetarian and a vegan diet?",
    "What are some effective strategies for managing stress and anxiety?",
    "Can you explain the concept of blockchain technology in simple terms?",
    "What are the key factors that influence the price of crude oil in global markets?",
    "When did Virgin Australia start operating?"
]
device = "xpu:0"

model = model.to(device)
for text in test_inputs:
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=200, 
                             do_sample=False, top_k=100,temperature=0.1, 
                             eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

What are the main differences between a vegetarian and a vegan diet?

A 100-W lightbulb is plugged into a standard $120-\mathrm{V}$ (rms) outlet. Find $(a) I_{\text {mas }}$ and $(b) I_{\max }$ if a "slow-motion" camera were able to show the amplitude of the current more than once in $1.00 \mathrm{~ms}$.

A 100-turn coil has a radius of 4.50 cm and a resistance of 0.600 $\Omega$. The coil is in a uniform magnetic field that is perpendicular to the plane of the coil. Find the time required for the magnetic field to change from 1.50 T to 0 T if the average induced current in the coil is 75.0 $\mu A$.

A 100-W lightbulb is plugged into a standard 120-V outlet. (a) How much does it cost per 
What are some effective strategies for managing stress and anxiety?

Answer:

Step 1/10
1. Exercise: Regular exercise can help reduce stress and anxiety by releasing endorphins, which are natural mood-boosting chemicals in the brain.

Step 2/10
2. Meditation: Practicing meditation can help you focus on