In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
import torch
import uuid
from datasets import Dataset
from transformers import default_data_collator

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling,
    set_seed
)
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#  Check Versions 
print(f"Torch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU NOT DETECTED")

Torch Version: 2.4.1+cu121
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4080 SUPER


In [3]:
#  REPRODUCIBILITY SETUP 
SEED = 42

def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    set_seed(seed)
    print(f"Reproducibility locked with Seed: {seed}")

set_reproducibility(SEED)

Reproducibility locked with Seed: 42


In [4]:
#  PATHS 
TARGET_FILE = "data/puma_clean_dataset_smart_v2.json"          
MODEL_PATH = "microsoft/phi-2"   

# Unique run hash
run_hash = str(uuid.uuid4())[:6]
TUNED_MODEL_PATH = f"models/phi2_retail_native_bf16_{run_hash}"

print(f"Output Folder: {TUNED_MODEL_PATH}")

#  HYPERPARAMETERS 
MAX_LENGTH = 1024       
RANK = 32               
ALPHA = 64             
DROPOUT = 0.05
LEARNING_RATE = 2e-4
BATCH_SIZE = 6          
GRAD_ACC_STEPS = 4      
NUM_EPOCHS = 10         
WEIGHT_DECAY = 0.01

#  THE ANCHOR (Updated with ASIN Rule) 
# SYSTEM_PROMPT = (
#     "You are the PUMA Holographic Assistant. Follow these strict operational rules:\n"
#     "1. If Context is 'N/A': Handle general greetings or PUMA-related brand questions. "
#     "If the query is completely unrelated to PUMA, sports, or retail, politely refuse to answer.\n"
#     "2. If Context is 'No products found.': Inform the user that no matching footwear was found "
#     "and suggest they try a different style or category.\n"
#     "3. If Context contains Product Lists: Provide a high-level highlight of the collection "
#     "and transition the user into the immersive 3D view.\n"
#     "4. If Context contains T&C/Policies: Use the information provided to answer the user query accurately.\n"
#     "5. If User Query is '<GESTURE_EXIT>': Acknowledge that the user has closed the 3D display, "
#     "briefly summarize the product they just viewed, and ask if they need further assistance."
# )

## version 2 (so OOD dont get easily triggered)
SYSTEM_PROMPT = (
    "You are the PUMA Holographic Assistant, an intelligent 3D AI retail guide. "
    "Follow these strict operational rules:\n"
    "1. If Context is 'N/A' and the user greets you, says goodbye, or asks who you are: "
    "Respond enthusiastically in character as the PUMA Holographic AI Assistant and pivot to exploring PUMA gear.\n"
    "2. If Context is 'N/A' and the query is completely unrelated to PUMA, sports, or retail: "
    "Politely refuse to answer, stay in character, and pivot back to PUMA footwear or gear.\n"
    "3. If Context is 'No products found.': Inform the user that no matching footwear was found "
    "and suggest they try a different style or category.\n"
    "4. If Context contains Product Lists: Provide a high-level highlight of the collection "
    "and transition the user into the immersive 3D view.\n"
    "5. If Context contains T&C/Policies: Use the information provided to answer the user query accurately.\n"
    "6. If User Query is '<GESTURE_EXIT>': Acknowledge that the user has closed the 3D display, "
    "briefly summarize the product they just viewed, and ask if they need further assistance."
)

Output Folder: models/phi2_retail_native_bf16_38f4a5


In [5]:
# Load all fine tuning datasets
all_data = []

with open(TARGET_FILE, "r", encoding="utf-8") as f:
    entries = json.load(f)

# Normalize into list
if isinstance(entries, list):
    all_data = entries
else:
    all_data = [entries]

df = pd.DataFrame(all_data)

print(f"Total Rows Loaded: {len(df)}")
print("\nSAMPLE RAW ENTRY (Index 0)")
print(df.iloc[0].to_dict())

Total Rows Loaded: 1649

SAMPLE RAW ENTRY (Index 0)
{'context': 'For your convenience, we accept payment by credit and debit cards, Apple Pay, PayPal, Afterpay, Google Pay and PUMA Gift Cards. Paying by Credit/Debit Card We accept the following cards: Visa Mastercard American Express Discover For your security, we will contact your card issuer to request pre-authorization for your order amount. Your card will not be charged until after your order has shipped. Paying with Afterpay Shop now and pay later with Afterpay, now available on us. puma.', 'user': 'What payment options can I use at PUMA, and how does the creditcard preauthorization work?', 'assistant': "We're thrilled you asked  you can zip through checkout with Visa, Mastercard, American Express, Discover, Apple Pay, Google Pay, PayPal, Afterpay or a PUMA Gift Card! When you choose a credit or debit card, we reach out to your issuer for a quick preauthorization, but don't worry, your card isn't charged until your order ships. Ha

In [6]:
# Load phi 2 model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, 
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,  
    device_map="auto"
)

# Enable memory saving
model.gradient_checkpointing_enable()


# check first block to see if 'q_proj', 'fc1' etc exist
for name, module in model.named_modules():
    if "layers.0" in name and "proj" in name: 
        print(f"   Found layer: {name}")
    if "layers.0" in name and "fc" in name:
        print(f"   Found layer: {name}")

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]


   Found layer: model.layers.0.self_attn.q_proj
   Found layer: model.layers.0.self_attn.k_proj
   Found layer: model.layers.0.self_attn.v_proj
   Found layer: model.layers.0.mlp.fc1
   Found layer: model.layers.0.mlp.fc2


In [7]:
# Based on inspection, these are the standard Phi-2 modules
targets = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "dense"]


peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=RANK,
    lora_alpha=ALPHA,
    target_modules=targets,
    lora_dropout=DROPOUT,
    bias="none"
)

peft_model = get_peft_model(model, peft_config)
print("\nLoRA ADAPTER ATTACHED:")
peft_model.print_trainable_parameters()


LoRA ADAPTER ATTACHED:
trainable params: 47,185,920 || all params: 2,826,869,760 || trainable%: 1.6692


In [8]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

def format_and_tokenize(row):

    raw_context = row['context']
    user_query = row['user']
    assistant_response = row['assistant']

    # Explicit stop marker
    END_TOKEN = "<END_OF_RESPONSE>"

    full_text = (
        f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
        f"### Context:\n{raw_context}\n\n"
        f"### User Query:\n{user_query}\n\n"
        f"### Response:\n"
        f"{assistant_response}\n{END_TOKEN}{tokenizer.eos_token}"
    )

    user_part = (
        f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
        f"### Context:\n{raw_context}\n\n"
        f"### User Query:\n{user_query}\n\n"
        f"### Response:\n"
    )

    tokenized_full = tokenizer(
        full_text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True
    )

    tokenized_user = tokenizer(
        user_part,
        add_special_tokens=False,
        truncation=True
    )

    input_ids = tokenized_full["input_ids"]
    labels = list(input_ids)

    # Mask everything before assistant response
    user_len = len(tokenized_user["input_ids"])
    for i in range(min(user_len, len(labels))):
        labels[i] = -100

    # Mask padding
    for i, token_id in enumerate(input_ids):
        if token_id == tokenizer.pad_token_id:
            labels[i] = -100

    return {
        "input_ids": input_ids, ## this is the full text in tokenized format ([ 50,  60,  99,   0,   0,   0,   0,   0 ])
        "attention_mask": tokenized_full["attention_mask"], ## this is the mask we will be mask ([  1,   1,   1,   0,   0,   0,   0,   0 ])
        "labels": labels ## this is the labels where grading will happened ([-100,  60,  99,-100,-100,-100,-100,-100 ])
    }


# Apply formatting
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(format_and_tokenize, remove_columns=list(df.columns))

# 
print("\nDECODED TRAINING SAMPLE (WHAT THE MODEL SEES) ")
decoded_sample = tokenizer.decode(tokenized_dataset[1]['input_ids'], skip_special_tokens=True)
print(decoded_sample)

Map: 100%|██████████| 1649/1649 [00:05<00:00, 300.37 examples/s]


DECODED TRAINING SAMPLE (WHAT THE MODEL SEES) 
### Instruction:
You are the PUMA Holographic Assistant, an intelligent 3D AI retail guide. Follow these strict operational rules:
1. If Context is 'N/A' and the user greets you, says goodbye, or asks who you are: Respond enthusiastically in character as the PUMA Holographic AI Assistant and pivot to exploring PUMA gear.
2. If Context is 'N/A' and the query is completely unrelated to PUMA, sports, or retail: Politely refuse to answer, stay in character, and pivot back to PUMA footwear or gear.
3. If Context is 'No products found.': Inform the user that no matching footwear was found and suggest they try a different style or category.
4. If Context contains Product Lists: Provide a high-level highlight of the collection and transition the user into the immersive 3D view.
5. If Context contains T&C/Policies: Use the information provided to answer the user query accurately.
6. If User Query is '<GESTURE_EXIT>': Acknowledge that the user has 




In [None]:
# from tqdm import tqdm

# # -- ANALYSIS --
# lengths = []
# assistant_lengths = []
# truncated = 0

# for row in tqdm(all_data):
#     full_text = (
#         f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
#         f"### Context:\n{row['context']}\n\n"
#         f"### User Query:\n{row['user']}\n\n"
#         f"### Response:\n{row['assistant']}{tokenizer.eos_token}"
#     )

#     user_part = (
#         f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
#         f"### Context:\n{row['context']}\n\n"
#         f"### User Query:\n{row['user']}\n\n"
#         f"### Response:\n"
#     )

#     full_ids = tokenizer(full_text, add_special_tokens=False)["input_ids"]
#     user_ids = tokenizer(user_part, add_special_tokens=False)["input_ids"]

#     total_len = len(full_ids)
#     assistant_len = max(0, total_len - len(user_ids))

#     lengths.append(total_len)
#     assistant_lengths.append(assistant_len)

#     if total_len > MAX_LENGTH:
#         truncated += 1

# # -- REPORT --
# lengths = np.array(lengths)
# assistant_lengths = np.array(assistant_lengths)

# print("\n====== TOKEN LENGTH REPORT ======")
# print(f"Max tokens           : {lengths.max()}")
# print(f"Mean tokens          : {lengths.mean():.1f}")
# print(f"95th percentile      : {np.percentile(lengths, 95):.1f}")
# print(f"Samples > {MAX_LENGTH}: {truncated} ({truncated / len(lengths) * 100:.2f}%)")

# print("\n====== ASSISTANT RESPONSE LENGTH ======")
# print(f"Max assistant tokens : {assistant_lengths.max()}")
# print(f"Mean assistant tokens: {assistant_lengths.mean():.1f}")

# print("\n====== RECOMMENDATION ======")
# if np.percentile(lengths, 95) < 512:
#     print(" 1024 is excessive — reduce MAX_LENGTH to 512")
# elif np.percentile(lengths, 95) < 768:
#     print("1024 is mostly unused — consider 768")
# else:
#     print(" 1024 is justified for your dataset")

In [9]:
# Shuffle and split with fixed seed
tokenized_dataset = tokenized_dataset.shuffle(seed=SEED)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=SEED)

print(f"Training Set: {len(split_dataset['train'])} rows")
print(f"Test Set:     {len(split_dataset['test'])} rows")

Training Set: 1484 rows
Test Set:     165 rows


In [10]:
# Fine tuning setup
data_collator = default_data_collator
training_args = TrainingArguments(
    output_dir=TUNED_MODEL_PATH,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_steps=5,
    seed=SEED,                  
    data_seed=SEED,             
    
    save_strategy="steps",
    save_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    
    fp16=False,
    bf16=True,             
    
    warmup_steps=20,
    save_total_limit=2,
    # group_by_length=True,
    report_to="none",
    remove_unused_columns=False
)

trainer = Trainer(
    model=peft_model, ## LoRA adapter config.
    args=training_args, ## Fine Tuning config.
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)
print("Trainer Initialized.")

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Trainer Initialized.


In [11]:
print("\n+++++++++++++ STARTING TRAINING +++++++++++++")
trainer.train()

print("\n+++++++++++++ SAVING MODEL +++++++++++++")
trainer.model.save_pretrained(TUNED_MODEL_PATH)
tokenizer.save_pretrained(TUNED_MODEL_PATH)
print(f"DONE! Model saved to: {TUNED_MODEL_PATH}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



+++++++++++++ STARTING TRAINING +++++++++++++


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
50,1.5166,1.433009
100,1.3462,1.243407
150,1.1952,1.15826
200,1.0854,1.135347
250,1.0814,1.117189
300,1.0194,1.090308
350,0.9627,1.081613
400,0.9132,1.071211
450,0.9179,1.063907
500,0.9019,1.057081


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast


+++++++++++++ SAVING MODEL +++++++++++++
DONE! Model saved to: models/phi2_retail_native_bf16_38f4a5
