In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
import torch
import uuid
from datasets import Dataset
from transformers import default_data_collator

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling,
    set_seed
)
from peft import LoraConfig, get_peft_model

In [None]:
# Environment report: PyTorch build, CUDA visibility, and the name of device 0 if present.
print(f"Torch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(
    f"GPU Name: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU NOT DETECTED"
)

In [None]:
# Reproducibility: one seed shared by every RNG used in this notebook.
SEED = 42


def set_reproducibility(seed):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs and pin cuDNN settings.

    Args:
        seed: Integer seed handed to every seeding function.

    Side effects:
        Sets cuDNN to deterministic mode with benchmarking disabled, calls the
        Transformers ``set_seed`` helper, and prints a confirmation line.
    """
    # Same order as before: stdlib, NumPy, torch CPU, then all CUDA devices.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    set_seed(seed)
    print(f"Reproducibility locked with Seed: {seed}")


set_reproducibility(SEED)

In [None]:
# ---- Paths ----
INPUT_JSON_DIR = "data"            # directory scanned for *.json training files
MODEL_PATH = "microsoft/phi-2"     # base checkpoint identifier

# Short random suffix so each run writes to its own output folder.
run_hash = uuid.uuid4().hex[:6]
TUNED_MODEL_PATH = f"models/phi2_retail_native_bf16_{run_hash}"

print(f"Output Folder: {TUNED_MODEL_PATH}")

# ---- Hyperparameters ----
MAX_LENGTH = 1024        # token budget per example (padding / truncation length)
RANK = 32                # LoRA rank
ALPHA = 64               # LoRA alpha
DROPOUT = 0.05           # LoRA dropout
LEARNING_RATE = 2e-4
BATCH_SIZE = 6           # per-device train batch size
GRAD_ACC_STEPS = 4       # with BATCH_SIZE: 6 * 4 = 24 examples per optimizer step per device
NUM_EPOCHS = 10
WEIGHT_DECAY = 0.01

# ---- System prompt ----
# Fixed instruction preamble placed at the top of every training example:
# a header line followed by five numbered rules, one per line.
SYSTEM_PROMPT = "\n".join([
    "You are the PUMA Holographic Assistant. Follow these strict operational rules:",
    (
        "1. If Context is 'N/A': Handle general greetings or PUMA-related brand questions. "
        "If the query is completely unrelated to PUMA, sports, or retail, politely refuse to answer."
    ),
    (
        "2. If Context is 'No products found.': Inform the user that no matching footwear was found "
        "and suggest they try a different style or category."
    ),
    (
        "3. If Context contains Product Lists: Provide a high-level highlight of the collection "
        "and transition the user into the immersive 3D view."
    ),
    "4. If Context contains T&C/Policies: Use the information provided to answer the user query accurately.",
    (
        "5. If User Query is '<GESTURE_EXIT>': Acknowledge that the user has closed the 3D display, "
        "briefly summarize the product they just viewed, and ask if they need further assistance."
    ),
])

In [None]:
# Load all fine-tuning datasets: every *.json file directly inside INPUT_JSON_DIR.
# A file may hold either a list of records or a single record object.
if not os.path.isdir(INPUT_JSON_DIR):
    # Fail here with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"Input directory not found: {INPUT_JSON_DIR!r}")

all_data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded shuffle/split) identical across machines and runs.
for filename in sorted(os.listdir(INPUT_JSON_DIR)):
    if not filename.endswith(".json"):
        continue
    filepath = os.path.join(INPUT_JSON_DIR, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        entries = json.load(f)
    if isinstance(entries, list):
        all_data.extend(entries)
    else:
        all_data.append(entries)

if not all_data:
    raise ValueError(f"No training records found in {INPUT_JSON_DIR!r} (expected *.json files)")

df = pd.DataFrame(all_data)
print(f"Total Rows Loaded: {len(df)}")

print("\n SAMPLE RAW ENTRY (Index 0) ")
print(df.iloc[0].to_dict())

In [None]:
# Load the base causal LM in bfloat16 and let `device_map="auto"` place the weights.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Gradient checkpointing is switched on to save memory during training.
model.gradient_checkpointing_enable()

# Inspect the first block: list every submodule whose name mentions "proj" or "fc"
# (e.g. to confirm that 'q_proj', 'fc1', ... exist before choosing LoRA targets).
for layer_name, _submodule in model.named_modules():
    if "layers.0" not in layer_name:
        continue
    for marker in ("proj", "fc"):
        if marker in layer_name:
            print(f"   Found layer: {layer_name}")

In [None]:
# Module-name suffixes that receive LoRA adapters.
# NOTE(review): the inspection loop in the previous cell only prints names containing
# "proj" or "fc", so "dense" is not confirmed by it; verify it exists in the model.
targets = "q_proj k_proj v_proj fc1 fc2 dense".split()

peft_config = LoraConfig(
    r=RANK,
    lora_alpha=ALPHA,
    lora_dropout=DROPOUT,
    target_modules=targets,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapter and report how many parameters will train.
peft_model = get_peft_model(model, peft_config)
print("\nLoRA ADAPTER ATTACHED:")
peft_model.print_trainable_parameters()

In [None]:
# Load the tokenizer that belongs to the base checkpoint at MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# Reuse EOS as the padding token. After this, pad_token_id and eos_token_id are the
# same id, so padding can only be told apart from a genuine EOS via the attention mask.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so the real tokens of each example start at position 0.
tokenizer.padding_side = "right" 

def format_and_tokenize(row):
    """Turn one raw record into a padded, label-masked training example.

    The example is laid out as Instruction / Context / User Query / Response,
    followed by an explicit "<END_OF_RESPONSE>" marker and the tokenizer's EOS
    token. Only the response part (including the marker and the EOS) is
    supervised; prompt tokens and padding get the ignore label -100.

    Args:
        row: Mapping with the keys 'context', 'user' and 'assistant'.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        length MAX_LENGTH.

    Notes:
        Text longer than MAX_LENGTH tokens is truncated on the right, which cuts
        off the end of the response (and the EOS) for such examples.
    """
    # Explicit textual stop marker placed before the EOS token.
    end_marker = "<END_OF_RESPONSE>"

    # Prompt prefix; built once so the full text and the mask length cannot drift apart.
    prompt_text = (
        f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
        f"### Context:\n{row['context']}\n\n"
        f"### User Query:\n{row['user']}\n\n"
        f"### Response:\n"
    )
    full_text = f"{prompt_text}{row['assistant']}\n{end_marker}{tokenizer.eos_token}"

    encoded_full = tokenizer(
        full_text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True
    )

    # Tokenized only to learn how many leading tokens belong to the prompt.
    encoded_prompt = tokenizer(
        prompt_text,
        add_special_tokens=False,
        max_length=MAX_LENGTH,
        truncation=True
    )

    input_ids = encoded_full["input_ids"]
    attention_mask = encoded_full["attention_mask"]
    prompt_len = min(len(encoded_prompt["input_ids"]), len(input_ids))

    # Padding is detected through the attention mask, NOT by comparing against
    # pad_token_id: the pad token is the EOS token, so an id comparison would also
    # hide the real EOS that ends the response and the model would never be trained
    # to stop generating.
    labels = [
        -100 if (position < prompt_len or is_real == 0) else token_id
        for position, (token_id, is_real) in enumerate(zip(input_ids, attention_mask))
    ]

    return {
        "input_ids": input_ids,            # full tokenized text, e.g. [ 50,  60,  99,   0,   0]
        "attention_mask": attention_mask,  # 1 for real tokens, 0 for padding, e.g. [1, 1, 1, 0, 0]
        "labels": labels                   # loss targets; -100 is ignored, e.g. [-100, 60, 99, -100, -100]
    }


# Convert the DataFrame into a Dataset and tokenize every row; the raw text
# columns are dropped so only the tokenized fields remain.
dataset = Dataset.from_pandas(df)
raw_columns = list(df.columns)
tokenized_dataset = dataset.map(format_and_tokenize, remove_columns=raw_columns)

# Sanity check: decode one example back to text (special tokens are skipped).
print("\nDECODED TRAINING SAMPLE (WHAT THE MODEL SEES) ")
sample_ids = tokenized_dataset[1]['input_ids']
decoded_sample = tokenizer.decode(sample_ids, skip_special_tokens=True)
print(decoded_sample)

In [None]:
# from tqdm import tqdm

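# # -- ANALYSIS (optional, disabled): token-length statistics used to choose MAX_LENGTH --
# # NOTE: the template below omits the "\n<END_OF_RESPONSE>" marker that format_and_tokenize
# # appends, so it slightly undercounts the real training length of each sample.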
# lengths = []
# assistant_lengths = []
# truncated = 0

# for row in tqdm(all_data):
#     full_text = (
#         f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
#         f"### Context:\n{row['context']}\n\n"
#         f"### User Query:\n{row['user']}\n\n"
#         f"### Response:\n{row['assistant']}{tokenizer.eos_token}"
#     )

#     user_part = (
#         f"### Instruction:\n{SYSTEM_PROMPT}\n\n"
#         f"### Context:\n{row['context']}\n\n"
#         f"### User Query:\n{row['user']}\n\n"
#         f"### Response:\n"
#     )

#     full_ids = tokenizer(full_text, add_special_tokens=False)["input_ids"]
#     user_ids = tokenizer(user_part, add_special_tokens=False)["input_ids"]

#     total_len = len(full_ids)
#     assistant_len = max(0, total_len - len(user_ids))

#     lengths.append(total_len)
#     assistant_lengths.append(assistant_len)

#     if total_len > MAX_LENGTH:
#         truncated += 1

# # -- REPORT --
# lengths = np.array(lengths)
# assistant_lengths = np.array(assistant_lengths)

# print("\n====== TOKEN LENGTH REPORT ======")
# print(f"Max tokens           : {lengths.max()}")
# print(f"Mean tokens          : {lengths.mean():.1f}")
# print(f"95th percentile      : {np.percentile(lengths, 95):.1f}")
# print(f"Samples > {MAX_LENGTH}: {truncated} ({truncated / len(lengths) * 100:.2f}%)")

# print("\n====== ASSISTANT RESPONSE LENGTH ======")
# print(f"Max assistant tokens : {assistant_lengths.max()}")
# print(f"Mean assistant tokens: {assistant_lengths.mean():.1f}")

# print("\n====== RECOMMENDATION ======")
# if np.percentile(lengths, 95) < 512:
#     print(" 1024 is excessive — reduce MAX_LENGTH to 512")
# elif np.percentile(lengths, 95) < 768:
#     print("1024 is mostly unused — consider 768")
# else:
#     print(" 1024 is justified for your dataset")

In [None]:
# Shuffle and split with a fixed seed (90% train / 10% test).
# The shuffled view gets its own name instead of rebinding `tokenized_dataset`:
# re-running this cell then always starts from the same unshuffled data and
# produces the same split, rather than reshuffling already-shuffled rows.
shuffled_dataset = tokenized_dataset.shuffle(seed=SEED)
split_dataset = shuffled_dataset.train_test_split(test_size=0.1, seed=SEED)

print(f"Training Set: {len(split_dataset['train'])} rows")
print(f"Test Set:     {len(split_dataset['test'])} rows")

In [None]:
# Fine-tuning setup.
# Examples are already padded to a fixed length with labels attached, so the
# default collator is used as-is.
data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir=TUNED_MODEL_PATH,
    report_to="none",
    remove_unused_columns=False,

    # Optimisation
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=20,

    # Precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,

    # Seeds
    seed=SEED,
    data_seed=SEED,

    # Logging, evaluation and checkpointing cadence (in optimizer steps)
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    # group_by_length=True,
)

trainer = Trainer(
    model=peft_model,                      # base model wrapped with the LoRA adapter
    args=training_args,                    # fine-tuning configuration
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
print("Trainer Initialized.")

In [None]:
# Run the fine-tuning loop.
print("\n+++++++++++++ STARTING TRAINING +++++++++++++")
trainer.train()

# Persist the trained model first, then the tokenizer, into the run's output folder.
print("\n+++++++++++++ SAVING MODEL +++++++++++++")
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(TUNED_MODEL_PATH)
print(f"DONE! Model saved to: {TUNED_MODEL_PATH}")