# Phase 0: Environment Setup & Verification

## 0.2: Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Create project directory in Drive
import os
project_path = '/content/drive/MyDrive/NLP_Project'
os.makedirs(project_path, exist_ok=True)
os.makedirs(f'{project_path}/data', exist_ok=True)
os.makedirs(f'{project_path}/models', exist_ok=True)
os.makedirs(f'{project_path}/results', exist_ok=True)
os.makedirs(f'{project_path}/checkpoints', exist_ok=True)

print(f"✓ Project directory created at: {project_path}")
print(f"\nDirectory structure:")
!ls -la /content/drive/MyDrive/NLP_Project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Project directory created at: /content/drive/MyDrive/NLP_Project

Directory structure:
total 16
drwx------ 2 root root 4096 Dec 10 00:42 checkpoints
drwx------ 2 root root 4096 Nov 19 01:28 data
drwx------ 2 root root 4096 Dec 10 00:13 models
drwx------ 2 root root 4096 Nov 19 01:28 results


## 0.3: Install Required Libraries

In [4]:
print("Installing required packages...")
!pip install -q peft accelerate bitsandbytes
!pip install -q sentence-transformers faiss-cpu
!pip install -q rouge-score bert-score
!pip install -q datasets
!pip install -U bitsandbytes accelerate

print("\n" + "="*50)
print("VERIFYING INSTALLATIONS")
print("="*50)

# Verify installations
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import peft
import sentence_transformers
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

print("✓ All core libraries imported successfully!")
print(f"\nLibrary versions:")
print(f"  PyTorch: {torch.__version__}")
print(f"  Transformers: {transformers.__version__}")
print(f"  PEFT: {peft.__version__}")
print(f"  Sentence Transformers: {sentence_transformers.__version__}")

Installing required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone

VERIFYING INSTALLATIONS
✓ All core libraries imported successfully!

Library versions:
  PyTorch: 2.9.0+cu126
  Transformers: 4.57.3
  PEFT: 0.18.0
  Sentence Transformers: 5.1.2


# LLM Fine-Tuning Pilot Phi-2

### LoRA Setup & Small-Scale Training

In [None]:
print("Loading MASTER TRAIN dataset...")
df_train = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/train_dataset.csv')

# Filter for Indeterministic (LLM) rows from TRAIN only
df_indet_train = df_train[df_train['label'] == 1].reset_index(drop=True)

# Sample pilot data from this safe training set
df_pilot = df_indet_train.sample(n=100, random_state=42)

print("\n3. Loading Phi-2 model (full precision for A100)...")
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16  # Half precision is fine
)

print(f"   ✓ Model loaded")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Prepare model for LoRA training
print("\n4. Preparing model for LoRA training...")
# Skip prepare_model_for_kbit_training since we're not using quantization

from peft import LoraConfig, get_peft_model

# LoRA configuration
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Phi-2 attention modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print(f"   ✓ LoRA configured")
print(f"   GPU memory after LoRA: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

from sklearn.model_selection import train_test_split

# 1. Split the 100 pilot examples into Train (90) and Val (10)
df_train_pilot, df_val_pilot = train_test_split(df_pilot, test_size=0.1, random_state=42, stratify=df_pilot['category'])

# 2. Define the formatting function (Instruction -> Response)
def format_prompt(row):
    return f"Customer: {row['instruction']}\nAssistant: {row['response']}"

# 3. Create the missing text variables
print("Formatting prompts...")
train_texts = df_train_pilot.apply(format_prompt, axis=1).tolist()
val_texts = df_val_pilot.apply(format_prompt, axis=1).tolist()

print(f"Created {len(train_texts)} training prompts and {len(val_texts)} validation prompts.")

Loading MASTER TRAIN dataset...

3. Loading Phi-2 model (full precision for A100)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Model loaded
   GPU memory: 11.18 GB

4. Preparing model for LoRA training...
trainable params: 2,621,440 || all params: 2,782,305,280 || trainable%: 0.0942
   ✓ LoRA configured
   GPU memory after LoRA: 11.19 GB
Formatting prompts...
Created 90 training prompts and 10 validation prompts.


### Tokenization & Dataset Preparation

In [None]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

print("\n5. Tokenizing datasets...")

def tokenize_function(texts):
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding=False
    )

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create HuggingFace datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask']
})

from datasets import Dataset

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask']
})

print(f"   ✓ Train dataset: {len(train_dataset):,} examples")
print(f"   ✓ Val dataset: {len(val_dataset):,} examples")

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal LM, not masked LM
)

print("   ✓ Data collator ready")


5. Tokenizing datasets...
   ✓ Train dataset: 90 examples
   ✓ Val dataset: 10 examples
   ✓ Data collator ready


### Training

In [None]:
from transformers import Trainer, TrainingArguments

print("\n6. Setting up training arguments...")

output_dir = '/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2*4 = 8
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    warmup_steps=50,
)

print("   ✓ Training arguments configured")
print(f"     - Epochs: {training_args.num_train_epochs}")
print(f"     - Batch size: {training_args.per_device_train_batch_size}")
print(f"     - Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"     - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"     - Learning rate: {training_args.learning_rate}")

# Initialize trainer
print("\n7. Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("   ✓ Trainer initialized")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Train!
print("\n8. Starting training...")
print("=" * 60)

import time
start_time = time.time()

trainer.train()

elapsed_time = time.time() - start_time
print("\n" + "="*60)
print(f"✓ Training complete in {elapsed_time/60:.1f} minutes")
print("="*60)

# Save final model
print("\n9. Saving model...")
model.save_pretrained(f"{output_dir}/final_model")
tokenizer.save_pretrained(f"{output_dir}/final_model")
print(f"   ✓ Model saved to {output_dir}/final_model")

The model is already on multiple devices. Skipping the move to device specified in `args`.



6. Setting up training arguments...
   ✓ Training arguments configured
     - Epochs: 3
     - Batch size: 2
     - Gradient accumulation: 4
     - Effective batch size: 8
     - Learning rate: 0.0002

7. Initializing Trainer...
   ✓ Trainer initialized
   GPU memory: 11.19 GB

8. Starting training...


Step,Training Loss,Validation Loss



✓ Training complete in 0.4 minutes

9. Saving model...
   ✓ Model saved to /content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model


### Test Generation Quality

In [None]:
print("="*60)
print("TESTING FINE-TUNED MODEL GENERATION")
print("="*60)

# Load the fine-tuned model
print("\n1. Loading fine-tuned model...")
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

finetuned_model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)

tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)

print("   ✓ Fine-tuned model loaded")

# Test on validation examples
print("\n2. Generating responses for validation examples...")

test_queries = df_val_pilot['instruction'].head(5).tolist()
true_responses = df_val_pilot['response'].head(5).tolist()

for i, (query, true_response) in enumerate(zip(test_queries, true_responses), 1):
    print(f"\n{'='*60}")
    print(f"TEST EXAMPLE {i}")
    print('='*60)
    print(f"Customer Query: {query}")
    print(f"\nTrue Response:\n{true_response[:300]}...")

    # Generate response
    prompt = f"Customer: {query}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)

    with torch.no_grad():
        outputs = finetuned_model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract just the assistant response
    assistant_response = generated_text.split("Assistant:")[-1].strip()

    print(f"\nGenerated Response:\n{assistant_response}")
    print("-"*60)

TESTING FINE-TUNED MODEL GENERATION

1. Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Fine-tuned model loaded

2. Generating responses for validation examples...

TEST EXAMPLE 1
Customer Query: is there an e-mail to send my feedbakc for ur products

True Response:
We appreciate your interest in providing feedback for our products! To ensure that your feedback reaches the right team, we have a dedicated email address specifically for this purpose. Please send your feedback to [productfeedback@company.com]. Our team will carefully review and consider your input...

Generated Response:
Yes, you can send an e-mail to our customer support team to request a backup of your product feed. To do so, please follow these steps:
1. Visit our website and navigate to the customer support section.
2. Look for a contact form or email address provided on the page.
3. Fill out the contact form with your name, email address, and a brief message explaining the purpose of your request.
4. Review the information you provided to ensure accuracy.
5. Click the submit button or send the e-ma

# Sparse Query Testing

In [7]:
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
import faiss
from sentence_transformers import SentenceTransformer
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("="*70)
print("STRESS TEST: Sparse Categories with Novel Wordings")
print("="*70)

# Load the 100-example pilot training data used for fine-tuning
print("\nLoading pilot training data...")
df_train = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/train_dataset.csv')
df_indet_train = df_train[df_train['label'] == 1].reset_index(drop=True)
df_pilot = df_indet_train.sample(n=100, random_state=42)

from sklearn.model_selection import train_test_split
df_train_pilot, _ = train_test_split(df_pilot, test_size=0.1, random_state=42, stratify=df_pilot['category'])

# Analyze category distribution
print("\nCategory distribution in 100-example pilot training set:")
category_counts = df_train_pilot['category'].value_counts().sort_values()
print(category_counts)

# Identify sparse categories (bottom 2-3)
sparse_categories = category_counts.head(2).index.tolist()
print(f"\nIdentified SPARSE categories for testing:")
for cat in sparse_categories:
    count = category_counts[cat]
    print(f"  {cat}: {count} examples")
    print(f"  Sample queries:")
    for q in df_train_pilot[df_train_pilot['category'] == cat]['instruction'].head(2):
        print(f"    - {q}")

STRESS TEST: Sparse Categories with Novel Wordings

Loading pilot training data...

Category distribution in 100-example pilot training set:
category
FEEDBACK    19
ORDER       29
ACCOUNT     42
Name: count, dtype: int64

Identified SPARSE categories for testing:
  FEEDBACK: 19 examples
  Sample queries:
    - i dont know what to do to write a review for a service
    - how can I make a customer complaint against your company?
  ORDER: 29 examples
  Sample queries:
    - how to swap an article of purchase {{Order Number}}?
    - how do i shop an item


In [None]:
novel_queries = [
    # FEEDBACK category - 7 novel wordings (19 training examples)
    {
        'query': "I'd like to share my thoughts about the product quality and service I received",
        'category': 'FEEDBACK',
        'reference': "Thank you for your willingness to share your thoughts about the product quality and service you received. We greatly value your feedback as it helps us improve our offerings and customer experience. To submit your review, you can visit our website and navigate to the product page, where you'll find a review section. Alternatively, you can reply to your order confirmation email with your feedback. We appreciate you taking the time to share your experience!"
    },
    {
        'query': "Where can I write about my experience with your company?",
        'category': 'FEEDBACK',
        'reference': "We're thrilled that you want to share your experience with our company! Customer feedback is incredibly valuable to us. You can write about your experience in several ways: on our website's review section, by replying to your order confirmation email, through our social media pages, or on third-party review platforms. Which method would be most convenient for you? We look forward to hearing about your experience!"
    },
    {
        'query': "The shipping was fast but the product didn't meet my expectations - how do I let you know?",
        'category': 'FEEDBACK',
        'reference': "Thank you for sharing this feedback with us, and I apologize that the product didn't meet your expectations despite our fast shipping. Your honest feedback is important to us. To formally submit your review, you can visit the product page on our website and leave a detailed review there. This helps both us and other customers. Additionally, I'd like to understand more about what didn't meet your expectations so we can address this concern properly."
    },
    {
        'query': "I want to give you guys a 5-star rating, what's the best way to do that?",
        'category': 'FEEDBACK',
        'reference': "We're absolutely delighted to hear you want to give us a 5-star rating! Your positive feedback means the world to us. The best ways to leave your rating are: on our website's product or service page, through the review link in your order confirmation email, or on Google Reviews and other platforms. Your 5-star rating will help other customers discover our services. Thank you so much for taking the time to share your positive experience!"
    },
    {
        'query': "Can I post feedback about both the product and customer service in one place?",
        'category': 'FEEDBACK',
        'reference': "Absolutely! We encourage comprehensive feedback that covers both product and customer service experience. You can post combined feedback on our website's review section, where you can rate and comment on multiple aspects of your experience. This holistic feedback is incredibly valuable as it gives us a complete picture of your customer journey. Feel free to be as detailed as you'd like - we read every review carefully!"
    },
    {
        'query': "I had a mixed experience and want to tell you what went well and what didn't",
        'category': 'FEEDBACK',
        'reference': "We truly appreciate your honesty and willingness to share both the positive and negative aspects of your experience. Balanced feedback like yours is incredibly valuable as it helps us understand what we're doing right and where we need to improve. Please feel free to provide your detailed review on our website's feedback section or reply to your order confirmation email. We take all feedback seriously and use it to continuously enhance our products and services."
    },
    {
        'query': "Is there a form where I can rate my recent purchase?",
        'category': 'FEEDBACK',
        'reference': "Yes, there certainly is! We'd love to hear your thoughts on your recent purchase. You can find our rating and review form on the product page of our website. Simply navigate to the item you purchased, scroll to the review section, and you'll see the rating form. Alternatively, check your order confirmation email - it should contain a direct link to submit your rating. Your feedback helps both us and other customers make better decisions."
    },

    # ORDER category - 7 novel wordings (29 training examples)
    {
        'query': "I need to change the quantity of items in my order before it ships",
        'category': 'ORDER',
        'reference': "I understand you'd like to modify the quantity of items in your order before it ships. I'm here to help with that. To change your order quantity, please provide me with your order number and let me know which items you'd like to adjust and the new quantities you need. I'll check if your order is still in processing status - if it hasn't shipped yet, we should be able to make the changes. If it's already shipped, we can discuss options like refusing delivery or setting up a return for the excess items."
    },
    {
        'query': "Can I swap one product for another in my existing order?",
        'category': 'ORDER',
        'reference': "I'd be happy to help you swap a product in your existing order. To assist you with this, I'll need your order number and details about which product you'd like to swap and what you'd like to replace it with. If your order hasn't shipped yet, we can often make this change directly. However, if it's already shipped, we'll need to process this as a return and new order. Let me check the status of your order - could you provide the order number?"
    },
    {
        'query': "My order is processing but I realized I need it delivered to a different address",
        'category': 'ORDER',
        'reference': "I understand you need to change the delivery address for your order that's currently processing. Time is of the essence here. Please provide your order number immediately so I can check if we can still update the shipping address before it leaves our warehouse. If the order hasn't been dispatched yet, we should be able to change the address. If it's already shipped, we may be able to reroute it or arrange for a hold at a carrier facility. What's your order number?"
    },
    {
        'query': "I accidentally ordered the wrong size, can I fix this before delivery?",
        'category': 'ORDER',
        'reference': "I completely understand - ordering the wrong size is a common concern and we're here to help! To assist you, I'll need your order number to check the status. If your order is still processing and hasn't shipped, we can usually cancel it and place a new order with the correct size. If it's already shipped, don't worry - you can return the wrong size and we'll send you the correct size. What's your order number and what size do you actually need?"
    },
    {
        'query': "Is it possible to split my order into two separate shipments?",
        'category': 'ORDER',
        'reference': "I appreciate you reaching out about splitting your order into separate shipments. This request depends on the current status of your order. If it's still being processed, we may be able to split it, though this might affect shipping costs. If you have specific items you need urgently while others can wait, please provide your order number and specify which items you'd like in each shipment. I'll check what's possible and provide you with options, including any additional shipping fees that may apply."
    },
    {
        'query': "I want to add more items to an order I placed yesterday",
        'category': 'ORDER',
        'reference': "I understand you'd like to add more items to your order from yesterday. Unfortunately, once an order is placed, we typically cannot add items to it directly as our system begins processing immediately. However, I have a couple of solutions: 1) You can place a new separate order for the additional items, or 2) If your original order hasn't shipped yet, we might be able to cancel it and help you place a new combined order. Could you provide your order number so I can check the status and see which option would work best for you?"
    },
    {
        'query': "My order shows delivered but I never received it, what should I do?",
        'category': 'ORDER',
        'reference': "I'm very sorry to hear your order shows as delivered but you haven't received it. This is certainly concerning and we'll help resolve this immediately. First, please check: 1) All possible delivery locations (front door, back door, mailbox, building lobby), 2) With family members or neighbors who might have accepted it, 3) Any safe place instructions you provided. If you still can't locate it, please provide your order number and we'll: investigate with the carrier, file a claim if necessary, and either refund you or send a replacement. Your satisfaction is our priority."
    }
]

# Load models
print("\nLoading models...")

# Load retrieval system
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
train_embeddings = embedding_model.encode(
    df_train_pilot['instruction'].tolist(),
    show_progress_bar=False,
    convert_to_numpy=True
)
index = faiss.IndexFlatL2(train_embeddings.shape[1])
index.add(train_embeddings.astype('float32'))

# Load fine-tuned LLM
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
llm_model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)
llm_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)
print("✓ Models loaded")

# Test novel queries
rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

retrieval_scores = []
llm_scores = []
retrieval_distances = []

print("\n" + "="*70)
print("TESTING NOVEL QUERIES")
print("="*70)

for i, item in enumerate(novel_queries, 1):
    query = item['query']
    reference = item['reference']
    category = item['category']

    print(f"\n{'='*70}")
    print(f"Query {i}/{len(novel_queries)} - Category: {category}")
    print(f"Novel Query: {query}")
    print("-"*70)

    # RETRIEVAL
    query_emb = embedding_model.encode([query], convert_to_numpy=True)
    dist, ind = index.search(query_emb.astype('float32'), 1)
    retrieved_resp = df_train_pilot.iloc[ind[0][0]]['response']
    retrieved_query = df_train_pilot.iloc[ind[0][0]]['instruction']
    retrieved_category = df_train_pilot.iloc[ind[0][0]]['category']

    ret_score = rouge_scorer_obj.score(reference, retrieved_resp)['rougeL'].fmeasure
    retrieval_distances.append(dist[0][0])

    print(f"\nRetrieval:")
    print(f"  Matched to: '{retrieved_query[:70]}...'")
    print(f"  Matched category: {retrieved_category} {'✓' if retrieved_category == category else '✗ WRONG!'}")
    print(f"  Distance: {dist[0][0]:.3f} {'(VERY FAR!)' if dist[0][0] > 1.0 else '(far)' if dist[0][0] > 0.5 else '(close)'}")
    print(f"  ROUGE-L: {ret_score:.3f}")
    print(f"  Response: {retrieved_resp[:120]}...")

    # LLM GENERATION
    prompt = f"Customer: {query}\nAssistant:"
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=llm_tokenizer.eos_token_id
        )
    llm_resp = llm_tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1].strip()
    llm_score = rouge_scorer_obj.score(reference, llm_resp)['rougeL'].fmeasure

    print(f"\nLLM:")
    print(f"  ROUGE-L: {llm_score:.3f}")
    print(f"  Response: {llm_resp[:120]}...")

    # Determine winner
    winner = "🔥 LLM" if llm_score > ret_score else "📚 Retrieval" if ret_score > llm_score else "🤝 Tie"
    print(f"\n{winner} (Δ {abs(llm_score - ret_score):.3f})")

    retrieval_scores.append(ret_score)
    llm_scores.append(llm_score)

# Final results
print("\n" + "="*70)
print("SPARSE CATEGORY STRESS TEST RESULTS")
print("="*70)

print(f"\nRetrieval (sparse training data): {np.mean(retrieval_scores):.4f} ROUGE-L")
print(f"LLM (trained on same data):       {np.mean(llm_scores):.4f} ROUGE-L")

print(f"\nAverage retrieval distance: {np.mean(retrieval_distances):.3f}")
print(f"  (Higher = worse matches, <0.5 is good, >1.0 is bad)")

diff = np.mean(llm_scores) - np.mean(retrieval_scores)
print(f"\nDifference: {diff:+.4f}")

llm_wins = sum([1 for l, r in zip(llm_scores, retrieval_scores) if l > r])
ret_wins = sum([1 for l, r in zip(llm_scores, retrieval_scores) if r > l])
ties = len(llm_scores) - llm_wins - ret_wins

print(f"\nHead-to-head: LLM {llm_wins}/{len(novel_queries)}, Retrieval {ret_wins}/{len(novel_queries)}, Ties {ties}/{len(novel_queries)}")

print("\n" + "="*70)
print("KEY INSIGHT")
print("="*70)

if diff > 0.05 and llm_wins > ret_wins:
    print("✅ LLM DOMINATES on novel queries with sparse training data!")
    print(f"   → With only {category_counts.min()}-{category_counts.iloc[1]} examples per sparse category, retrieval struggles")
    print("   → LLM learned patterns and can generalize to novel wordings")
    print("   → THIS IS THE VALUE OF FINE-TUNING!")
elif diff > 0:
    print("⚠️  LLM slightly better on sparse data")
    print("   → Results suggest LLM has some generalization advantage")
else:
    print("📚 Retrieval holds up even on sparse data")
    print("   → Semantic embeddings capture similarity well")


Loading models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✓ Models loaded

TESTING NOVEL QUERIES

Query 1/14 - Category: FEEDBACK
Novel Query: I'd like to share my thoughts about the product quality and service I received
----------------------------------------------------------------------

Retrieval:
  Matched to: 'I want to send feedback for your company, help me...'
  Matched category: FEEDBACK ✓
  Distance: 0.871 (far)
  ROUGE-L: 0.423
  Response: I'm on it! Your willingness to share feedback is truly appreciated, as it helps us understand your experience better and...

LLM:
  ROUGE-L: 0.374
  Response: Thank you for reaching out and sharing your thoughts about the product quality and service you received. Your feedback i...

📚 Retrieval (Δ 0.049)

Query 2/14 - Category: FEEDBACK
Novel Query: Where can I write about my experience with your company?
----------------------------------------------------------------------

Retrieval:
  Matched to: 'I want to send feedback for your company, help me...'
  Matched category: FEEDBACK ✓
  Distan

In [5]:
novel_queries = [
    # FEEDBACK category - 7 novel wordings (19 training examples)
    {
        'query': "I'd like to share my thoughts about the product quality and service I received",
        'category': 'FEEDBACK',
        'reference': "Thank you for your willingness to share your thoughts about the product quality and service you received. We greatly value your feedback as it helps us improve our offerings and customer experience. To submit your review, you can visit our website and navigate to the product page, where you'll find a review section. Alternatively, you can reply to your order confirmation email with your feedback. We appreciate you taking the time to share your experience!"
    },
    {
        'query': "Where can I write about my experience with your company?",
        'category': 'FEEDBACK',
        'reference': "We're thrilled that you want to share your experience with our company! Customer feedback is incredibly valuable to us. You can write about your experience in several ways: on our website's review section, by replying to your order confirmation email, through our social media pages, or on third-party review platforms. Which method would be most convenient for you? We look forward to hearing about your experience!"
    },
    {
        'query': "The shipping was fast but the product didn't meet my expectations - how do I let you know?",
        'category': 'FEEDBACK',
        'reference': "Thank you for sharing this feedback with us, and I apologize that the product didn't meet your expectations despite our fast shipping. Your honest feedback is important to us. To formally submit your review, you can visit the product page on our website and leave a detailed review there. This helps both us and other customers. Additionally, I'd like to understand more about what didn't meet your expectations so we can address this concern properly."
    },
    {
        'query': "I want to give you guys a 5-star rating, what's the best way to do that?",
        'category': 'FEEDBACK',
        'reference': "We're absolutely delighted to hear you want to give us a 5-star rating! Your positive feedback means the world to us. The best ways to leave your rating are: on our website's product or service page, through the review link in your order confirmation email, or on Google Reviews and other platforms. Your 5-star rating will help other customers discover our services. Thank you so much for taking the time to share your positive experience!"
    },
    {
        'query': "Can I post feedback about both the product and customer service in one place?",
        'category': 'FEEDBACK',
        'reference': "Absolutely! We encourage comprehensive feedback that covers both product and customer service experience. You can post combined feedback on our website's review section, where you can rate and comment on multiple aspects of your experience. This holistic feedback is incredibly valuable as it gives us a complete picture of your customer journey. Feel free to be as detailed as you'd like - we read every review carefully!"
    },
    {
        'query': "I had a mixed experience and want to tell you what went well and what didn't",
        'category': 'FEEDBACK',
        'reference': "We truly appreciate your honesty and willingness to share both the positive and negative aspects of your experience. Balanced feedback like yours is incredibly valuable as it helps us understand what we're doing right and where we need to improve. Please feel free to provide your detailed review on our website's feedback section or reply to your order confirmation email. We take all feedback seriously and use it to continuously enhance our products and services."
    },
    {
        'query': "Is there a form where I can rate my recent purchase?",
        'category': 'FEEDBACK',
        'reference': "Yes, there certainly is! We'd love to hear your thoughts on your recent purchase. You can find our rating and review form on the product page of our website. Simply navigate to the item you purchased, scroll to the review section, and you'll see the rating form. Alternatively, check your order confirmation email - it should contain a direct link to submit your rating. Your feedback helps both us and other customers make better decisions."
    },

    # ORDER category - 7 novel wordings (29 training examples)
    {
        'query': "I need to change the quantity of items in my order before it ships",
        'category': 'ORDER',
        'reference': "I understand you'd like to modify the quantity of items in your order before it ships. I'm here to help with that. To change your order quantity, please provide me with your order number and let me know which items you'd like to adjust and the new quantities you need. I'll check if your order is still in processing status - if it hasn't shipped yet, we should be able to make the changes. If it's already shipped, we can discuss options like refusing delivery or setting up a return for the excess items."
    },
    {
        'query': "Can I swap one product for another in my existing order?",
        'category': 'ORDER',
        'reference': "I'd be happy to help you swap a product in your existing order. To assist you with this, I'll need your order number and details about which product you'd like to swap and what you'd like to replace it with. If your order hasn't shipped yet, we can often make this change directly. However, if it's already shipped, we'll need to process this as a return and new order. Let me check the status of your order - could you provide the order number?"
    },
    {
        'query': "My order is processing but I realized I need it delivered to a different address",
        'category': 'ORDER',
        'reference': "I understand you need to change the delivery address for your order that's currently processing. Time is of the essence here. Please provide your order number immediately so I can check if we can still update the shipping address before it leaves our warehouse. If the order hasn't been dispatched yet, we should be able to change the address. If it's already shipped, we may be able to reroute it or arrange for a hold at a carrier facility. What's your order number?"
    },
    {
        'query': "I accidentally ordered the wrong size, can I fix this before delivery?",
        'category': 'ORDER',
        'reference': "I completely understand - ordering the wrong size is a common concern and we're here to help! To assist you, I'll need your order number to check the status. If your order is still processing and hasn't shipped, we can usually cancel it and place a new order with the correct size. If it's already shipped, don't worry - you can return the wrong size and we'll send you the correct size. What's your order number and what size do you actually need?"
    },
    {
        'query': "Is it possible to split my order into two separate shipments?",
        'category': 'ORDER',
        'reference': "I appreciate you reaching out about splitting your order into separate shipments. This request depends on the current status of your order. If it's still being processed, we may be able to split it, though this might affect shipping costs. If you have specific items you need urgently while others can wait, please provide your order number and specify which items you'd like in each shipment. I'll check what's possible and provide you with options, including any additional shipping fees that may apply."
    },
    {
        'query': "I want to add more items to an order I placed yesterday",
        'category': 'ORDER',
        'reference': "I understand you'd like to add more items to your order from yesterday. Unfortunately, once an order is placed, we typically cannot add items to it directly as our system begins processing immediately. However, I have a couple of solutions: 1) You can place a new separate order for the additional items, or 2) If your original order hasn't shipped yet, we might be able to cancel it and help you place a new combined order. Could you provide your order number so I can check the status and see which option would work best for you?"
    },
    {
        'query': "My order shows delivered but I never received it, what should I do?",
        'category': 'ORDER',
        'reference': "I'm very sorry to hear your order shows as delivered but you haven't received it. This is certainly concerning and we'll help resolve this immediately. First, please check: 1) All possible delivery locations (front door, back door, mailbox, building lobby), 2) With family members or neighbors who might have accepted it, 3) Any safe place instructions you provided. If you still can't locate it, please provide your order number and we'll: investigate with the carrier, file a claim if necessary, and either refund you or send a replacement. Your satisfaction is our priority."
    }
]

In [8]:
import torch
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# ------------------------------------------------------------------
# 1. LOAD MODELS
# ------------------------------------------------------------------
print("\nLoading models...")

# Load retrieval system (SentenceTransformers + FAISS)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
train_embeddings = embedding_model.encode(
    df_train_pilot['instruction'].tolist(),
    show_progress_bar=False,
    convert_to_numpy=True
)
index = faiss.IndexFlatL2(train_embeddings.shape[1])
index.add(train_embeddings.astype('float32'))

# Load fine-tuned LLM
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
llm_model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)
llm_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora_pilot/final_model"
)

# Load Evaluators
print("Loading Evaluators...")
rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
# lang="en" downloads a standard model (usually roberta-large) for English
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=False)

print("✓ Models and Evaluators loaded")

# ------------------------------------------------------------------
# 2. TEST NOVEL QUERIES
# ------------------------------------------------------------------
retrieval_rouge_scores = []
retrieval_bert_scores = []
llm_rouge_scores = []
llm_bert_scores = []
retrieval_distances = []

print("\n" + "="*70)
print("TESTING NOVEL QUERIES (ROUGE & BERTSCORE)")
print("="*70)

for i, item in enumerate(novel_queries, 1):
    query = item['query']
    reference = item['reference']
    category = item['category']

    print(f"\n{'='*70}")
    print(f"Query {i}/{len(novel_queries)} - Category: {category}")
    print(f"Novel Query: {query}")
    print("-"*70)

    # --- RETRIEVAL ---
    query_emb = embedding_model.encode([query], convert_to_numpy=True)
    dist, ind = index.search(query_emb.astype('float32'), 1)
    retrieved_resp = df_train_pilot.iloc[ind[0][0]]['response']
    retrieved_query = df_train_pilot.iloc[ind[0][0]]['instruction']
    retrieved_category = df_train_pilot.iloc[ind[0][0]]['category']

    # Calculate Scores (Retrieval)
    ret_rouge = rouge_scorer_obj.score(reference, retrieved_resp)['rougeL'].fmeasure
    # BERTScore returns P, R, F1 tensors. We take F1.
    P, R, F1 = bert_scorer.score([retrieved_resp], [reference])
    ret_bert = F1.item()

    retrieval_distances.append(dist[0][0])

    print(f"\n📚 Retrieval:")
    print(f"  Matched to: '{retrieved_query[:70]}...'")
    print(f"  Matched category: {retrieved_category} {'✓' if retrieved_category == category else '✗ WRONG!'}")
    print(f"  Distance: {dist[0][0]:.3f}")
    print(f"  ROUGE-L:   {ret_rouge:.3f}")
    print(f"  BERTScore: {ret_bert:.3f}")
    print(f"  Response:  {retrieved_resp[:100]}...")

    # --- LLM GENERATION ---
    prompt = f"Customer: {query}\nAssistant:"
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=llm_tokenizer.eos_token_id
        )
    llm_resp = llm_tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1].strip()

    # Calculate Scores (LLM)
    llm_rouge = rouge_scorer_obj.score(reference, llm_resp)['rougeL'].fmeasure
    P, R, F1 = bert_scorer.score([llm_resp], [reference])
    llm_bert = F1.item()

    print(f"\n🔥 LLM:")
    print(f"  ROUGE-L:   {llm_rouge:.3f}")
    print(f"  BERTScore: {llm_bert:.3f}")
    print(f"  Response:  {llm_resp[:100]}...")

    # --- DETERMINE WINNER (Based on BERTScore) ---
    # We use BERTScore for the 'winner' logic as it correlates better with human judgment
    if llm_bert > ret_bert:
        winner = "🔥 LLM Wins"
    elif ret_bert > llm_bert:
        winner = "📚 Retrieval Wins"
    else:
        winner = "🤝 Tie"

    delta = abs(llm_bert - ret_bert)
    print(f"\nRESULT: {winner} (BERTScore Δ {delta:.3f})")

    # Append to lists
    retrieval_rouge_scores.append(ret_rouge)
    retrieval_bert_scores.append(ret_bert)
    llm_rouge_scores.append(llm_rouge)
    llm_bert_scores.append(llm_bert)

# ------------------------------------------------------------------
# 3. FINAL RESULTS
# ------------------------------------------------------------------
print("\n" + "="*70)
print("FINAL PERFORMANCE METRICS")
print("="*70)

print(f"Metric       | Retrieval Avg | LLM Avg | Difference")
print("-" * 55)
print(f"ROUGE-L      | {np.mean(retrieval_rouge_scores):.4f}        | {np.mean(llm_rouge_scores):.4f}  | {np.mean(llm_rouge_scores) - np.mean(retrieval_rouge_scores):+.4f}")
print(f"BERTScore    | {np.mean(retrieval_bert_scores):.4f}        | {np.mean(llm_bert_scores):.4f}  | {np.mean(llm_bert_scores) - np.mean(retrieval_bert_scores):+.4f}")

print("-" * 55)
print(f"Avg Ret Distance: {np.mean(retrieval_distances):.3f}")

# Calculate Win Rates (based on BERTScore)
llm_wins = sum([1 for l, r in zip(llm_bert_scores, retrieval_bert_scores) if l > r])
ret_wins = sum([1 for l, r in zip(llm_bert_scores, retrieval_bert_scores) if r > l])
ties = len(llm_bert_scores) - llm_wins - ret_wins

print(f"\nHead-to-head (BERTScore): LLM {llm_wins}, Retrieval {ret_wins}, Ties {ties}")

print("\n" + "="*70)
print("KEY INSIGHT")
print("="*70)

diff_bert = np.mean(llm_bert_scores) - np.mean(retrieval_bert_scores)

if diff_bert > 0.02:
    print("✅ LLM DOMINATES on semantic meaning!")
    print("   → The BERTScore indicates the LLM is generating responses that share")
    print("     more *meaning* with the reference, even if exact words differ.")
elif diff_bert > 0:
    print("⚠️  LLM slightly better on semantic meaning")
else:
    print("📚 Retrieval is more semantically accurate")
    print("   → The training data might be covering these queries well enough")
    print("     that the retrieval system's exact matches are superior.")


Loading models...


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading Evaluators...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Models and Evaluators loaded

TESTING NOVEL QUERIES (ROUGE & BERTSCORE)

Query 1/14 - Category: FEEDBACK
Novel Query: I'd like to share my thoughts about the product quality and service I received
----------------------------------------------------------------------

📚 Retrieval:
  Matched to: 'I want to send feedback for your company, help me...'
  Matched category: FEEDBACK ✓
  Distance: 0.871
  ROUGE-L:   0.423
  BERTScore: 0.906
  Response:  I'm on it! Your willingness to share feedback is truly appreciated, as it helps us understand your e...

🔥 LLM:
  ROUGE-L:   0.331
  BERTScore: 0.899
  Response:  We appreciate your feedback on the product quality and service you received. Your thoughts and opini...

RESULT: 📚 Retrieval Wins (BERTScore Δ 0.007)

Query 2/14 - Category: FEEDBACK
Novel Query: Where can I write about my experience with your company?
----------------------------------------------------------------------

📚 Retrieval:
  Matched to: 'I want to send feedback for you