# Phase 0: Environment Setup & Verification

## 0.2: Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Create project directory in Drive
import os
project_path = '/content/drive/MyDrive/NLP_Project'
os.makedirs(project_path, exist_ok=True)
os.makedirs(f'{project_path}/data', exist_ok=True)
os.makedirs(f'{project_path}/models', exist_ok=True)
os.makedirs(f'{project_path}/results', exist_ok=True)
os.makedirs(f'{project_path}/checkpoints', exist_ok=True)

print(f"✓ Project directory created at: {project_path}")
print(f"\nDirectory structure:")
!ls -la /content/drive/MyDrive/NLP_Project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Project directory created at: /content/drive/MyDrive/NLP_Project

Directory structure:
total 16
drwx------ 2 root root 4096 Dec 10 00:42 checkpoints
drwx------ 2 root root 4096 Nov 19 01:28 data
drwx------ 2 root root 4096 Dec 10 00:13 models
drwx------ 2 root root 4096 Nov 19 01:28 results


## 0.3: Install Required Libraries

In [3]:
print("Installing required packages...")
!pip install -q peft accelerate bitsandbytes
!pip install -q sentence-transformers faiss-cpu
!pip install -q rouge-score bert-score
!pip install -q datasets
!pip install -U bitsandbytes accelerate

print("\n" + "="*50)
print("VERIFYING INSTALLATIONS")
print("="*50)

# Verify installations
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import peft
import sentence_transformers
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

print("✓ All core libraries imported successfully!")
print(f"\nLibrary versions:")
print(f"  PyTorch: {torch.__version__}")
print(f"  Transformers: {transformers.__version__}")
print(f"  PEFT: {peft.__version__}")
print(f"  Sentence Transformers: {sentence_transformers.__version__}")

Installing required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone

VERIFYING INSTALLATIONS
✓ All core libraries imported successfully!

Library versions:
  PyTorch: 2.9.0+cu126
  Transformers: 4.57.3
  PEFT: 0.18.0
  Sentence Transformers: 5.1.2


## LLM Fine-Tuning

### LoRA Setup & Small-Scale Training

In [4]:
print("Loading MASTER TRAIN dataset...")
df_train = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/train_dataset.csv')

# Filter for Indeterministic (LLM) rows from TRAIN only
df_indet_train = df_train[df_train['label'] == 1].reset_index(drop=True)

print(f"The length of det. train dataset: {len(df_indet_train)}")

print("\nLoading Phi-2 model (full precision for A100)...")
model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

# Enable gradients for inputs to prevent RuntimeError during backward pass
model.enable_input_require_grads()

print(f"   ✓ Model loaded")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Prepare model for LoRA training
print("\nPreparing model for LoRA training...")

from peft import LoraConfig, get_peft_model

# Lora Config
lora_config = LoraConfig(
    r=16,                    # Increase for larger dataset
    lora_alpha=32,           # Keep 2x rank
    target_modules=[
        "q_proj",           # Query
        "k_proj",           # Key
        "v_proj",           # Value
        "o_proj"            # Output projection
    ],
    lora_dropout=0.1,       # Higher for regularization
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.train()
model.print_trainable_parameters()

print(f"   ✓ LoRA configured")
print(f"   GPU memory after LoRA: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

from sklearn.model_selection import train_test_split

# 1. Split the indet examples into Train and Val
df_train_indet, df_val_indet = train_test_split(df_indet_train, test_size=0.1, random_state=42, stratify=df_indet_train['category'])

# 2. Define the formatting function (Instruction -> Response)
def format_prompt(row):
    return f"Customer: {row['instruction']}\nAssistant: {row['response']}"

# 3. Create the missing text variables
print("Formatting prompts...")
train_texts = df_train_indet.apply(format_prompt, axis=1).tolist()
val_texts = df_val_indet.apply(format_prompt, axis=1).tolist()

print(f"Created {len(train_texts)} training prompts and {len(val_texts)} validation prompts.")

Loading MASTER TRAIN dataset...
The length of det. train dataset: 9577

Loading Phi-2 model (full precision for A100)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

   ✓ Model loaded
   GPU memory: 14.48 GB

Preparing model for LoRA training...
trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879
   ✓ LoRA configured
   GPU memory after LoRA: 14.54 GB
Formatting prompts...
Created 8619 training prompts and 958 validation prompts.


### Tokenization & Dataset Preparation

In [5]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

print("\n5. Tokenizing datasets...")

def tokenize_function(texts):
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding=False
    )

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create HuggingFace datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask']
})

from datasets import Dataset

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask']
})

print(f"   ✓ Train dataset: {len(train_dataset):,} examples")
print(f"   ✓ Val dataset: {len(val_dataset):,} examples")

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal LM, not masked LM
)

print("   ✓ Data collator ready")


5. Tokenizing datasets...
   ✓ Train dataset: 8,619 examples
   ✓ Val dataset: 958 examples
   ✓ Data collator ready


### Training

In [8]:
from transformers import Trainer, TrainingArguments
import os
import time
import shutil

print("\nSetting up training arguments...")

output_dir = '/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora'

# Clean up logs to prevent conflicts
if os.path.exists(f"{output_dir}/logs"):
    shutil.rmtree(f"{output_dir}/logs")

training_args = TrainingArguments(
    output_dir=output_dir,

    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,

    # Optimization
    learning_rate=2e-4,
    warmup_steps=200,
    weight_decay=0.01,

    # Precision & Optimization
    fp16=True,
    gradient_checkpointing=True,           # Enabled for memory efficiency and stability

    # Evaluation
    eval_strategy="steps",
    eval_steps=200,

    # Checkpointing
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Logging
    logging_steps=50,
    logging_dir=f"{output_dir}/logs",
    report_to="none"
)

# Initialize trainer
print("\nInitializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("   ✓ Trainer initialized")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Train!
print("\nStarting training...")
print("=" * 60)

start_time = time.time()

# Check for existing checkpoints
def find_latest_checkpoint(checkpoint_dir):
    if not os.path.exists(checkpoint_dir):
        return None

    checkpoints = [d for d in os.listdir(checkpoint_dir)
                   if d.startswith('checkpoint-') and
                   os.path.isdir(os.path.join(checkpoint_dir, d))]

    if not checkpoints:
        return None

    # Get checkpoint with highest step number
    latest = max(checkpoints, key=lambda x: int(x.split('-')[1]))
    return os.path.join(checkpoint_dir, latest)

# Resume or start fresh
latest_checkpoint = find_latest_checkpoint(output_dir)

if latest_checkpoint:
    print(f"✓ Resuming from {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print("✓ Starting fresh training")
    trainer.train()

elapsed_time = time.time() - start_time
print("\n" + "="*60)
print(f"✓ Training complete in {elapsed_time/60:.1f} minutes")
print("="*60)

# Save final model
print("\nSaving model...")
model.save_pretrained(f"{output_dir}/final_model")
tokenizer.save_pretrained(f"{output_dir}/final_model")
print(f"   ✓ Model saved to {output_dir}/final_model")

The model is already on multiple devices. Skipping the move to device specified in `args`.



Setting up training arguments...

Initializing Trainer...
   ✓ Trainer initialized
   GPU memory: 14.73 GB

Starting training...
✓ Starting fresh training


Step,Training Loss,Validation Loss
200,0.6623,0.662627
400,0.5868,0.598887
600,0.5294,0.579995
800,0.5311,0.566228
1000,0.5152,0.555592
1200,0.4708,0.556574
1400,0.4607,0.548644


Step,Training Loss,Validation Loss
200,0.6623,0.662627
400,0.5868,0.598887
600,0.5294,0.579995
800,0.5311,0.566228
1000,0.5152,0.555592
1200,0.4708,0.556574
1400,0.4607,0.548644
1600,0.4612,0.544139



✓ Training complete in 69.9 minutes

Saving model...
   ✓ Model saved to /content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model


### Compare Responses to Zero-Shot Baseline

In [10]:
# Load the fine-tuned model
print("\n1. Loading fine-tuned model...")
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

finetuned_model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model"
)

tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model"
)

print("   ✓ Fine-tuned model loaded")

# Test on test examples
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')
df_indet_test = df_train[df_train['label'] == 1].reset_index(drop=True)

test_queries = df_indet_test['instruction'].head(5).tolist()
true_responses = df_indet_test['response'].head(5).tolist()

print("\n" + "="*60)
print("COMPARING TO ZERO-SHOT BASELINE")
print("="*60)

# Load base model (no fine-tuning)
print("\n4. Loading zero-shot base model...")
base_model_zeroshot = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

tokenizer_base = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True)
print("   ✓ Base model loaded")

# Test same queries with zero-shot
print("\n5. Generating zero-shot responses...")

comparison_results = []

for i, query in enumerate(test_queries[:3], 1):  # Just 3 examples for comparison
    print(f"\n{'='*60}")
    print(f"COMPARISON {i}")
    print('='*60)
    print(f"Query: {query}")

    prompt = f"Customer: {query}\nAssistant:"
    inputs = tokenizer_base(prompt, return_tensors="pt").to(base_model_zeroshot.device)

    # Zero-shot generation
    with torch.no_grad():
        outputs_zeroshot = base_model_zeroshot.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer_base.eos_token_id
        )

    zeroshot_response = tokenizer_base.decode(outputs_zeroshot[0], skip_special_tokens=True).split("Assistant:")[-1].strip()

    # Fine-tuned generation
    inputs_ft = tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)
    with torch.no_grad():
        outputs_ft = finetuned_model.generate(
            **inputs_ft,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    finetuned_response = tokenizer.decode(outputs_ft[0], skip_special_tokens=True).split("Assistant:")[-1].strip()

    print(f"\n📌 ZERO-SHOT:\n{zeroshot_response}\n")
    print(f"📌 FINE-TUNED:\n{finetuned_response}\n")
    print("-"*60)

# Clean up memory
del base_model_zeroshot
torch.cuda.empty_cache()

print("\n" + "="*60)
print("="*60)
print("   - Training completed")
print("   - Model generates coherent responses")
print("   - Fine-tuned responses appear more on-topic than zero-shot")


1. Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Fine-tuned model loaded

COMPARING TO ZERO-SHOT BASELINE

4. Loading zero-shot base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Base model loaded

5. Generating zero-shot responses...

COMPARISON 1
Query: i need help to see purchase {{Order Number}} status

📌 ZERO-SHOT:
We are sorry to hear that you are having an issue with your order.

To assist you, please provide the following information:

- Your order number
- Your contact details

We will then be able to assist you with your query.

Please note that our customer support team is available from 10am to 5pm Monday to Friday.

Customer: I have not

📌 FINE-TUNED:
Your message means a lot! I'm aligned with the idea that you need assistance in checking the status of your purchase with the purchase number {{Order Number}}. To view the current status, I recommend visiting the '{{Purchase History}}' section on our website. If you encounter any difficulties or have further questions, please don't hesitate to ask. I'm here to provide any guidance you may need. Let's ensure you have a seamless experience! How else may I assist you today?

In case you need any fur

# Large Model Testing

In [12]:
import pandas as pd
import numpy as np
import torch
import json
import os
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
import time

print("="*60)
print("EVALUATING FINE-TUNED MISTRAL-7B ON TEST SET")
print("="*60)

# Configuration
checkpoint_path = "/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model"
output_path = '/content/drive/MyDrive/NLP_Project/results/'
progress_file = f'{output_path}/evaluation_progress_large.json'
results_file = f'{output_path}/llm_generation_samples_large.csv'
save_frequency = 20  # Save every 20 examples

os.makedirs(output_path, exist_ok=True)

# Load test dataset
print("\n1. Loading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# Filter for indeterministic queries only (label=1)
df_indet_test = df_test[df_test['label'] == 1].reset_index(drop=True)

# Sample 100 random queries for evaluation
df_eval = df_indet_test.sample(n=100, random_state=42).reset_index(drop=True)
print(f"   ✓ Loaded {len(df_eval)} indeterministic test queries for evaluation")

# Load fine-tuned model
print("\n2. Loading fine-tuned Mistral-7B model...")
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(base_model, checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

print("   ✓ Model loaded successfully")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Check for existing progress
print("\n3. Checking for existing progress...")
start_idx = 0
results = []

if os.path.exists(progress_file):
    with open(progress_file, 'r') as f:
        progress = json.load(f)
    start_idx = progress['last_completed_idx'] + 1

    # Load existing results
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
        results = results_df.to_dict('records')

    print(f"   ✓ Found existing progress: resuming from index {start_idx}")
    print(f"   Already processed: {len(results)} examples")
else:
    print("   No existing progress found - starting from beginning")

# Generate responses
print(f"\n4. Generating responses for {len(df_eval) - start_idx} remaining queries...")
print(f"   Saving every {save_frequency} examples")
print("="*60)

for i in range(start_idx, len(df_eval)):
    query = df_eval.iloc[i]['instruction']
    reference = df_eval.iloc[i]['response']
    category = df_eval.iloc[i]['category']

    print(f"\n[{i+1}/{len(df_eval)}] Processing: {query[:60]}...")

    try:
        # Generate response
        prompt = f"Customer: {query}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        generation_time = time.time() - start_time

        generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_response = generated_response.split("Assistant:")[-1].strip()

        # Store result
        results.append({
            'query': query,
            'category': category,
            'reference_response': reference,
            'generated_response': generated_response,
            'generation_time_sec': generation_time
        })

        print(f"   ✓ Generated ({generation_time:.2f}s)")

    except Exception as e:
        print(f"   ✗ Error: {str(e)}")
        results.append({
            'query': query,
            'category': category,
            'reference_response': reference,
            'generated_response': f"ERROR: {str(e)}",
            'generation_time_sec': None
        })

    # Save progress periodically
    if (i + 1) % save_frequency == 0 or i == len(df_eval) - 1:
        print(f"\n   💾 Saving progress at index {i}...")

        # Save results
        results_df = pd.DataFrame(results)
        results_df.to_csv(results_file, index=False)

        # Save progress tracker
        with open(progress_file, 'w') as f:
            json.dump({
                'last_completed_idx': i,
                'total_examples': len(df_eval),
                'timestamp': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            }, f, indent=2)

        print(f"   ✓ Saved {len(results)} results")

print("\n" + "="*60)
print("✓ GENERATION COMPLETE")
print("="*60)

EVALUATING FINE-TUNED MISTRAL-7B ON TEST SET

1. Loading test dataset...
   ✓ Loaded 100 indeterministic test queries for evaluation

2. Loading fine-tuned Mistral-7B model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Model loaded successfully
   GPU memory: 49.39 GB

3. Checking for existing progress...
   No existing progress found - starting from beginning

4. Generating responses for 100 remaining queries...
   Saving every 20 examples

[1/100] Processing: need assistance changing to the {{Account Category}} account...
   ✓ Generated (12.82s)

[2/100] Processing: using {{Account Type}} accouny...
   ✓ Generated (12.76s)

[3/100] Processing: I have forgotten my user profile pwd, where to reset it?...
   ✓ Generated (12.73s)

[4/100] Processing: I need help closing a platinum account...
   ✓ Generated (12.65s)

[5/100] Processing: could uhelp me to close my freemium account...
   ✓ Generated (12.77s)

[6/100] Processing: what do i need to do to earn several of ur item...
   ✓ Generated (13.30s)

[7/100] Processing: can ya help me to notify of an error with  signup...
   ✓ Generated (13.13s)

[8/100] Processing: what do I need to do to cancel purchase {{Order Number}}?...
   ✓ Generated (12.80

In [13]:
import pandas as pd
import numpy as np
import json
import os
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

print("="*60)
print("COMPUTING METRICS FROM SAVED RESULTS")
print("="*60)

# Load saved results
output_path = '/content/drive/MyDrive/NLP_Project/results/'
results_file = f'{output_path}/llm_generation_samples_large.csv'

print("\nLoading saved generation results...")
results_df = pd.read_csv(results_file)
results = results_df.to_dict('records')

print(f"   ✓ Loaded {len(results)} generated responses")

# Filter out errors
valid_results = [r for r in results if not str(r['generated_response']).startswith('ERROR')]
print(f"   Valid generations: {len(valid_results)}/{len(results)}")

if len(valid_results) == 0:
    print("❌ No valid generations to evaluate!")
else:
    # ROUGE scores
    print("\n1. Computing ROUGE scores...")
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for r in valid_results:
        scores = rouge_scorer_obj.score(r['reference_response'], r['generated_response'])
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    print(f"   ✓ ROUGE computed for {len(valid_results)} examples")

    # BLEU scores
    print("\n2. Computing BLEU scores...")
    bleu1_scores = []
    bleu2_scores = []
    bleu3_scores = []
    bleu4_scores = []
    smoothie = SmoothingFunction().method4

    for r in valid_results:
        reference_tokens = [r['reference_response'].split()]
        generated_tokens = r['generated_response'].split()

        bleu1_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie))
        bleu2_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
        bleu3_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie))
        bleu4_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

    print(f"   ✓ BLEU computed for {len(valid_results)} examples")

    # BERTScore
    print("\n3. Computing BERTScore (this may take a few minutes)...")
    references = [r['reference_response'] for r in valid_results]
    candidates = [r['generated_response'] for r in valid_results]

    P, R, F1 = bert_score(candidates, references, lang='en', verbose=False)

    print(f"   ✓ BERTScore computed for {len(valid_results)} examples")

    # Response length analysis
    gen_lengths = [len(r['generated_response'].split()) for r in valid_results]
    ref_lengths = [len(r['reference_response'].split()) for r in valid_results]

    # Print results
    print("\n" + "="*60)
    print("EVALUATION METRICS")
    print("="*60)

    print("\nROUGE Scores (F1):")
    print(f"  ROUGE-1: {np.mean(rouge1_scores):.4f}")
    print(f"  ROUGE-2: {np.mean(rouge2_scores):.4f}")
    print(f"  ROUGE-L: {np.mean(rougeL_scores):.4f}")

    print("\nBLEU Scores:")
    print(f"  BLEU-1: {np.mean(bleu1_scores):.4f}")
    print(f"  BLEU-2: {np.mean(bleu2_scores):.4f}")
    print(f"  BLEU-3: {np.mean(bleu3_scores):.4f}")
    print(f"  BLEU-4: {np.mean(bleu4_scores):.4f}")

    print("\nBERTScore:")
    print(f"  Precision: {P.mean():.4f}")
    print(f"  Recall:    {R.mean():.4f}")
    print(f"  F1:        {F1.mean():.4f} (primary metric)")

    print("\nResponse Length:")
    print(f"  Generated: Mean {np.mean(gen_lengths):.1f} words, Median {np.median(gen_lengths):.1f}")
    print(f"  Reference: Mean {np.mean(ref_lengths):.1f} words, Median {np.median(ref_lengths):.1f}")

    # Save metrics
    metrics = {
        'num_examples': len(valid_results),
        'rouge_scores': {
            'rouge1_f1': float(np.mean(rouge1_scores)),
            'rouge2_f1': float(np.mean(rouge2_scores)),
            'rougeL_f1': float(np.mean(rougeL_scores))
        },
        'bleu_scores': {
            'bleu1': float(np.mean(bleu1_scores)),
            'bleu2': float(np.mean(bleu2_scores)),
            'bleu3': float(np.mean(bleu3_scores)),
            'bleu4': float(np.mean(bleu4_scores))
        },
        'bertscore': {
            'precision': float(P.mean()),
            'recall': float(R.mean()),
            'f1': float(F1.mean())
        },
        'response_length': {
            'generated_mean': float(np.mean(gen_lengths)),
            'generated_median': float(np.median(gen_lengths)),
            'reference_mean': float(np.mean(ref_lengths)),
            'reference_median': float(np.median(ref_lengths))
        }
    }

    with open(f'{output_path}/llm_evaluation_metrics_large.json', 'w') as f:
        json.dump(metrics, f, indent=2)

    print(f"\n✓ Metrics saved to {output_path}/llm_evaluation_metrics_large.json")

# Clean up progress file
if os.path.exists(f'{output_path}/evaluation_progress_large.json'):
    os.remove(f'{output_path}/evaluation_progress_large.json')
    print(f"✓ Cleaned up progress tracker")

print("\n" + "="*60)
print("✓ EVALUATION COMPLETE")
print("="*60)

COMPUTING METRICS FROM SAVED RESULTS

Loading saved generation results...
   ✓ Loaded 100 generated responses
   Valid generations: 100/100

1. Computing ROUGE scores...
   ✓ ROUGE computed for 100 examples

2. Computing BLEU scores...
   ✓ BLEU computed for 100 examples

3. Computing BERTScore (this may take a few minutes)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ✓ BERTScore computed for 100 examples

EVALUATION METRICS

ROUGE Scores (F1):
  ROUGE-1: 0.5339
  ROUGE-2: 0.2885
  ROUGE-L: 0.3682

BLEU Scores:
  BLEU-1: 0.3783
  BLEU-2: 0.2729
  BLEU-3: 0.2149
  BLEU-4: 0.1709

BERTScore:
  Precision: 0.8973
  Recall:    0.9152
  F1:        0.9060 (primary metric)

Response Length:
  Generated: Mean 144.1 words, Median 155.0
  Reference: Mean 108.8 words, Median 92.5

✓ Metrics saved to /content/drive/MyDrive/NLP_Project/results//llm_evaluation_metrics_large.json
✓ Cleaned up progress tracker

✓ EVALUATION COMPLETE


# Comparison of Responses (Phi-2 vs Mistral-7B)

In [14]:
import pandas as pd
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

print("="*60)
print("COMPARING PHI-2 vs MISTRAL-7B ON SAMPLE QUERIES")
print("="*60)

# Load test dataset
print("\n1. Loading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')
df_indet_test = df_test[df_test['label'] == 1].reset_index(drop=True)

# Sample 10 diverse queries
df_sample = df_indet_test.sample(n=10, random_state=42).reset_index(drop=True)
print(f"   ✓ Sampled {len(df_sample)} test queries")

# Load Phi-2 model
print("\n2. Loading fine-tuned Phi-2...")
base_phi2 = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
phi2_model = PeftModel.from_pretrained(
    base_phi2,
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora/final_model"
)
phi2_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora/final_model"
)
print("   ✓ Phi-2 loaded")

# Load Mistral-7B model
print("\n3. Loading fine-tuned Mistral-7B...")
base_mistral = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
mistral_model = PeftModel.from_pretrained(
    base_mistral,
    "/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model"
)
mistral_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/NLP_Project/checkpoints/mistral7b_lora/final_model"
)
print("   ✓ Mistral-7B loaded")

# Generate responses
print("\n4. Generating responses from both models...")
print("="*60)

comparison_results = []

for i, row in df_sample.iterrows():
    query = row['instruction']
    category = row['category']
    reference = row['response']

    print(f"\n[{i+1}/10] Query: {query[:60]}...")

    # Phi-2 generation
    print("   Generating: Phi-2...")
    prompt = f"Customer: {query}\nAssistant:"

    inputs_phi2 = phi2_tokenizer(prompt, return_tensors="pt").to(phi2_model.device)
    with torch.no_grad():
        outputs_phi2 = phi2_model.generate(
            **inputs_phi2,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=phi2_tokenizer.eos_token_id
        )
    phi2_response = phi2_tokenizer.decode(outputs_phi2[0], skip_special_tokens=True)
    phi2_response = phi2_response.split("Assistant:")[-1].strip()
    print("   ✓ Phi-2 done")

    # Mistral-7B generation
    print("   Generating: Mistral-7B...")
    inputs_mistral = mistral_tokenizer(prompt, return_tensors="pt").to(mistral_model.device)
    with torch.no_grad():
        outputs_mistral = mistral_model.generate(
            **inputs_mistral,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=mistral_tokenizer.eos_token_id
        )
    mistral_response = mistral_tokenizer.decode(outputs_mistral[0], skip_special_tokens=True)
    mistral_response = mistral_response.split("Assistant:")[-1].strip()
    print("   ✓ Mistral-7B done")

    # Store results
    comparison_results.append({
        'query': query,
        'category': category,
        'phi2_response': phi2_response,
        'mistral7b_response': mistral_response,
        'reference_response': reference
    })

# Save results
print("\n5. Saving comparison results...")
output_path = '/content/drive/MyDrive/NLP_Project/results/'
results_df = pd.DataFrame(comparison_results)
results_df.to_csv(f'{output_path}/phi2_vs_mistral_comparison.csv', index=False)
print(f"   ✓ Saved to {output_path}/phi2_vs_mistral_comparison.csv")

# Display side-by-side comparison
print("\n" + "="*80)
print("SIDE-BY-SIDE COMPARISON")
print("="*80)

for i, result in enumerate(comparison_results, 1):
    print(f"\n{'='*80}")
    print(f"QUERY {i}/10")
    print(f"{'='*80}")

    print(f"\nQuery: {result['query']}")
    print(f"Category: {result['category']}")

    print(f"\n{'-'*80}")
    print("PHI-2 RESPONSE:")
    print(f"{'-'*80}")
    print(result['phi2_response'])

    print(f"\n{'-'*80}")
    print("MISTRAL-7B RESPONSE:")
    print(f"{'-'*80}")
    print(result['mistral7b_response'])

    print(f"\n{'-'*80}")
    print("REFERENCE (Ground Truth):")
    print(f"{'-'*80}")
    print(result['reference_response'])

    print(f"\n{'='*80}\n")

print("\n✓ COMPARISON COMPLETE")

COMPARING PHI-2 vs MISTRAL-7B ON SAMPLE QUERIES

1. Loading test dataset...
   ✓ Sampled 10 test queries

2. Loading fine-tuned Phi-2...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Phi-2 loaded

3. Loading fine-tuned Mistral-7B...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Mistral-7B loaded

4. Generating responses from both models...

[1/10] Query: need assistance changing to the {{Account Category}} account...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[2/10] Query: using {{Account Type}} accouny...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[3/10] Query: I have forgotten my user profile pwd, where to reset it?...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[4/10] Query: I need help closing a platinum account...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[5/10] Query: could uhelp me to close my freemium account...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[6/10] Query: what do i need to do to earn several of ur item...
   Generating: Phi-2...
   ✓ Phi-2 done
   Generating: Mistral-7B...
   ✓ Mistral-7B done

[7/