In [2]:
# ===================================================================
# IMPROVED BERT SCORE EVALUATION FOR GOOGLE COLAB
# Fixed Version with Multiple Prompt Format Options
# ===================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q bert-score sentencepiece accelerate bitsandbytes peft transformers

# Note: After running Cell 1, restart runtime: Runtime → Restart runtime
# Then run cells 2-10 (skip cell 1 after restart)


# ========== CELL 2: Authentication (if using gated models) ==========
from huggingface_hub import login

# Option A: Manual login (you'll paste token when prompted)
login()

# Option B: Use Colab Secrets (recommended)
# Uncomment below if you set up HF_TOKEN in Colab secrets
# from google.colab import userdata
# hf_token = userdata.get('HF_TOKEN')
# login(token=hf_token)


# ========== CELL 3: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from bert_score import score
import pandas as pd
import time


# ========== CELL 4: Define Your Dataset ==========
# CUSTOMIZE THIS: Replace with your own test questions and expected answers
dataset = [
    {
        "question": "What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supreme motorcycle?",
        "answer": "Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for the engine of the BSA D14/4 Bantam Supreme motorcycle."
    },
    {
        "question": "Where should an inexperienced owner consult for assistance with major repair work?",
        "answer": "His B.S.A. dealer"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the B.S.A. guarantee?",
        "answer": "Claim assistance through the dealer from whom the motorcycle was purchased."
    },
    {
        "question": "What is the correct address of the B.S.A. Service Department?",
        "answer": "B.S.A. MOTOR CYCLES LIMITED, SERVICE DEPARTMENT, ARMOURY ROAD, BIRMINGHAM 11"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the guarantee for a new motorcycle?",
        "answer": "The owner must do so through the dealer from whom the machine was purchased."
    },
    {
        "question": "What is the recommended torque wrench setting for the Supreme model?",
        "answer": "1 to 3"
    }
]


# ========== CELL 5: Configure Your Model ==========
# CUSTOMIZE THIS: Change to your model name
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"
base_model_name = "google/gemma-2-2b"

# IMPORTANT: Choose which prompt format to use
# Try different formats until you find what works for YOUR model
PROMPT_FORMAT = "simple"  # Options: "simple", "instruction", "gemma", "chat"

print("🔧 Model Configuration:")
print(f"   Adapter: {adapter_name}")
print(f"   Base Model: {base_model_name}")
print(f"   Prompt Format: {PROMPT_FORMAT}")


# ========== CELL 6: Load Model ==========
print("\n📥 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

print("📥 Loading base model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

print("📥 Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model loaded successfully!\n")


# ========== CELL 7: Define Answer Generation Function with Multiple Format Options ==========
def generate_answer(question, max_new_tokens=100, temperature=0.1):
    """
    Generate answer with multiple prompt format options.

    Args:
        question: The input question
        max_new_tokens: Maximum tokens to generate (reduced to 100)
        temperature: Generation randomness (0.1=very deterministic)

    Returns:
        Generated answer text
    """

    # Choose prompt format based on PROMPT_FORMAT setting
    if PROMPT_FORMAT == "simple":
        # Simple Q&A format (most common for fine-tuned models)
        prompt = f"{question}\n"

    elif PROMPT_FORMAT == "instruction":
        # Instruction-style format
        prompt = f"### Question:\n{question}\n\n### Answer:\n"

    elif PROMPT_FORMAT == "gemma":
        # Gemma 2 Chat Template Format
        prompt = f"<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"

    elif PROMPT_FORMAT == "chat":
        # Generic chat format
        prompt = f"User: {question}\nAssistant:"

    else:
        # Default fallback
        prompt = question

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=False,  # Greedy decoding for most factual answers
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2  # Prevent repetition
        )

    # Decode
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract answer based on format
    if PROMPT_FORMAT == "simple":
        # Remove the prompt
        answer = generated_text[len(prompt):].strip()

    elif PROMPT_FORMAT == "instruction":
        if "### Answer:" in generated_text:
            answer = generated_text.split("### Answer:")[-1].strip()
        else:
            answer = generated_text[len(prompt):].strip()

    elif PROMPT_FORMAT == "gemma":
        if "<start_of_turn>model" in generated_text:
            answer = generated_text.split("<start_of_turn>model")[-1].strip()
            if "<end_of_turn>" in answer:
                answer = answer.split("<end_of_turn>")[0].strip()
        else:
            answer = generated_text[len(prompt):].strip()

    elif PROMPT_FORMAT == "chat":
        if "Assistant:" in generated_text:
            answer = generated_text.split("Assistant:")[-1].strip()
        else:
            answer = generated_text[len(prompt):].strip()

    else:
        answer = generated_text.strip()

    # Clean up common issues
    answer = answer.split('\n')[0].strip()  # Take only first line

    # Handle empty answers
    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    return answer


# ========== CELL 8: Generate Predictions ==========
print("=" * 70)
print("🤖 GENERATING ANSWERS")
print("=" * 70)

predictions = []
references = []
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\n📌 Question {i+1}/{len(dataset)}")
    print(f"Q: {question[:80]}...")

    # Time the generation
    start_time = time.time()
    prediction = generate_answer(question, temperature=0.1)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    print(f"✨ Generated: {prediction[:100]}...")
    print(f"📖 Reference: {reference[:100]}...")
    print(f"⏱️  Time: {gen_time:.2f}s")

    predictions.append(prediction)
    references.append(reference)

print(f"\n⏱️  Average generation time: {sum(generation_times)/len(generation_times):.2f}s")


# ========== CELL 9: Calculate BERT Score ==========
print("\n" + "=" * 70)
print("📊 CALCULATING BERT SCORES")
print("=" * 70)

P, R, F1 = score(
    predictions,
    references,
    lang="en",
    verbose=True,
    rescale_with_baseline=True
)


# ========== CELL 10: Display Detailed Results ==========
print("\n" + "=" * 70)
print("📈 BERT SCORE RESULTS (DETAILED)")
print("=" * 70)

for i, item in enumerate(dataset):
    print(f"\n{'='*70}")
    print(f"Question {i+1}: {item['question'][:60]}...")
    print(f"{'-'*70}")
    print(f"Generated: {predictions[i][:120]}...")
    print(f"Reference: {references[i][:120]}...")
    print(f"{'-'*70}")
    print(f"  📊 Precision: {P[i].item():.4f}")
    print(f"  📊 Recall:    {R[i].item():.4f}")
    print(f"  📊 F1 Score:  {F1[i].item():.4f}")  # FIXED: Was 'F' now 'F1'


# ========== CELL 11: Display Summary Statistics ==========
print("\n" + "=" * 70)
print("🎯 AVERAGE BERT SCORES")
print("=" * 70)

avg_precision = P.mean().item()
avg_recall = R.mean().item()
avg_f1 = F1.mean().item()

print(f"\n  📊 Average Precision: {avg_precision:.4f} ({avg_precision*100:.2f}%)")
print(f"  📊 Average Recall:    {avg_recall:.4f} ({avg_recall*100:.2f}%)")
print(f"  📊 Average F1 Score:  {avg_f1:.4f} ({avg_f1*100:.2f}%)")

# Score interpretation
print("\n" + "=" * 70)
print("📖 SCORE INTERPRETATION")
print("=" * 70)
print("\nBERT Score Range: -1.0 (worst) to 1.0 (best)")
print("\nQuality Guide:")
print("  🟢 0.7 - 1.0  : Excellent semantic similarity")
print("  🟡 0.5 - 0.7  : Good similarity")
print("  🟠 0.3 - 0.5  : Moderate similarity")
print("  🔴 0.0 - 0.3  : Poor similarity")
print("  ⚫ < 0.0       : Very poor / opposing meaning")

if avg_f1 >= 0.7:
    status = "🟢 EXCELLENT"
elif avg_f1 >= 0.5:
    status = "🟡 GOOD"
elif avg_f1 >= 0.3:
    status = "🟠 MODERATE"
else:
    status = "🔴 NEEDS IMPROVEMENT"

print(f"\nYour Model Status: {status}")


# ========== CELL 12: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [item['question'][:50] + '...' if len(item['question']) > 50 else item['question'] for item in dataset],
    'Generated': [p[:50] + '...' if len(p) > 50 else p for p in predictions],
    'Reference': [r[:50] + '...' if len(r) > 50 else r for r in references],
    'Precision': [f"{p.item():.4f}" for p in P],
    'Recall': [f"{r.item():.4f}" for r in R],
    'F1': [f"{f.item():.4f}" for f in F1]
})

print("\n" + "=" * 70)
print("📋 RESULTS SUMMARY TABLE")
print("=" * 70)
display(results_df)


# ========== CELL 13: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('bert_score_results.csv', index=False)
# print("\n✅ Results saved to 'bert_score_results.csv'")

# from google.colab import files
# files.download('bert_score_results.csv')


# ========== CELL 14: Test All Prompt Formats (DIAGNOSTIC) ==========
# Run this cell to test which format works best for your model

print("\n" + "=" * 70)
print("🧪 TESTING ALL PROMPT FORMATS")
print("=" * 70)

test_question = dataset[0]["question"]
formats = ["simple", "instruction", "gemma", "chat"]

print(f"\nTest Question: {test_question}\n")

for fmt in formats:
    old_format = PROMPT_FORMAT
    PROMPT_FORMAT = fmt

    answer = generate_answer(test_question, max_new_tokens=80, temperature=0.1)

    print(f"\n{'='*70}")
    print(f"Format: {fmt.upper()}")
    print(f"{'-'*70}")
    print(f"Answer: {answer[:150]}")

    PROMPT_FORMAT = old_format

print("\n" + "=" * 70)
print("💡 RECOMMENDATION:")
print("Look at the outputs above and choose the format that gives actual")
print("answers (not repetitions). Then update PROMPT_FORMAT in Cell 5!")
print("=" * 70)



🔧 Model Configuration:
   Adapter: Prithwiraj731/Gemma2-2b_Two-Wheeler
   Base Model: google/gemma-2-2b
   Prompt Format: simple

📥 Loading tokenizer...
📥 Loading base model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

📥 Loading LoRA adapter...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model loaded successfully!

🤖 GENERATING ANSWERS

📌 Question 1/6
Q: What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supre...
✨ Generated: The oil should be a good grade, and not too thin....
📖 Reference: Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for ...
⏱️  Time: 1.47s

📌 Question 2/6
Q: Where should an inexperienced owner consult for assistance with major repair wor...
✨ Generated: A. The dealer is the only one who can perform all repairs and service operations on your vehicle, as...
📖 Reference: His B.S.A. dealer...
⏱️  Time: 5.32s

📌 Question 3/6
Q: What is the recommended procedure for claiming assistance under the B.S.A. guara...
✨ Generated: The following should be observed:...
📖 Reference: Claim assistance through the dealer from whom the motorcycle was purchased....
⏱️  Time: 6.95s

📌 Question 4/6
Q: What is the correct address of the B.S.A. Service Department?...
✨ Generated: The following are s

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 37.86 sentences/sec

📈 BERT SCORE RESULTS (DETAILED)

Question 1: What is the recommended lubrication for the engine of the BS...
----------------------------------------------------------------------
Generated: The oil should be a good grade, and not too thin....
Reference: Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for the engine of the BS...
----------------------------------------------------------------------
  📊 Precision: 0.2918
  📊 Recall:    0.0269
  📊 F1 Score:  0.1573

Question 2: Where should an inexperienced owner consult for assistance w...
----------------------------------------------------------------------
Generated: A. The dealer is the only one who can perform all repairs and service operations on your vehicle, as he has been trained...
Reference: His B.S.A. dealer...
----------------------------------------------------------------------
  📊 Precision: -0.2295
  📊 Recall:    0.1757
  📊 F1 Sco

Unnamed: 0,Question,Generated,Reference,Precision,Recall,F1
0,What is the recommended lubrication for the en...,"The oil should be a good grade, and not too thin.",Engine lubrication: BSA recommends using a mix...,0.2918,0.0269,0.1573
1,Where should an inexperienced owner consult fo...,A. The dealer is the only one who can perform ...,His B.S.A. dealer,-0.2295,0.1757,-0.0336
2,What is the recommended procedure for claiming...,The following should be observed:,Claim assistance through the dealer from whom ...,0.2247,0.2195,0.2233
3,What is the correct address of the B.S.A. Serv...,The following are some points to remember when...,"B.S.A. MOTOR CYCLES LIMITED, SERVICE DEPARTMEN...",0.0172,-0.3319,-0.1618
4,What is the recommended procedure for claiming...,The warranty period of 12 months or maximum fr...,The owner must do so through the dealer from w...,-0.0771,0.2164,0.0669
5,What is the recommended torque wrench setting ...,"10.5 N·m (96 kgf-m, 78 lb.-ft)",1 to 3,-0.324,0.2342,-0.0591



🧪 TESTING ALL PROMPT FORMATS

Test Question: What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supreme motorcycle?


Format: SIMPLE
----------------------------------------------------------------------
Answer: The oil should be a good grade, and not too thin.

Format: INSTRUCTION
----------------------------------------------------------------------
Answer: The oil should be a good grade, and not too thin.

Format: GEMMA
----------------------------------------------------------------------
Answer: ycle ignition system.

Format: CHAT
----------------------------------------------------------------------
Answer: How often should I change my oil and filter, or how many miles between changes if not specified in this section.

💡 RECOMMENDATION:
Look at the outputs above and choose the format that gives actual
answers (not repetitions). Then update PROMPT_FORMAT in Cell 5!


In [4]:
# ===================================================================
# IMPROVED BERT SCORE EVALUATION FOR 4-WHEELER MODEL (LEXUS)
# Using Optimized Settings from 2-Wheeler Testing
# ===================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q bert-score sentencepiece accelerate bitsandbytes peft transformers

# Note: After running Cell 1, restart runtime: Runtime → Restart runtime
# Then run cells 2-12 (skip cell 1 after restart)



# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import PeftModel  # ← Remove this line
from bert_score import score
import pandas as pd
import time


# ========== CELL 3: Define 4-Wheeler Dataset (Lexus) ==========
dataset = [
    {
        "question": "What is the purpose of the SRS airbags in the vehicle?",
        "answer": "The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection for the occupants of the vehicle."
    },
    {
        "question": "What is the function of the steering wheel?",
        "answer": "Adjusting the steering wheel"
    },
    {
        "question": "What is the procedure for connecting a Bluetooth audio player?",
        "answer": "Connecting a Bluetooth audio player involves selecting a Bluetooth device, registering the device, and then connecting it to the vehicle's Bluetooth system."
    },
    {
        "question": "If your vehicle overheats",
        "answer": "Check the coolant level and condition, and refer to the owner's manual for guidance on how to address the issue."
    },
    {
        "question": "What is the recommended approach for replacing genuine Lexus parts or accessories in the vehicle?",
        "answer": "Lexus recommends using genuine Lexus parts or accessories for replacement, but other parts or accessories of matching quality can also be used."
    },
    {
        "question": "What is the recommended procedure for removing and disposing of the SRS airbag and seat belt pretensioner devices from a Lexus vehicle before scrapping?",
        "answer": "Have the systems removed and disposed of by an authorized Lexus dealer or a duly qualified and equipped professional."
    }
]


# ========== CELL 4: Configure 4-Wheeler Model ==========
# CUSTOMIZE THIS: Change to your 4-wheeler model
adapter_name = "Prithwiraj731/FourWheeler-Gemma-2B"
base_model_name = "google/gemma-2-2b"

# Based on 2-wheeler testing, "simple" format worked best
PROMPT_FORMAT = "simple"

print("🔧 Model Configuration:")
print(f"   Adapter: {adapter_name}")
print(f"   Base Model: {base_model_name}")
print(f"   Prompt Format: {PROMPT_FORMAT}")


# ========== CELL 5: Configure 4-Wheeler Model ==========
adapter_name = "Prithwiraj731/FourWheeler-Gemma-2B"  # Full merged model
base_model_name = None  # Not needed for merged models

PROMPT_FORMAT = "simple"

print("🔧 Model Configuration:")
print(f"   Model: {adapter_name}")
print(f"   Type: Full merged model (not LoRA)")
print(f"   Prompt Format: {PROMPT_FORMAT}")


# ========== CELL 6: Load Model (FULL MODEL VERSION) ==========
print("\n📥 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

print("📥 Loading full model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load as a complete model (not base + adapter)
model = AutoModelForCausalLM.from_pretrained(
    adapter_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model loaded successfully!\n")

# ========== CELL 7: Generate Predictions ==========
print("=" * 70)
print("🚗 GENERATING ANSWERS (4-WHEELER LEXUS MODEL)")
print("=" * 70)

predictions = []
references = []
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\n📌 Question {i+1}/{len(dataset)}")
    print(f"Q: {question[:80]}...")

    start_time = time.time()
    prediction = generate_answer(question, temperature=0.1)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    print(f"✨ Generated: {prediction[:100]}...")
    print(f"📖 Reference: {reference[:100]}...")
    print(f"⏱️  Time: {gen_time:.2f}s")

    predictions.append(prediction)
    references.append(reference)

print(f"\n⏱️  Average generation time: {sum(generation_times)/len(generation_times):.2f}s")


# ========== CELL 8: Calculate BERT Score ==========
print("\n" + "=" * 70)
print("📊 CALCULATING BERT SCORES")
print("=" * 70)

P, R, F1 = score(
    predictions,
    references,
    lang="en",
    verbose=True,
    rescale_with_baseline=True
)


# ========== CELL 9: Display Detailed Results ==========
print("\n" + "=" * 70)
print("📈 BERT SCORE RESULTS (4-WHEELER LEXUS MODEL)")
print("=" * 70)

for i, item in enumerate(dataset):
    print(f"\n{'='*70}")
    print(f"Question {i+1}: {item['question'][:60]}...")
    print(f"{'-'*70}")
    print(f"Generated: {predictions[i][:120]}...")
    print(f"Reference: {references[i][:120]}...")
    print(f"{'-'*70}")
    print(f"  📊 Precision: {P[i].item():.4f}")
    print(f"  📊 Recall:    {R[i].item():.4f}")
    print(f"  📊 F1 Score:  {F1[i].item():.4f}")


# ========== CELL 10: Summary Statistics ==========
print("\n" + "=" * 70)
print("🎯 AVERAGE BERT SCORES (4-WHEELER MODEL)")
print("=" * 70)

avg_precision = P.mean().item()
avg_recall = R.mean().item()
avg_f1 = F1.mean().item()

print(f"\n  📊 Average Precision: {avg_precision:.4f} ({avg_precision*100:.2f}%)")
print(f"  📊 Average Recall:    {avg_recall:.4f} ({avg_recall*100:.2f}%)")
print(f"  📊 Average F1 Score:  {avg_f1:.4f} ({avg_f1*100:.2f}%)")

print("\n" + "=" * 70)
print("📖 SCORE INTERPRETATION")
print("=" * 70)
print("\nBERT Score Range: -1.0 (worst) to 1.0 (best)")
print("\nQuality Guide:")
print("  🟢 0.7 - 1.0  : Excellent")
print("  🟡 0.5 - 0.7  : Good")
print("  🟠 0.3 - 0.5  : Moderate")
print("  🔴 0.0 - 0.3  : Poor")
print("  ⚫ < 0.0       : Very poor")

if avg_f1 >= 0.7:
    status = "🟢 EXCELLENT"
elif avg_f1 >= 0.5:
    status = "🟡 GOOD"
elif avg_f1 >= 0.3:
    status = "🟠 MODERATE"
else:
    status = "🔴 NEEDS IMPROVEMENT"

print(f"\n4-Wheeler Model Status: {status}")


# ========== CELL 11: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [item['question'][:50] + '...' if len(item['question']) > 50 else item['question'] for item in dataset],
    'Generated': [p[:50] + '...' if len(p) > 50 else p for p in predictions],
    'Reference': [r[:50] + '...' if len(r) > 50 else r for r in references],
    'Precision': [f"{p.item():.4f}" for p in P],
    'Recall': [f"{r.item():.4f}" for r in R],
    'F1': [f"{f.item():.4f}" for f in F1]
})

print("\n" + "=" * 70)
print("📋 RESULTS SUMMARY TABLE")
print("=" * 70)
display(results_df)


# ========== CELL 12: Save Results (Optional) ==========
# Uncomment to save and download

# results_df.to_csv('bert_score_4wheeler_results.csv', index=False)
# print("\n✅ Results saved to 'bert_score_4wheeler_results.csv'")

# from google.colab import files
# files.download('bert_score_4wheeler_results.csv')


# ========== CELL 13: Compare with 2-Wheeler ==========
print("\n" + "=" * 70)
print("📊 MODEL COMPARISON")
print("=" * 70)
print("\n🏍️  2-Wheeler (BSA) Model BERT F1: 0.0321 (3.21%)")
print(f"🚗 4-Wheeler (Lexus) Model BERT F1: {avg_f1:.4f} ({avg_f1*100:.2f}%)")

if avg_f1 > 0.0321:
    diff = ((avg_f1 - 0.0321) / 0.0321) * 100
    print(f"\n✅ 4-Wheeler performs {diff:.1f}% BETTER than 2-Wheeler")
elif avg_f1 < 0.0321:
    diff = ((0.0321 - avg_f1) / 0.0321) * 100
    print(f"\n⚠️ 4-Wheeler performs {diff:.1f}% WORSE than 2-Wheeler")
else:
    print("\n➡️ Both models perform similarly")


# ========== CELL 14: Test Different Formats (Diagnostic) ==========
# Run this if results are poor to test other prompt formats

print("\n" + "=" * 70)
print("🧪 TESTING ALL PROMPT FORMATS")
print("=" * 70)

test_question = dataset[0]["question"]
formats_to_test = {
    "simple": f"{test_question}\n",
    "instruction": f"### Question:\n{test_question}\n\n### Answer:\n",
    "qa": f"Question: {test_question}\nAnswer:",
    "chat": f"User: {test_question}\nAssistant:"
}

print(f"\nTest Question: {test_question}\n")

for fmt_name, fmt_prompt in formats_to_test.items():
    inputs = tokenizer(fmt_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated[len(fmt_prompt):].strip().split('\n')[0]

    print(f"\n{'='*70}")
    print(f"Format: {fmt_name.upper()}")
    print(f"{'-'*70}")
    print(f"Answer: {answer[:150]}")

print("\n" + "=" * 70)
print("💡 If 'simple' format doesn't work well, update PROMPT_FORMAT in Cell 4")
print("=" * 70)



🔧 Model Configuration:
   Adapter: Prithwiraj731/FourWheeler-Gemma-2B
   Base Model: google/gemma-2-2b
   Prompt Format: simple
🔧 Model Configuration:
   Model: Prithwiraj731/FourWheeler-Gemma-2B
   Type: Full merged model (not LoRA)
   Prompt Format: simple

📥 Loading tokenizer...
📥 Loading full model with 4-bit quantization...


model.safetensors:   0%|          | 0.00/5.23G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

✅ Model loaded successfully!

🚗 GENERATING ANSWERS (4-WHEELER LEXUS MODEL)

📌 Question 1/6
Q: What is the purpose of the SRS airbags in the vehicle?...
✨ Generated: The SRS airbags are designed to deploy during a crash, providing additional protection for occupants...
📖 Reference: The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection ...
⏱️  Time: 1.12s

📌 Question 2/6
Q: What is the function of the steering wheel?...
✨ Generated: The...
📖 Reference: Adjusting the steering wheel...
⏱️  Time: 0.15s

📌 Question 3/6
Q: What is the procedure for connecting a Bluetooth audio player?...
✨ Generated: model...
📖 Reference: Connecting a Bluetooth audio player involves selecting a Bluetooth device, registering the device, a...
⏱️  Time: 1.42s

📌 Question 4/6
Q: If your vehicle overheats...
✨ Generated: model...
📖 Reference: Check the coolant level and condition, and refer to the owner's manual for guidance on how to addres...
⏱️  Time: 0.91s

📌 Quest

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.13 seconds, 46.72 sentences/sec

📈 BERT SCORE RESULTS (4-WHEELER LEXUS MODEL)

Question 1: What is the purpose of the SRS airbags in the vehicle?...
----------------------------------------------------------------------
Generated: The SRS airbags are designed to deploy during a crash, providing additional protection for occupants....
Reference: The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection for the occupants of...
----------------------------------------------------------------------
  📊 Precision: 0.8745
  📊 Recall:    0.6907
  📊 F1 Score:  0.7815

Question 2: What is the function of the steering wheel?...
----------------------------------------------------------------------
Generated: The...
Reference: Adjusting the steering wheel...
----------------------------------------------------------------------
  📊 Precision: 0.9992
  📊 Recall:    0.2357
  📊 F1 Score:  0.5918

Question 3: What is the procedure for connecting a B

Unnamed: 0,Question,Generated,Reference,Precision,Recall,F1
0,What is the purpose of the SRS airbags in the ...,The SRS airbags are designed to deploy during ...,The SRS airbags are designed to deploy in the ...,0.8745,0.6907,0.7815
1,What is the function of the steering wheel?,The,Adjusting the steering wheel,0.9992,0.2357,0.5918
2,What is the procedure for connecting a Bluetoo...,model,Connecting a Bluetooth audio player involves s...,0.0523,-0.1905,-0.0704
3,If your vehicle overheats,model,"Check the coolant level and condition, and ref...",0.0466,-0.1725,-0.0637
4,What is the recommended approach for replacing...,The only way to replace a part with an origina...,Lexus recommends using genuine Lexus parts or ...,0.109,0.1149,0.1134
5,What is the recommended procedure for removing...,The removal process should be carried out in a...,Have the systems removed and disposed of by an...,0.1575,0.0709,0.1153



📊 MODEL COMPARISON

🏍️  2-Wheeler (BSA) Model BERT F1: 0.0321 (3.21%)
🚗 4-Wheeler (Lexus) Model BERT F1: 0.2447 (24.47%)

✅ 4-Wheeler performs 662.2% BETTER than 2-Wheeler

🧪 TESTING ALL PROMPT FORMATS

Test Question: What is the purpose of the SRS airbags in the vehicle?


Format: SIMPLE
----------------------------------------------------------------------
Answer: The SRS airbags are designed to deploy during a crash, providing additional protection for occupants.

Format: INSTRUCTION
----------------------------------------------------------------------
Answer: The SRS

Format: QA
----------------------------------------------------------------------
Answer: The SRS airbags are designed to deploy during a crash, providing additional protection for occupants.

Format: CHAT
----------------------------------------------------------------------
Answer: The

💡 If 'simple' format doesn't work well, update PROMPT_FORMAT in Cell 4


In [5]:
!pip install -q accelerate bitsandbytes peft transformers

In [1]:
# ==============================================================================
# TASK ACCURACY EVALUATION - TWO-WHEELER MODEL (BSA)
# Metrics: Exact Match, Partial Match, Keyword Score
# ==============================================================================

# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import re
import time


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supreme motorcycle?",
        "answer": "Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for the engine of the BSA D14/4 Bantam Supreme motorcycle."
    },
    {
        "question": "Where should an inexperienced owner consult for assistance with major repair work?",
        "answer": "His B.S.A. dealer"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the B.S.A. guarantee?",
        "answer": "Claim assistance through the dealer from whom the motorcycle was purchased."
    },
    {
        "question": "What is the correct address of the B.S.A. Service Department?",
        "answer": "B.S.A. MOTOR CYCLES LIMITED, SERVICE DEPARTMENT, ARMOURY ROAD, BIRMINGHAM 11"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the guarantee for a new motorcycle?",
        "answer": "The owner must do so through the dealer from whom the machine was purchased."
    },
    {
        "question": "What is the recommended torque wrench setting for the Supreme model?",
        "answer": "1 to 3"
    }
]


# ========== CELL 4: Define Accuracy Metrics Functions ==========
def normalize_text(text):
    """Normalize text for comparison by lowercasing and removing punctuation"""
    text = text.lower().strip()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def exact_match(prediction, reference):
    """Check if prediction exactly matches reference after normalization"""
    return normalize_text(prediction) == normalize_text(reference)

def partial_match(prediction, reference, threshold=0.3):
    """Check if prediction contains at least threshold% of reference words"""
    pred_words = set(normalize_text(prediction).split())
    ref_words = set(normalize_text(reference).split())
    if len(ref_words) == 0:
        return False
    overlap = len(pred_words & ref_words) / len(ref_words)
    return overlap >= threshold

def keyword_match(prediction, reference):
    """Calculate percentage of non-stopword keywords from reference in prediction"""
    ref_words = set(normalize_text(reference).split())
    pred_words = set(normalize_text(prediction).split())

    stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                 'to', 'of', 'and', 'or', 'for', 'in', 'on', 'at', 'by', 'with'}

    ref_keywords = ref_words - stopwords
    pred_keywords = pred_words - stopwords

    if len(ref_keywords) == 0:
        return 0.0

    return len(pred_keywords & ref_keywords) / len(ref_keywords)


# ========== CELL 5: Configure Model ==========
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"
base_model_name = "google/gemma-2-2b"

print("Model Configuration:")
print(f"  Adapter: {adapter_name}")
print(f"  Base Model: {base_model_name}")


# ========== CELL 6: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

print("Loading base model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 7: Define Answer Generation Function ==========
def generate_answer(question, max_new_tokens=100):
    """Generate answer using optimized settings"""
    prompt = f"{question}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    return answer


# ========== CELL 8: Generate Predictions and Calculate Metrics ==========
print("="*70)
print("GENERATING ANSWERS AND CALCULATING ACCURACY METRICS")
print("="*70)

results = []
exact_matches = 0
partial_matches = 0
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:70]}...")

    start_time = time.time()
    prediction = generate_answer(question)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    # Calculate metrics
    is_exact = exact_match(prediction, reference)
    is_partial = partial_match(prediction, reference, threshold=0.3)
    keyword_score = keyword_match(prediction, reference)

    if is_exact:
        exact_matches += 1
    if is_partial:
        partial_matches += 1

    results.append({
        'question': question,
        'reference': reference,
        'prediction': prediction,
        'exact_match': is_exact,
        'partial_match': is_partial,
        'keyword_score': keyword_score
    })

    print(f"Generated: {prediction[:70]}...")
    print(f"Reference: {reference[:70]}...")
    print(f"Exact Match: {'YES' if is_exact else 'NO'}")
    print(f"Partial Match (30%): {'YES' if is_partial else 'NO'}")
    print(f"Keyword Score: {keyword_score:.1%}")
    print(f"Time: {gen_time:.2f}s")

avg_gen_time = sum(generation_times) / len(generation_times)
print(f"\nAverage generation time: {avg_gen_time:.2f}s")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("TWO-WHEELER MODEL - TASK ACCURACY RESULTS")
print("="*70)

total = len(dataset)
exact_accuracy = exact_matches / total * 100
partial_accuracy = partial_matches / total * 100
avg_keyword_score = sum(r['keyword_score'] for r in results) / total * 100

print(f"\nSummary Metrics:")
print(f"  Exact Match Accuracy:   {exact_matches}/{total} = {exact_accuracy:.1f}%")
print(f"  Partial Match Accuracy: {partial_matches}/{total} = {partial_accuracy:.1f}%")
print(f"  Average Keyword Score:  {avg_keyword_score:.1f}%")

print("\nMetric Definitions:")
print("  - Exact Match: Prediction matches reference exactly (after normalization)")
print("  - Partial Match: At least 30% of reference words appear in prediction")
print("  - Keyword Score: Percentage of important words (non-stopwords) matched")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:45] + '...' if len(r['question']) > 45 else r['question'] for r in results],
    'Prediction': [r['prediction'][:45] + '...' if len(r['prediction']) > 45 else r['prediction'] for r in results],
    'Exact': ['YES' if r['exact_match'] else 'NO' for r in results],
    'Partial': ['YES' if r['partial_match'] else 'NO' for r in results],
    'Keyword%': [f"{r['keyword_score']*100:.1f}%" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('task_accuracy_2wheeler_results.csv', index=False)
# print("\nResults saved to 'task_accuracy_2wheeler_results.csv'")

# from google.colab import files
# files.download('task_accuracy_2wheeler_results.csv')



Model Configuration:
  Adapter: Prithwiraj731/Gemma2-2b_Two-Wheeler
  Base Model: google/gemma-2-2b

Loading tokenizer...
Loading base model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Loading LoRA adapter...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded successfully

GENERATING ANSWERS AND CALCULATING ACCURACY METRICS

Question 1/6
Q: What is the recommended lubrication for the engine of the BSA D14/4 Ba...
Generated: The oil should be a good grade, and not too thin....
Reference: Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with...
Exact Match: NO
Partial Match (30%): NO
Keyword Score: 7.1%
Time: 2.89s

Question 2/6
Q: Where should an inexperienced owner consult for assistance with major ...
Generated: A. The dealer is the only one who can perform all repairs and service ...
Reference: His B.S.A. dealer...
Exact Match: NO
Partial Match (30%): YES
Keyword Score: 66.7%
Time: 5.37s

Question 3/6
Q: What is the recommended procedure for claiming assistance under the B....
Generated: The following should be observed:...
Reference: Claim assistance through the dealer from whom the motorcycle was purch...
Exact Match: NO
Partial Match (30%): NO
Keyword Score: 0.0%
Time: 6.98s

Question 4/6
Q: What is the co

Unnamed: 0,Question,Prediction,Exact,Partial,Keyword%
0,What is the recommended lubrication for the e...,"The oil should be a good grade, and not too t...",NO,NO,7.1%
1,Where should an inexperienced owner consult f...,A. The dealer is the only one who can perform...,NO,YES,66.7%
2,What is the recommended procedure for claimin...,The following should be observed:,NO,NO,0.0%
3,What is the correct address of the B.S.A. Ser...,The following are some points to remember whe...,NO,NO,0.0%
4,What is the recommended procedure for claimin...,The warranty period of 12 months or maximum f...,NO,NO,10.0%
5,What is the recommended torque wrench setting...,"10.5 N·m (96 kgf-m, 78 lb.-ft)",NO,NO,0.0%


In [2]:
# ==============================================================================
# TASK ACCURACY EVALUATION - FOUR-WHEELER MODEL (LEXUS)
# Metrics: Exact Match, Partial Match, Keyword Score
# ==============================================================================

# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import re
import time


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the purpose of the SRS airbags in the vehicle?",
        "answer": "The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection for the occupants of the vehicle."
    },
    {
        "question": "What is the function of the steering wheel?",
        "answer": "Adjusting the steering wheel"
    },
    {
        "question": "What is the procedure for connecting a Bluetooth audio player?",
        "answer": "Connecting a Bluetooth audio player involves selecting a Bluetooth device, registering the device, and then connecting it to the vehicle's Bluetooth system."
    },
    {
        "question": "If your vehicle overheats",
        "answer": "Check the coolant level and condition, and refer to the owner's manual for guidance on how to address the issue."
    },
    {
        "question": "What is the recommended approach for replacing genuine Lexus parts or accessories in the vehicle?",
        "answer": "Lexus recommends using genuine Lexus parts or accessories for replacement, but other parts or accessories of matching quality can also be used."
    },
    {
        "question": "What is the recommended procedure for removing and disposing of the SRS airbag and seat belt pretensioner devices from a Lexus vehicle before scrapping?",
        "answer": "Have the systems removed and disposed of by an authorized Lexus dealer or a duly qualified and equipped professional."
    }
]


# ========== CELL 4: Define Accuracy Metrics Functions ==========
def normalize_text(text):
    """Normalize text for comparison by lowercasing and removing punctuation"""
    text = text.lower().strip()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def exact_match(prediction, reference):
    """Check if prediction exactly matches reference after normalization"""
    return normalize_text(prediction) == normalize_text(reference)

def partial_match(prediction, reference, threshold=0.3):
    """Check if prediction contains at least threshold% of reference words"""
    pred_words = set(normalize_text(prediction).split())
    ref_words = set(normalize_text(reference).split())
    if len(ref_words) == 0:
        return False
    overlap = len(pred_words & ref_words) / len(ref_words)
    return overlap >= threshold

def keyword_match(prediction, reference):
    """Calculate percentage of non-stopword keywords from reference in prediction"""
    ref_words = set(normalize_text(reference).split())
    pred_words = set(normalize_text(prediction).split())

    stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                 'to', 'of', 'and', 'or', 'for', 'in', 'on', 'at', 'by', 'with'}

    ref_keywords = ref_words - stopwords
    pred_keywords = pred_words - stopwords

    if len(ref_keywords) == 0:
        return 0.0

    return len(pred_keywords & ref_keywords) / len(ref_keywords)


# ========== CELL 5: Configure Model ==========
model_name = "Prithwiraj731/FourWheeler-Gemma-2B"

print("Model Configuration:")
print(f"  Model: {model_name}")
print(f"  Type: Full merged model")


# ========== CELL 6: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading full model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 7: Define Answer Generation Function ==========
def generate_answer(question, max_new_tokens=100):
    """Generate answer using optimized settings"""
    prompt = f"{question}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    return answer


# ========== CELL 8: Generate Predictions and Calculate Metrics ==========
print("="*70)
print("GENERATING ANSWERS AND CALCULATING ACCURACY METRICS")
print("="*70)

results = []
exact_matches = 0
partial_matches = 0
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:70]}...")

    start_time = time.time()
    prediction = generate_answer(question)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    # Calculate metrics
    is_exact = exact_match(prediction, reference)
    is_partial = partial_match(prediction, reference, threshold=0.3)
    keyword_score = keyword_match(prediction, reference)

    if is_exact:
        exact_matches += 1
    if is_partial:
        partial_matches += 1

    results.append({
        'question': question,
        'reference': reference,
        'prediction': prediction,
        'exact_match': is_exact,
        'partial_match': is_partial,
        'keyword_score': keyword_score
    })

    print(f"Generated: {prediction[:70]}...")
    print(f"Reference: {reference[:70]}...")
    print(f"Exact Match: {'YES' if is_exact else 'NO'}")
    print(f"Partial Match (30%): {'YES' if is_partial else 'NO'}")
    print(f"Keyword Score: {keyword_score:.1%}")
    print(f"Time: {gen_time:.2f}s")

avg_gen_time = sum(generation_times) / len(generation_times)
print(f"\nAverage generation time: {avg_gen_time:.2f}s")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("FOUR-WHEELER MODEL - TASK ACCURACY RESULTS")
print("="*70)

total = len(dataset)
exact_accuracy = exact_matches / total * 100
partial_accuracy = partial_matches / total * 100
avg_keyword_score = sum(r['keyword_score'] for r in results) / total * 100

print(f"\nSummary Metrics:")
print(f"  Exact Match Accuracy:   {exact_matches}/{total} = {exact_accuracy:.1f}%")
print(f"  Partial Match Accuracy: {partial_matches}/{total} = {partial_accuracy:.1f}%")
print(f"  Average Keyword Score:  {avg_keyword_score:.1f}%")

print("\nMetric Definitions:")
print("  - Exact Match: Prediction matches reference exactly (after normalization)")
print("  - Partial Match: At least 30% of reference words appear in prediction")
print("  - Keyword Score: Percentage of important words (non-stopwords) matched")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:45] + '...' if len(r['question']) > 45 else r['question'] for r in results],
    'Prediction': [r['prediction'][:45] + '...' if len(r['prediction']) > 45 else r['prediction'] for r in results],
    'Exact': ['YES' if r['exact_match'] else 'NO' for r in results],
    'Partial': ['YES' if r['partial_match'] else 'NO' for r in results],
    'Keyword%': [f"{r['keyword_score']*100:.1f}%" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('task_accuracy_4wheeler_results.csv', index=False)
# print("\nResults saved to 'task_accuracy_4wheeler_results.csv'")

# from google.colab import files
# files.download('task_accuracy_4wheeler_results.csv')



Model Configuration:
  Model: Prithwiraj731/FourWheeler-Gemma-2B
  Type: Full merged model

Loading tokenizer...
Loading full model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Model loaded successfully

GENERATING ANSWERS AND CALCULATING ACCURACY METRICS

Question 1/6
Q: What is the purpose of the SRS airbags in the vehicle?...
Generated: The SRS airbags are designed to deploy during a crash, providing addit...
Reference: The SRS airbags are designed to deploy in the event of a crash or sudd...
Exact Match: NO
Partial Match (30%): YES
Keyword Score: 66.7%
Time: 1.33s

Question 2/6
Q: What is the function of the steering wheel?...
Generated: The...
Reference: Adjusting the steering wheel...
Exact Match: NO
Partial Match (30%): NO
Keyword Score: 0.0%
Time: 0.18s

Question 3/6
Q: What is the procedure for connecting a Bluetooth audio player?...
Generated: model...
Reference: Connecting a Bluetooth audio player involves selecting a Bluetooth dev...
Exact Match: NO
Partial Match (30%): NO
Keyword Score: 0.0%
Time: 1.73s

Question 4/6
Q: If your vehicle overheats...
Generated: model...
Reference: Check the coolant level and condition, and refer to the owner's manu

Unnamed: 0,Question,Prediction,Exact,Partial,Keyword%
0,What is the purpose of the SRS airbags in the...,The SRS airbags are designed to deploy during...,NO,YES,66.7%
1,What is the function of the steering wheel?,The,NO,NO,0.0%
2,What is the procedure for connecting a Blueto...,model,NO,NO,0.0%
3,If your vehicle overheats,model,NO,NO,0.0%
4,What is the recommended approach for replacin...,The only way to replace a part with an origin...,NO,NO,7.1%
5,What is the recommended procedure for removin...,The removal process should be carried out in ...,NO,NO,0.0%


In [3]:
# ==============================================================================
# BLEU AND ROUGE-L EVALUATION - TWO-WHEELER MODEL (BSA)
# Metrics: BLEU-1, BLEU-2, BLEU-4, ROUGE-L
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes peft transformers rouge-score nltk

# Note: After running Cell 1, restart runtime then run cells 2-10


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import time

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supreme motorcycle?",
        "answer": "Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for the engine of the BSA D14/4 Bantam Supreme motorcycle."
    },
    {
        "question": "Where should an inexperienced owner consult for assistance with major repair work?",
        "answer": "His B.S.A. dealer"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the B.S.A. guarantee?",
        "answer": "Claim assistance through the dealer from whom the motorcycle was purchased."
    },
    {
        "question": "What is the correct address of the B.S.A. Service Department?",
        "answer": "B.S.A. MOTOR CYCLES LIMITED, SERVICE DEPARTMENT, ARMOURY ROAD, BIRMINGHAM 11"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the guarantee for a new motorcycle?",
        "answer": "The owner must do so through the dealer from whom the machine was purchased."
    },
    {
        "question": "What is the recommended torque wrench setting for the Supreme model?",
        "answer": "1 to 3"
    }
]


# ========== CELL 4: Define BLEU and ROUGE Functions ==========
def calculate_bleu(prediction, reference):
    """Calculate BLEU-1, BLEU-2, and BLEU-4 scores"""
    smoothie = SmoothingFunction().method4
    reference_tokens = [reference.lower().split()]
    prediction_tokens = prediction.lower().split()

    if len(prediction_tokens) == 0:
        return 0.0, 0.0, 0.0

    bleu1 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu2 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu4 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    return bleu1, bleu2, bleu4

def calculate_rouge(prediction, reference):
    """Calculate ROUGE-L F-measure score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    return scores['rougeL'].fmeasure


# ========== CELL 5: Configure Model ==========
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"
base_model_name = "google/gemma-2-2b"

print("Model Configuration:")
print(f"  Adapter: {adapter_name}")
print(f"  Base Model: {base_model_name}")


# ========== CELL 6: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

print("Loading base model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 7: Define Answer Generation Function ==========
def generate_answer(question, max_new_tokens=100):
    """Generate answer using optimized settings"""
    prompt = f"{question}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    return answer


# ========== CELL 8: Generate Predictions and Calculate Metrics ==========
print("="*70)
print("GENERATING ANSWERS AND CALCULATING BLEU/ROUGE METRICS")
print("="*70)

results = []
all_bleu1, all_bleu2, all_bleu4, all_rouge = [], [], [], []
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:70]}...")

    start_time = time.time()
    prediction = generate_answer(question)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    # Calculate metrics
    bleu1, bleu2, bleu4 = calculate_bleu(prediction, reference)
    rouge_l = calculate_rouge(prediction, reference)

    all_bleu1.append(bleu1)
    all_bleu2.append(bleu2)
    all_bleu4.append(bleu4)
    all_rouge.append(rouge_l)

    results.append({
        'question': question,
        'reference': reference,
        'prediction': prediction,
        'bleu1': bleu1,
        'bleu2': bleu2,
        'bleu4': bleu4,
        'rouge_l': rouge_l
    })

    print(f"Generated: {prediction[:70]}...")
    print(f"Reference: {reference[:70]}...")
    print(f"BLEU-1: {bleu1:.4f} | BLEU-2: {bleu2:.4f} | BLEU-4: {bleu4:.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"Time: {gen_time:.2f}s")

avg_gen_time = sum(generation_times) / len(generation_times)
print(f"\nAverage generation time: {avg_gen_time:.2f}s")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("TWO-WHEELER MODEL - BLEU/ROUGE-L RESULTS")
print("="*70)

avg_bleu1 = sum(all_bleu1) / len(all_bleu1)
avg_bleu2 = sum(all_bleu2) / len(all_bleu2)
avg_bleu4 = sum(all_bleu4) / len(all_bleu4)
avg_rouge = sum(all_rouge) / len(all_rouge)

print(f"\nAverage Scores:")
print(f"  BLEU-1:  {avg_bleu1:.4f} ({avg_bleu1*100:.2f}%)")
print(f"  BLEU-2:  {avg_bleu2:.4f} ({avg_bleu2*100:.2f}%)")
print(f"  BLEU-4:  {avg_bleu4:.4f} ({avg_bleu4*100:.2f}%)")
print(f"  ROUGE-L: {avg_rouge:.4f} ({avg_rouge*100:.2f}%)")

print("\nMetric Definitions:")
print("  - BLEU-1: Unigram precision (individual word matches)")
print("  - BLEU-2: Bigram precision (2-word phrase matches)")
print("  - BLEU-4: 4-gram precision (4-word phrase matches)")
print("  - ROUGE-L: Longest common subsequence F-measure")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:40] + '...' if len(r['question']) > 40 else r['question'] for r in results],
    'BLEU-1': [f"{r['bleu1']:.4f}" for r in results],
    'BLEU-2': [f"{r['bleu2']:.4f}" for r in results],
    'BLEU-4': [f"{r['bleu4']:.4f}" for r in results],
    'ROUGE-L': [f"{r['rouge_l']:.4f}" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('bleu_rouge_2wheeler_results.csv', index=False)
# print("\nResults saved to 'bleu_rouge_2wheeler_results.csv'")

# from google.colab import files
# files.download('bleu_rouge_2wheeler_results.csv')



  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Model Configuration:
  Adapter: Prithwiraj731/Gemma2-2b_Two-Wheeler
  Base Model: google/gemma-2-2b

Loading tokenizer...
Loading base model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Loading LoRA adapter...
Model loaded successfully

GENERATING ANSWERS AND CALCULATING BLEU/ROUGE METRICS

Question 1/6
Q: What is the recommended lubrication for the engine of the BSA D14/4 Ba...
Generated: The oil should be a good grade, and not too thin....
Reference: Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with...
BLEU-1: 0.0465 | BLEU-2: 0.0169 | BLEU-4: 0.0066
ROUGE-L: 0.1000
Time: 1.49s

Question 2/6
Q: Where should an inexperienced owner consult for assistance with major ...
Generated: A. The dealer is the only one who can perform all repairs and service ...
Reference: His B.S.A. dealer...
BLEU-1: 0.0444 | BLEU-2: 0.0196 | BLEU-4: 0.0079
ROUGE-L: 0.0800
Time: 5.88s

Question 3/6
Q: What is the recommended procedure for claiming assistance under the B....
Generated: The following should be observed:...
Reference: Claim assistance through the dealer from whom the motorcycle was purch...
BLEU-1: 0.0602 | BLEU-2: 0.0270 | BLEU-4: 0.0137
ROUGE-L: 0.1250
Time

Unnamed: 0,Question,BLEU-1,BLEU-2,BLEU-4,ROUGE-L
0,What is the recommended lubrication for ...,0.0465,0.0169,0.0066,0.1
1,Where should an inexperienced owner cons...,0.0444,0.0196,0.0079,0.08
2,What is the recommended procedure for cl...,0.0602,0.027,0.0137,0.125
3,What is the correct address of the B.S.A...,0.0,0.0,0.0,0.087
4,What is the recommended procedure for cl...,0.1333,0.0508,0.0197,0.1379
5,What is the recommended torque wrench se...,0.0,0.0,0.0,0.0


In [4]:
# ==============================================================================
# BLEU AND ROUGE-L EVALUATION - FOUR-WHEELER MODEL (LEXUS)
# Metrics: BLEU-1, BLEU-2, BLEU-4, ROUGE-L
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes transformers rouge-score nltk

# Note: After running Cell 1, restart runtime then run cells 2-10


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import time

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the purpose of the SRS airbags in the vehicle?",
        "answer": "The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection for the occupants of the vehicle."
    },
    {
        "question": "What is the function of the steering wheel?",
        "answer": "Adjusting the steering wheel"
    },
    {
        "question": "What is the procedure for connecting a Bluetooth audio player?",
        "answer": "Connecting a Bluetooth audio player involves selecting a Bluetooth device, registering the device, and then connecting it to the vehicle's Bluetooth system."
    },
    {
        "question": "If your vehicle overheats",
        "answer": "Check the coolant level and condition, and refer to the owner's manual for guidance on how to address the issue."
    },
    {
        "question": "What is the recommended approach for replacing genuine Lexus parts or accessories in the vehicle?",
        "answer": "Lexus recommends using genuine Lexus parts or accessories for replacement, but other parts or accessories of matching quality can also be used."
    },
    {
        "question": "What is the recommended procedure for removing and disposing of the SRS airbag and seat belt pretensioner devices from a Lexus vehicle before scrapping?",
        "answer": "Have the systems removed and disposed of by an authorized Lexus dealer or a duly qualified and equipped professional."
    }
]


# ========== CELL 4: Define BLEU and ROUGE Functions ==========
def calculate_bleu(prediction, reference):
    """Calculate BLEU-1, BLEU-2, and BLEU-4 scores"""
    smoothie = SmoothingFunction().method4
    reference_tokens = [reference.lower().split()]
    prediction_tokens = prediction.lower().split()

    if len(prediction_tokens) == 0:
        return 0.0, 0.0, 0.0

    bleu1 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu2 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu4 = sentence_bleu(reference_tokens, prediction_tokens,
                         weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    return bleu1, bleu2, bleu4

def calculate_rouge(prediction, reference):
    """Calculate ROUGE-L F-measure score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    return scores['rougeL'].fmeasure


# ========== CELL 5: Configure Model ==========
model_name = "Prithwiraj731/FourWheeler-Gemma-2B"

print("Model Configuration:")
print(f"  Model: {model_name}")
print(f"  Type: Full merged model")


# ========== CELL 6: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading full model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 7: Define Answer Generation Function ==========
def generate_answer(question, max_new_tokens=100):
    """Generate answer using optimized settings"""
    prompt = f"{question}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    return answer


# ========== CELL 8: Generate Predictions and Calculate Metrics ==========
print("="*70)
print("GENERATING ANSWERS AND CALCULATING BLEU/ROUGE METRICS")
print("="*70)

results = []
all_bleu1, all_bleu2, all_bleu4, all_rouge = [], [], [], []
generation_times = []

for i, item in enumerate(dataset):
    question = item["question"]
    reference = item["answer"]

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:70]}...")

    start_time = time.time()
    prediction = generate_answer(question)
    gen_time = time.time() - start_time
    generation_times.append(gen_time)

    # Calculate metrics
    bleu1, bleu2, bleu4 = calculate_bleu(prediction, reference)
    rouge_l = calculate_rouge(prediction, reference)

    all_bleu1.append(bleu1)
    all_bleu2.append(bleu2)
    all_bleu4.append(bleu4)
    all_rouge.append(rouge_l)

    results.append({
        'question': question,
        'reference': reference,
        'prediction': prediction,
        'bleu1': bleu1,
        'bleu2': bleu2,
        'bleu4': bleu4,
        'rouge_l': rouge_l
    })

    print(f"Generated: {prediction[:70]}...")
    print(f"Reference: {reference[:70]}...")
    print(f"BLEU-1: {bleu1:.4f} | BLEU-2: {bleu2:.4f} | BLEU-4: {bleu4:.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"Time: {gen_time:.2f}s")

avg_gen_time = sum(generation_times) / len(generation_times)
print(f"\nAverage generation time: {avg_gen_time:.2f}s")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("FOUR-WHEELER MODEL - BLEU/ROUGE-L RESULTS")
print("="*70)

avg_bleu1 = sum(all_bleu1) / len(all_bleu1)
avg_bleu2 = sum(all_bleu2) / len(all_bleu2)
avg_bleu4 = sum(all_bleu4) / len(all_bleu4)
avg_rouge = sum(all_rouge) / len(all_rouge)

print(f"\nAverage Scores:")
print(f"  BLEU-1:  {avg_bleu1:.4f} ({avg_bleu1*100:.2f}%)")
print(f"  BLEU-2:  {avg_bleu2:.4f} ({avg_bleu2*100:.2f}%)")
print(f"  BLEU-4:  {avg_bleu4:.4f} ({avg_bleu4*100:.2f}%)")
print(f"  ROUGE-L: {avg_rouge:.4f} ({avg_rouge*100:.2f}%)")

print("\nMetric Definitions:")
print("  - BLEU-1: Unigram precision (individual word matches)")
print("  - BLEU-2: Bigram precision (2-word phrase matches)")
print("  - BLEU-4: 4-gram precision (4-word phrase matches)")
print("  - ROUGE-L: Longest common subsequence F-measure")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:40] + '...' if len(r['question']) > 40 else r['question'] for r in results],
    'BLEU-1': [f"{r['bleu1']:.4f}" for r in results],
    'BLEU-2': [f"{r['bleu2']:.4f}" for r in results],
    'BLEU-4': [f"{r['bleu4']:.4f}" for r in results],
    'ROUGE-L': [f"{r['rouge_l']:.4f}" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('bleu_rouge_4wheeler_results.csv', index=False)
# print("\nResults saved to 'bleu_rouge_4wheeler_results.csv'")

# from google.colab import files
# files.download('bleu_rouge_4wheeler_results.csv')



Model Configuration:
  Model: Prithwiraj731/FourWheeler-Gemma-2B
  Type: Full merged model

Loading tokenizer...
Loading full model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Model loaded successfully

GENERATING ANSWERS AND CALCULATING BLEU/ROUGE METRICS

Question 1/6
Q: What is the purpose of the SRS airbags in the vehicle?...
Generated: The SRS airbags are designed to deploy during a crash, providing addit...
Reference: The SRS airbags are designed to deploy in the event of a crash or sudd...
BLEU-1: 0.4025 | BLEU-2: 0.3323 | BLEU-4: 0.2555
ROUGE-L: 0.6667
Time: 1.40s

Question 2/6
Q: What is the function of the steering wheel?...
Generated: The...
Reference: Adjusting the steering wheel...
BLEU-1: 0.0498 | BLEU-2: 0.0498 | BLEU-4: 0.0498
ROUGE-L: 0.4000
Time: 0.16s

Question 3/6
Q: What is the procedure for connecting a Bluetooth audio player?...
Generated: model...
Reference: Connecting a Bluetooth audio player involves selecting a Bluetooth dev...
BLEU-1: 0.0000 | BLEU-2: 0.0000 | BLEU-4: 0.0000
ROUGE-L: 0.0000
Time: 1.45s

Question 4/6
Q: If your vehicle overheats...
Generated: model...
Reference: Check the coolant level and condition, and refer to t

Unnamed: 0,Question,BLEU-1,BLEU-2,BLEU-4,ROUGE-L
0,What is the purpose of the SRS airbags i...,0.4025,0.3323,0.2555,0.6667
1,What is the function of the steering whe...,0.0498,0.0498,0.0498,0.4
2,What is the procedure for connecting a B...,0.0,0.0,0.0,0.0
3,If your vehicle overheats,0.0,0.0,0.0,0.0
4,What is the recommended approach for rep...,0.0,0.0,0.0,0.1143
5,What is the recommended procedure for re...,0.0526,0.0293,0.0136,0.1053


In [5]:
# ==============================================================================
# INFERENCE LATENCY EVALUATION - TWO-WHEELER MODEL (BSA)
# Metrics: Tokenization, Inference, Decoding Time, Throughput
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes peft transformers

# Note: After running Cell 1, restart runtime then run cells 2-9


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import time
import numpy as np


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the recommended lubrication for the engine of the BSA D14/4 Bantam Supreme motorcycle?",
        "answer": "Engine lubrication: BSA recommends using a mixture of 10W-30 oil, with a minimum of 10W-40 oil, for the engine of the BSA D14/4 Bantam Supreme motorcycle."
    },
    {
        "question": "Where should an inexperienced owner consult for assistance with major repair work?",
        "answer": "His B.S.A. dealer"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the B.S.A. guarantee?",
        "answer": "Claim assistance through the dealer from whom the motorcycle was purchased."
    },
    {
        "question": "What is the correct address of the B.S.A. Service Department?",
        "answer": "B.S.A. MOTOR CYCLES LIMITED, SERVICE DEPARTMENT, ARMOURY ROAD, BIRMINGHAM 11"
    },
    {
        "question": "What is the recommended procedure for claiming assistance under the guarantee for a new motorcycle?",
        "answer": "The owner must do so through the dealer from whom the machine was purchased."
    },
    {
        "question": "What is the recommended torque wrench setting for the Supreme model?",
        "answer": "1 to 3"
    }
]


# ========== CELL 4: Configure Model ==========
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"
base_model_name = "google/gemma-2-2b"

print("Model Configuration:")
print(f"  Adapter: {adapter_name}")
print(f"  Base Model: {base_model_name}")


# ========== CELL 5: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

print("Loading base model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 6: Define Answer Generation with Timing ==========
def generate_answer_with_timing(question, max_new_tokens=100):
    """Generate answer and measure latency at each stage"""
    prompt = f"{question}\n"

    # Tokenization time
    tok_start = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    tok_time = time.perf_counter() - tok_start

    input_tokens = inputs['input_ids'].shape[1]

    # Inference time
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    inf_start = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    torch.cuda.synchronize() if torch.cuda.is_available() else None
    inf_time = time.perf_counter() - inf_start

    output_tokens = outputs.shape[1]
    new_tokens = output_tokens - input_tokens

    # Decoding time
    dec_start = time.perf_counter()
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    dec_time = time.perf_counter() - dec_start

    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    total_time = tok_time + inf_time + dec_time
    tokens_per_sec = new_tokens / inf_time if inf_time > 0 else 0

    return {
        'answer': answer,
        'input_tokens': input_tokens,
        'output_tokens': new_tokens,
        'tokenization_ms': tok_time * 1000,
        'inference_ms': inf_time * 1000,
        'decoding_ms': dec_time * 1000,
        'total_ms': total_time * 1000,
        'tokens_per_sec': tokens_per_sec
    }


# ========== CELL 7: Warmup Run ==========
print("Warming up model...")
_ = generate_answer_with_timing("Test question")
print("Warmup complete\n")


# ========== CELL 8: Run Latency Tests ==========
print("="*70)
print("MEASURING INFERENCE LATENCY")
print("="*70)

results = []

for i, item in enumerate(dataset):
    question = item["question"]

    timing = generate_answer_with_timing(question)

    results.append({
        'question': question,
        **timing
    })

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:60]}...")
    print(f"A: {timing['answer'][:60]}...")
    print(f"Input tokens:  {timing['input_tokens']}")
    print(f"Output tokens: {timing['output_tokens']}")
    print(f"Tokenization:  {timing['tokenization_ms']:.2f} ms")
    print(f"Inference:     {timing['inference_ms']:.2f} ms")
    print(f"Decoding:      {timing['decoding_ms']:.2f} ms")
    print(f"Total:         {timing['total_ms']:.2f} ms")
    print(f"Throughput:    {timing['tokens_per_sec']:.2f} tokens/sec")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("TWO-WHEELER MODEL - INFERENCE LATENCY RESULTS")
print("="*70)

avg_input = np.mean([r['input_tokens'] for r in results])
avg_output = np.mean([r['output_tokens'] for r in results])
avg_tok = np.mean([r['tokenization_ms'] for r in results])
avg_inf = np.mean([r['inference_ms'] for r in results])
avg_dec = np.mean([r['decoding_ms'] for r in results])
avg_total = np.mean([r['total_ms'] for r in results])
avg_tps = np.mean([r['tokens_per_sec'] for r in results])

min_latency = min([r['inference_ms'] for r in results])
max_latency = max([r['inference_ms'] for r in results])
p50_latency = np.percentile([r['inference_ms'] for r in results], 50)
p90_latency = np.percentile([r['inference_ms'] for r in results], 90)
p99_latency = np.percentile([r['inference_ms'] for r in results], 99)

print(f"\nLatency Statistics:")
print(f"  Avg Input Tokens:     {avg_input:.1f}")
print(f"  Avg Output Tokens:    {avg_output:.1f}")
print(f"  Avg Tokenization:     {avg_tok:.2f} ms")
print(f"  Avg Inference:        {avg_inf:.2f} ms")
print(f"  Avg Decoding:         {avg_dec:.2f} ms")
print(f"  Avg Total Latency:    {avg_total:.2f} ms")
print(f"  Avg Throughput:       {avg_tps:.2f} tokens/sec")

print(f"\nLatency Percentiles (Inference only):")
print(f"  Min:     {min_latency:.2f} ms")
print(f"  P50:     {p50_latency:.2f} ms")
print(f"  P90:     {p90_latency:.2f} ms")
print(f"  P99:     {p99_latency:.2f} ms")
print(f"  Max:     {max_latency:.2f} ms")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:35] + '...' if len(r['question']) > 35 else r['question'] for r in results],
    'In Tok': [r['input_tokens'] for r in results],
    'Out Tok': [r['output_tokens'] for r in results],
    'Inference (ms)': [f"{r['inference_ms']:.2f}" for r in results],
    'Total (ms)': [f"{r['total_ms']:.2f}" for r in results],
    'Tok/sec': [f"{r['tokens_per_sec']:.1f}" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('latency_2wheeler_results.csv', index=False)
# print("\nResults saved to 'latency_2wheeler_results.csv'")

# from google.colab import files
# files.download('latency_2wheeler_results.csv')



Model Configuration:
  Adapter: Prithwiraj731/Gemma2-2b_Two-Wheeler
  Base Model: google/gemma-2-2b

Loading tokenizer...
Loading base model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Loading LoRA adapter...
Model loaded successfully

Warming up model...
Warmup complete

MEASURING INFERENCE LATENCY

Question 1/6
Q: What is the recommended lubrication for the engine of the BS...
A: The oil should be a good grade, and not too thin....
Input tokens:  22
Output tokens: 14
Tokenization:  0.60 ms
Inference:     1454.90 ms
Decoding:      1.08 ms
Total:         1456.58 ms
Throughput:    9.62 tokens/sec

Question 2/6
Q: Where should an inexperienced owner consult for assistance w...
A: A. The dealer is the only one who can perform all repairs an...
Input tokens:  15
Output tokens: 52
Tokenization:  0.60 ms
Inference:     5403.04 ms
Decoding:      0.97 ms
Total:         5404.61 ms
Throughput:    9.62 tokens/sec

Question 3/6
Q: What is the recommended procedure for claiming assistance un...
A: The following should be observed:...
Input tokens:  20
Output tokens: 63
Tokenization:  0.69 ms
Inference:     7014.76 ms
Decoding:      1.67 ms
Total:         7017.11 ms
Throughput:   

Unnamed: 0,Question,In Tok,Out Tok,Inference (ms),Total (ms),Tok/sec
0,What is the recommended lubrication...,22,14,1454.9,1456.58,9.6
1,Where should an inexperienced owner...,15,52,5403.04,5404.61,9.6
2,What is the recommended procedure f...,20,63,7014.76,7017.11,9.0
3,What is the correct address of the ...,18,91,9891.16,9893.32,9.2
4,What is the recommended procedure f...,18,20,2107.77,2109.84,9.5
5,What is the recommended torque wren...,14,23,2429.07,2431.57,9.5


In [6]:
# ==============================================================================
# INFERENCE LATENCY EVALUATION - FOUR-WHEELER MODEL (LEXUS)
# Metrics: Tokenization, Inference, Decoding Time, Throughput
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes transformers

# Note: After running Cell 1, restart runtime then run cells 2-9


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import time
import numpy as np


# ========== CELL 3: Define Dataset ==========
dataset = [
    {
        "question": "What is the purpose of the SRS airbags in the vehicle?",
        "answer": "The SRS airbags are designed to deploy in the event of a crash or sudden stop, providing protection for the occupants of the vehicle."
    },
    {
        "question": "What is the function of the steering wheel?",
        "answer": "Adjusting the steering wheel"
    },
    {
        "question": "What is the procedure for connecting a Bluetooth audio player?",
        "answer": "Connecting a Bluetooth audio player involves selecting a Bluetooth device, registering the device, and then connecting it to the vehicle's Bluetooth system."
    },
    {
        "question": "If your vehicle overheats",
        "answer": "Check the coolant level and condition, and refer to the owner's manual for guidance on how to address the issue."
    },
    {
        "question": "What is the recommended approach for replacing genuine Lexus parts or accessories in the vehicle?",
        "answer": "Lexus recommends using genuine Lexus parts or accessories for replacement, but other parts or accessories of matching quality can also be used."
    },
    {
        "question": "What is the recommended procedure for removing and disposing of the SRS airbag and seat belt pretensioner devices from a Lexus vehicle before scrapping?",
        "answer": "Have the systems removed and disposed of by an authorized Lexus dealer or a duly qualified and equipped professional."
    }
]


# ========== CELL 4: Configure Model ==========
model_name = "Prithwiraj731/FourWheeler-Gemma-2B"

print("Model Configuration:")
print(f"  Model: {model_name}")
print(f"  Type: Full merged model")


# ========== CELL 5: Load Model ==========
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading full model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully\n")


# ========== CELL 6: Define Answer Generation with Timing ==========
def generate_answer_with_timing(question, max_new_tokens=100):
    """Generate answer and measure latency at each stage"""
    prompt = f"{question}\n"

    # Tokenization time
    tok_start = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    tok_time = time.perf_counter() - tok_start

    input_tokens = inputs['input_ids'].shape[1]

    # Inference time
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    inf_start = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    torch.cuda.synchronize() if torch.cuda.is_available() else None
    inf_time = time.perf_counter() - inf_start

    output_tokens = outputs.shape[1]
    new_tokens = output_tokens - input_tokens

    # Decoding time
    dec_start = time.perf_counter()
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    dec_time = time.perf_counter() - dec_start

    answer = generated_text[len(prompt):].strip()
    answer = answer.split('\n')[0].strip()

    if not answer or len(answer.strip()) == 0:
        answer = "No answer generated"

    total_time = tok_time + inf_time + dec_time
    tokens_per_sec = new_tokens / inf_time if inf_time > 0 else 0

    return {
        'answer': answer,
        'input_tokens': input_tokens,
        'output_tokens': new_tokens,
        'tokenization_ms': tok_time * 1000,
        'inference_ms': inf_time * 1000,
        'decoding_ms': dec_time * 1000,
        'total_ms': total_time * 1000,
        'tokens_per_sec': tokens_per_sec
    }


# ========== CELL 7: Warmup Run ==========
print("Warming up model...")
_ = generate_answer_with_timing("Test question")
print("Warmup complete\n")


# ========== CELL 8: Run Latency Tests ==========
print("="*70)
print("MEASURING INFERENCE LATENCY")
print("="*70)

results = []

for i, item in enumerate(dataset):
    question = item["question"]

    timing = generate_answer_with_timing(question)

    results.append({
        'question': question,
        **timing
    })

    print(f"\nQuestion {i+1}/{len(dataset)}")
    print(f"Q: {question[:60]}...")
    print(f"A: {timing['answer'][:60]}...")
    print(f"Input tokens:  {timing['input_tokens']}")
    print(f"Output tokens: {timing['output_tokens']}")
    print(f"Tokenization:  {timing['tokenization_ms']:.2f} ms")
    print(f"Inference:     {timing['inference_ms']:.2f} ms")
    print(f"Decoding:      {timing['decoding_ms']:.2f} ms")
    print(f"Total:         {timing['total_ms']:.2f} ms")
    print(f"Throughput:    {timing['tokens_per_sec']:.2f} tokens/sec")


# ========== CELL 9: Display Summary Results ==========
print("\n" + "="*70)
print("FOUR-WHEELER MODEL - INFERENCE LATENCY RESULTS")
print("="*70)

avg_input = np.mean([r['input_tokens'] for r in results])
avg_output = np.mean([r['output_tokens'] for r in results])
avg_tok = np.mean([r['tokenization_ms'] for r in results])
avg_inf = np.mean([r['inference_ms'] for r in results])
avg_dec = np.mean([r['decoding_ms'] for r in results])
avg_total = np.mean([r['total_ms'] for r in results])
avg_tps = np.mean([r['tokens_per_sec'] for r in results])

min_latency = min([r['inference_ms'] for r in results])
max_latency = max([r['inference_ms'] for r in results])
p50_latency = np.percentile([r['inference_ms'] for r in results], 50)
p90_latency = np.percentile([r['inference_ms'] for r in results], 90)
p99_latency = np.percentile([r['inference_ms'] for r in results], 99)

print(f"\nLatency Statistics:")
print(f"  Avg Input Tokens:     {avg_input:.1f}")
print(f"  Avg Output Tokens:    {avg_output:.1f}")
print(f"  Avg Tokenization:     {avg_tok:.2f} ms")
print(f"  Avg Inference:        {avg_inf:.2f} ms")
print(f"  Avg Decoding:         {avg_dec:.2f} ms")
print(f"  Avg Total Latency:    {avg_total:.2f} ms")
print(f"  Avg Throughput:       {avg_tps:.2f} tokens/sec")

print(f"\nLatency Percentiles (Inference only):")
print(f"  Min:     {min_latency:.2f} ms")
print(f"  P50:     {p50_latency:.2f} ms")
print(f"  P90:     {p90_latency:.2f} ms")
print(f"  P99:     {p99_latency:.2f} ms")
print(f"  Max:     {max_latency:.2f} ms")


# ========== CELL 10: Results DataFrame ==========
results_df = pd.DataFrame({
    'Question': [r['question'][:35] + '...' if len(r['question']) > 35 else r['question'] for r in results],
    'In Tok': [r['input_tokens'] for r in results],
    'Out Tok': [r['output_tokens'] for r in results],
    'Inference (ms)': [f"{r['inference_ms']:.2f}" for r in results],
    'Total (ms)': [f"{r['total_ms']:.2f}" for r in results],
    'Tok/sec': [f"{r['tokens_per_sec']:.1f}" for r in results]
})

print("\n" + "="*70)
print("DETAILED RESULTS TABLE")
print("="*70)
display(results_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# results_df.to_csv('latency_4wheeler_results.csv', index=False)
# print("\nResults saved to 'latency_4wheeler_results.csv'")

# from google.colab import files
# files.download('latency_4wheeler_results.csv')



Model Configuration:
  Model: Prithwiraj731/FourWheeler-Gemma-2B
  Type: Full merged model

Loading tokenizer...
Loading full model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

Model loaded successfully

Warming up model...
Warmup complete

MEASURING INFERENCE LATENCY

Question 1/6
Q: What is the purpose of the SRS airbags in the vehicle?...
A: The SRS airbags are designed to deploy during a crash, provi...
Input tokens:  14
Output tokens: 18
Tokenization:  0.61 ms
Inference:     1107.01 ms
Decoding:      0.89 ms
Total:         1108.51 ms
Throughput:    16.26 tokens/sec

Question 2/6
Q: What is the function of the steering wheel?...
A: The...
Input tokens:  11
Output tokens: 2
Tokenization:  0.61 ms
Inference:     144.60 ms
Decoding:      0.28 ms
Total:         145.49 ms
Throughput:    13.83 tokens/sec

Question 3/6
Q: What is the procedure for connecting a Bluetooth audio playe...
A: model...
Input tokens:  13
Output tokens: 24
Tokenization:  0.42 ms
Inference:     1453.27 ms
Decoding:      0.36 ms
Total:         1454.05 ms
Throughput:    16.51 tokens/sec

Question 4/6
Q: If your vehicle overheats...
A: model...
Input tokens:  8
Output tokens: 15
Tokenizatio

Unnamed: 0,Question,In Tok,Out Tok,Inference (ms),Total (ms),Tok/sec
0,What is the purpose of the SRS airb...,14,18,1107.01,1108.51,16.3
1,What is the function of the steerin...,11,2,144.6,145.49,13.8
2,What is the procedure for connectin...,13,24,1453.27,1454.05,16.5
3,If your vehicle overheats,8,15,933.57,934.9,16.1
4,What is the recommended approach fo...,18,17,1045.78,1047.17,16.3
5,What is the recommended procedure f...,30,22,1355.79,1356.72,16.2


In [7]:
# ==============================================================================
# MEMORY FOOTPRINT EVALUATION - TWO-WHEELER MODEL (BSA)
# Metrics: GPU Memory, Model Size, Parameter Count, Memory Breakdown
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes peft transformers

# Note: After running Cell 1, restart runtime then run cells 2-11


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import gc


# ========== CELL 3: Memory Utility Functions ==========
def get_gpu_memory():
    """Get current GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / (1024**3)
        reserved = torch.cuda.memory_reserved() / (1024**3)
        max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
        total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        return {
            'allocated_gb': allocated,
            'reserved_gb': reserved,
            'max_allocated_gb': max_allocated,
            'total_gpu_gb': total
        }
    return {'allocated_gb': 0, 'reserved_gb': 0, 'max_allocated_gb': 0, 'total_gpu_gb': 0}

def get_model_size(model):
    """Calculate model size in memory"""
    param_size = 0
    buffer_size = 0

    for param in model.parameters():
        param_size += param.nelement() * param.element_size()

    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    total_size = param_size + buffer_size
    return {
        'param_size_mb': param_size / (1024**2),
        'buffer_size_mb': buffer_size / (1024**2),
        'total_size_mb': total_size / (1024**2),
        'total_size_gb': total_size / (1024**3)
    }

def count_parameters(model):
    """Count trainable and total parameters"""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        'total_params': total_params,
        'trainable_params': trainable_params,
        'total_params_millions': total_params / 1e6,
        'trainable_params_millions': trainable_params / 1e6
    }

def clear_memory():
    """Clear GPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()


# ========== CELL 4: Clear Memory and Get Baseline ==========
clear_memory()
baseline_memory = get_gpu_memory()

print("="*70)
print("MEMORY FOOTPRINT MEASUREMENT - TWO-WHEELER MODEL")
print("="*70)
print(f"\nBaseline GPU Memory: {baseline_memory['allocated_gb']:.4f} GB")
print(f"Total GPU Memory: {baseline_memory['total_gpu_gb']:.2f} GB\n")


# ========== CELL 5: Configure Model ==========
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"
base_model_name = "google/gemma-2-2b"

print("Model Configuration:")
print(f"  Adapter: {adapter_name}")
print(f"  Base Model: {base_model_name}\n")


# ========== CELL 6: Load Tokenizer ==========
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)
after_tokenizer = get_gpu_memory()
print(f"After tokenizer: {after_tokenizer['allocated_gb']:.4f} GB\n")


# ========== CELL 7: Load Base Model ==========
print("Loading base model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

after_base = get_gpu_memory()
base_model_size = get_model_size(base_model)
base_params = count_parameters(base_model)

print(f"After base model: {after_base['allocated_gb']:.4f} GB")
print(f"Base model size: {base_model_size['total_size_mb']:.2f} MB")
print(f"Base parameters: {base_params['total_params_millions']:.2f} M\n")


# ========== CELL 8: Load LoRA Adapter ==========
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

after_adapter = get_gpu_memory()
full_model_size = get_model_size(model)
full_params = count_parameters(model)

print(f"After adapter: {after_adapter['allocated_gb']:.4f} GB")
print(f"Full model size: {full_model_size['total_size_mb']:.2f} MB")
print(f"Trainable parameters: {full_params['trainable_params_millions']:.2f} M\n")


# ========== CELL 9: Run Inference to Measure Peak Memory ==========
print("Running inference to measure peak memory...")
test_prompt = "What is the recommended lubrication for the engine?\n"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )

after_inference = get_gpu_memory()
print(f"After inference: {after_inference['allocated_gb']:.4f} GB")
print(f"Peak memory: {after_inference['max_allocated_gb']:.4f} GB\n")


# ========== CELL 10: Display Summary Results ==========
print("="*70)
print("TWO-WHEELER MODEL - MEMORY FOOTPRINT RESULTS")
print("="*70)

print(f"\nGPU Memory Usage:")
print(f"  Baseline:              {baseline_memory['allocated_gb']:.4f} GB")
print(f"  After Tokenizer:       {after_tokenizer['allocated_gb']:.4f} GB")
print(f"  After Base Model:      {after_base['allocated_gb']:.4f} GB")
print(f"  After LoRA Adapter:    {after_adapter['allocated_gb']:.4f} GB")
print(f"  After Inference:       {after_inference['allocated_gb']:.4f} GB")
print(f"  Peak Memory:           {after_inference['max_allocated_gb']:.4f} GB")
print(f"  Reserved Memory:       {after_inference['reserved_gb']:.4f} GB")
print(f"  Total GPU Capacity:    {baseline_memory['total_gpu_gb']:.2f} GB")

print(f"\nModel Size:")
print(f"  Model in Memory:       {full_model_size['total_size_mb']:.2f} MB ({full_model_size['total_size_gb']:.4f} GB)")
print(f"  Parameters:            {full_model_size['param_size_mb']:.2f} MB")
print(f"  Buffers:               {full_model_size['buffer_size_mb']:.2f} MB")

print(f"\nParameter Count:")
print(f"  Total Parameters:      {full_params['total_params_millions']:.2f} M ({full_params['total_params']:,})")
print(f"  Trainable (LoRA):      {full_params['trainable_params_millions']:.2f} M ({full_params['trainable_params']:,})")
print(f"  Frozen:                {(full_params['total_params_millions'] - full_params['trainable_params_millions']):.2f} M")
print(f"  Trainable Ratio:       {(full_params['trainable_params'] / full_params['total_params'] * 100):.2f}%")

tokenizer_overhead = after_tokenizer['allocated_gb'] - baseline_memory['allocated_gb']
base_overhead = after_base['allocated_gb'] - after_tokenizer['allocated_gb']
adapter_overhead = after_adapter['allocated_gb'] - after_base['allocated_gb']
inference_overhead = after_inference['max_allocated_gb'] - after_adapter['allocated_gb']

print(f"\nMemory Breakdown:")
print(f"  Tokenizer Overhead:    {tokenizer_overhead:.4f} GB ({tokenizer_overhead*1024:.2f} MB)")
print(f"  Base Model Memory:     {base_overhead:.4f} GB ({base_overhead*1024:.2f} MB)")
print(f"  LoRA Adapter Overhead: {adapter_overhead:.4f} GB ({adapter_overhead*1024:.2f} MB)")
print(f"  Inference Overhead:    {inference_overhead:.4f} GB ({inference_overhead*1024:.2f} MB)")

gpu_utilization = (after_inference['max_allocated_gb'] / baseline_memory['total_gpu_gb']) * 100
print(f"\nGPU Utilization:       {gpu_utilization:.2f}%")


# ========== CELL 11: Memory Usage DataFrame ==========
memory_df = pd.DataFrame({
    'Stage': ['Baseline', 'After Tokenizer', 'After Base Model', 'After LoRA', 'After Inference', 'Peak'],
    'Allocated (GB)': [
        f"{baseline_memory['allocated_gb']:.4f}",
        f"{after_tokenizer['allocated_gb']:.4f}",
        f"{after_base['allocated_gb']:.4f}",
        f"{after_adapter['allocated_gb']:.4f}",
        f"{after_inference['allocated_gb']:.4f}",
        f"{after_inference['max_allocated_gb']:.4f}"
    ],
    'Delta (MB)': [
        "0.00",
        f"{tokenizer_overhead*1024:.2f}",
        f"{base_overhead*1024:.2f}",
        f"{adapter_overhead*1024:.2f}",
        f"{(after_inference['allocated_gb'] - after_adapter['allocated_gb'])*1024:.2f}",
        f"{inference_overhead*1024:.2f}"
    ]
})

print("\n" + "="*70)
print("DETAILED MEMORY USAGE TABLE")
print("="*70)
display(memory_df)


# ========== CELL 12: Save Results (Optional) ==========
# Uncomment to save and download results

# memory_df.to_csv('memory_footprint_2wheeler_results.csv', index=False)
# print("\nResults saved to 'memory_footprint_2wheeler_results.csv'")

# from google.colab import files
# files.download('memory_footprint_2wheeler_results.csv')



MEMORY FOOTPRINT MEASUREMENT - TWO-WHEELER MODEL

Baseline GPU Memory: 4.4588 GB
Total GPU Memory: 14.74 GB

Model Configuration:
  Adapter: Prithwiraj731/Gemma2-2b_Two-Wheeler
  Base Model: google/gemma-2-2b

Loading tokenizer...
After tokenizer: 4.4588 GB

Loading base model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

After base model: 5.4614 GB
Base model size: 2090.71 MB
Base parameters: 1602.20 M

Loading LoRA adapter...
After adapter: 4.4397 GB
Full model size: 2169.93 MB
Trainable parameters: 0.00 M

Running inference to measure peak memory...
After inference: 4.4397 GB
Peak memory: 6.7336 GB

TWO-WHEELER MODEL - MEMORY FOOTPRINT RESULTS

GPU Memory Usage:
  Baseline:              4.4588 GB
  After Tokenizer:       4.4588 GB
  After Base Model:      5.4614 GB
  After LoRA Adapter:    4.4397 GB
  After Inference:       4.4397 GB
  Peak Memory:           6.7336 GB
  Reserved Memory:       6.8789 GB
  Total GPU Capacity:    14.74 GB

Model Size:
  Model in Memory:       2169.93 MB (2.1191 GB)
  Parameters:            2169.93 MB
  Buffers:               0.00 MB

Parameter Count:
  Total Parameters:      1622.97 M (1,622,970,624)
  Trainable (LoRA):      0.00 M (0)
  Frozen:                1622.97 M
  Trainable Ratio:       0.00%

Memory Breakdown:
  Tokenizer Overhead:    0.0000 GB (0.00 MB)
  Base

Unnamed: 0,Stage,Allocated (GB),Delta (MB)
0,Baseline,4.4588,0.0
1,After Tokenizer,4.4588,0.0
2,After Base Model,5.4614,1026.65
3,After LoRA,4.4397,-1046.24
4,After Inference,4.4397,0.0
5,Peak,6.7336,2348.96


In [8]:
# ==============================================================================
# MEMORY FOOTPRINT EVALUATION - FOUR-WHEELER MODEL (LEXUS)
# Metrics: GPU Memory, Model Size, Parameter Count, Memory Breakdown
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes transformers

# Note: After running Cell 1, restart runtime then run cells 2-11


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import gc


# ========== CELL 3: Memory Utility Functions ==========
def get_gpu_memory():
    """Get current GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / (1024**3)
        reserved = torch.cuda.memory_reserved() / (1024**3)
        max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
        total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        return {
            'allocated_gb': allocated,
            'reserved_gb': reserved,
            'max_allocated_gb': max_allocated,
            'total_gpu_gb': total
        }
    return {'allocated_gb': 0, 'reserved_gb': 0, 'max_allocated_gb': 0, 'total_gpu_gb': 0}

def get_model_size(model):
    """Calculate model size in memory"""
    param_size = 0
    buffer_size = 0

    for param in model.parameters():
        param_size += param.nelement() * param.element_size()

    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    total_size = param_size + buffer_size
    return {
        'param_size_mb': param_size / (1024**2),
        'buffer_size_mb': buffer_size / (1024**2),
        'total_size_mb': total_size / (1024**2),
        'total_size_gb': total_size / (1024**3)
    }

def count_parameters(model):
    """Count trainable and total parameters"""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        'total_params': total_params,
        'trainable_params': trainable_params,
        'total_params_millions': total_params / 1e6,
        'trainable_params_millions': trainable_params / 1e6
    }

def clear_memory():
    """Clear GPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()


# ========== CELL 4: Clear Memory and Get Baseline ==========
clear_memory()
baseline_memory = get_gpu_memory()

print("="*70)
print("MEMORY FOOTPRINT MEASUREMENT - FOUR-WHEELER MODEL")
print("="*70)
print(f"\nBaseline GPU Memory: {baseline_memory['allocated_gb']:.4f} GB")
print(f"Total GPU Memory: {baseline_memory['total_gpu_gb']:.2f} GB\n")


# ========== CELL 5: Configure Model ==========
model_name = "Prithwiraj731/FourWheeler-Gemma-2B"

print("Model Configuration:")
print(f"  Model: {model_name}")
print(f"  Type: Full merged model\n")


# ========== CELL 6: Load Tokenizer ==========
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
after_tokenizer = get_gpu_memory()
print(f"After tokenizer: {after_tokenizer['allocated_gb']:.4f} GB\n")


# ========== CELL 7: Load Full Model ==========
print("Loading full model with 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16
)

model.eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

after_model = get_gpu_memory()
model_size = get_model_size(model)
params = count_parameters(model)

print(f"After model: {after_model['allocated_gb']:.4f} GB")
print(f"Model size: {model_size['total_size_mb']:.2f} MB")
print(f"Total parameters: {params['total_params_millions']:.2f} M\n")


# ========== CELL 8: Run Inference to Measure Peak Memory ==========
print("Running inference to measure peak memory...")
test_prompt = "What is the purpose of the SRS airbags in the vehicle?\n"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )

after_inference = get_gpu_memory()
print(f"After inference: {after_inference['allocated_gb']:.4f} GB")
print(f"Peak memory: {after_inference['max_allocated_gb']:.4f} GB\n")


# ========== CELL 9: Display Summary Results ==========
print("="*70)
print("FOUR-WHEELER MODEL - MEMORY FOOTPRINT RESULTS")
print("="*70)

print(f"\nGPU Memory Usage:")
print(f"  Baseline:              {baseline_memory['allocated_gb']:.4f} GB")
print(f"  After Tokenizer:       {after_tokenizer['allocated_gb']:.4f} GB")
print(f"  After Model:           {after_model['allocated_gb']:.4f} GB")
print(f"  After Inference:       {after_inference['allocated_gb']:.4f} GB")
print(f"  Peak Memory:           {after_inference['max_allocated_gb']:.4f} GB")
print(f"  Reserved Memory:       {after_inference['reserved_gb']:.4f} GB")
print(f"  Total GPU Capacity:    {baseline_memory['total_gpu_gb']:.2f} GB")

print(f"\nModel Size:")
print(f"  Model in Memory:       {model_size['total_size_mb']:.2f} MB ({model_size['total_size_gb']:.4f} GB)")
print(f"  Parameters:            {model_size['param_size_mb']:.2f} MB")
print(f"  Buffers:               {model_size['buffer_size_mb']:.2f} MB")

print(f"\nParameter Count:")
print(f"  Total Parameters:      {params['total_params_millions']:.2f} M ({params['total_params']:,})")
print(f"  Trainable:             {params['trainable_params_millions']:.2f} M ({params['trainable_params']:,})")
print(f"  Frozen:                {(params['total_params_millions'] - params['trainable_params_millions']):.2f} M")
print(f"  Trainable Ratio:       {(params['trainable_params'] / params['total_params'] * 100):.2f}%")

tokenizer_overhead = after_tokenizer['allocated_gb'] - baseline_memory['allocated_gb']
model_overhead = after_model['allocated_gb'] - after_tokenizer['allocated_gb']
inference_overhead = after_inference['max_allocated_gb'] - after_model['allocated_gb']

print(f"\nMemory Breakdown:")
print(f"  Tokenizer Overhead:    {tokenizer_overhead:.4f} GB ({tokenizer_overhead*1024:.2f} MB)")
print(f"  Model Memory:          {model_overhead:.4f} GB ({model_overhead*1024:.2f} MB)")
print(f"  Inference Overhead:    {inference_overhead:.4f} GB ({inference_overhead*1024:.2f} MB)")

gpu_utilization = (after_inference['max_allocated_gb'] / baseline_memory['total_gpu_gb']) * 100
print(f"\nGPU Utilization:       {gpu_utilization:.2f}%")


# ========== CELL 10: Memory Usage DataFrame ==========
memory_df = pd.DataFrame({
    'Stage': ['Baseline', 'After Tokenizer', 'After Model', 'After Inference', 'Peak'],
    'Allocated (GB)': [
        f"{baseline_memory['allocated_gb']:.4f}",
        f"{after_tokenizer['allocated_gb']:.4f}",
        f"{after_model['allocated_gb']:.4f}",
        f"{after_inference['allocated_gb']:.4f}",
        f"{after_inference['max_allocated_gb']:.4f}"
    ],
    'Delta (MB)': [
        "0.00",
        f"{tokenizer_overhead*1024:.2f}",
        f"{model_overhead*1024:.2f}",
        f"{(after_inference['allocated_gb'] - after_model['allocated_gb'])*1024:.2f}",
        f"{inference_overhead*1024:.2f}"
    ]
})

print("\n" + "="*70)
print("DETAILED MEMORY USAGE TABLE")
print("="*70)
display(memory_df)


# ========== CELL 11: Save Results (Optional) ==========
# Uncomment to save and download results

# memory_df.to_csv('memory_footprint_4wheeler_results.csv', index=False)
# print("\nResults saved to 'memory_footprint_4wheeler_results.csv'")

# from google.colab import files
# files.download('memory_footprint_4wheeler_results.csv')



MEMORY FOOTPRINT MEASUREMENT - FOUR-WHEELER MODEL

Baseline GPU Memory: 2.2643 GB
Total GPU Memory: 14.74 GB

Model Configuration:
  Model: Prithwiraj731/FourWheeler-Gemma-2B
  Type: Full merged model

Loading tokenizer...
After tokenizer: 2.2643 GB

Loading full model with 4-bit quantization...


Loading weights:   0%|          | 0/288 [00:00<?, ?it/s]

After model: 4.4401 GB
Model size: 2090.71 MB
Total parameters: 1602.20 M

Running inference to measure peak memory...
After inference: 4.4401 GB
Peak memory: 4.6581 GB

FOUR-WHEELER MODEL - MEMORY FOOTPRINT RESULTS

GPU Memory Usage:
  Baseline:              2.2643 GB
  After Tokenizer:       2.2643 GB
  After Model:           4.4401 GB
  After Inference:       4.4401 GB
  Peak Memory:           4.6581 GB
  Reserved Memory:       5.8555 GB
  Total GPU Capacity:    14.74 GB

Model Size:
  Model in Memory:       2090.71 MB (2.0417 GB)
  Parameters:            2090.71 MB
  Buffers:               0.00 MB

Parameter Count:
  Total Parameters:      1602.20 M (1,602,203,904)
  Trainable:             590.07 M (590,065,920)
  Frozen:                1012.14 M
  Trainable Ratio:       36.83%

Memory Breakdown:
  Tokenizer Overhead:    0.0000 GB (0.00 MB)
  Model Memory:          2.1757 GB (2227.94 MB)
  Inference Overhead:    0.2180 GB (223.28 MB)

GPU Utilization:       31.60%

DETAILED MEMORY 

Unnamed: 0,Stage,Allocated (GB),Delta (MB)
0,Baseline,2.2643,0.0
1,After Tokenizer,2.2643,0.0
2,After Model,4.4401,2227.94
3,After Inference,4.4401,0.0
4,Peak,4.6581,223.28
