In [None]:
# ============================================================================
# NOTEBOOK 3: QLORA TRAINING
# Cell 1: Setup Environment
# ============================================================================

print("Setting up environment...")
print("="*80)

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
print("✓ Drive mounted")

# 2. Navigate to project folder
import os
os.chdir('/content/drive/MyDrive/maternal_health_project')
print(f"✓ Working directory: {os.getcwd()}")

# 3. Login to HuggingFace
from huggingface_hub import login, whoami
from google.colab import userdata

try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("✓ Token from Colab Secrets")
except:
    HF_TOKEN = input("Enter HuggingFace token: ").strip()

login(token=HF_TOKEN)
user = whoami()
print(f"✓ Logged in as: {user['name']}")

print("="*80)
print("✓ Setup complete!")
print("="*80 + "\n")

Setting up environment...
Mounted at /content/drive
✓ Drive mounted
✓ Working directory: /content/drive/MyDrive/maternal_health_project
Enter HuggingFace token: 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
✓ Logged in as: VaibhavAG02
✓ Setup complete!



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ============================================================================
# Cell 2: Install Training Packages
# ============================================================================

print("Installing training packages...")
print("(This will take 2-3 minutes)\n")

!pip install -q transformers datasets peft accelerate bitsandbytes scipy trl

print("\n✓ Packages installed:")
print("  - transformers: LLaMA 2 support")
print("  - datasets: Data loading")
print("  - peft: QLoRA implementation")
print("  - accelerate: Training optimization")
print("  - bitsandbytes: 4-bit quantization")
print("  - trl: Supervised fine-tuning trainer")

Installing training packages...
(This will take 2-3 minutes)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25h
✓ Packages installed:
  - transformers: LLaMA 2 support
  - datasets: Data loading
  - peft: QLoRA implementation
  - accelerate: Training optimization
  - bitsandbytes: 4-bit quantization
  - trl: Supervised fine-tuning trainer


In [None]:
# ============================================================================
# Cell 3: Import Required Libraries
# ============================================================================

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import time
import json

print("✓ All libraries imported")
print(f"✓ PyTorch version: {torch.__version__}")
print(f"✓ CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✓ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ WARNING: No GPU detected!")
    print("   Go to: Runtime → Change runtime type → GPU")

✓ All libraries imported
✓ PyTorch version: 2.8.0+cu126
✓ CUDA available: True
✓ GPU: Tesla T4
✓ GPU Memory: 15.83 GB


In [None]:
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import time
import json

print("Setting up QLoRA configuration...")

# Model Settings
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
OUTPUT_DIR = "./models/llama2-maternal-health-qlora"  # Different output path
MAX_LENGTH = 512

# QLoRA Configuration (Higher rank, more modules than LoRA)
LORA_R = 64              # Higher rank for QLoRA
LORA_ALPHA = 16          # Different scaling ratio
LORA_DROPOUT = 0.1       # Higher dropout
TARGET_MODULES = [       # More modules than LoRA
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",         # Additional
    "up_proj",           # Additional
    "down_proj"          # Additional
]

# Training Configuration (Optimized for QLoRA)
EPOCHS = 3
BATCH_SIZE = 2           # Smaller batch for memory efficiency
GRADIENT_ACCUMULATION = 1  # Adjusted to allow training steps
LEARNING_RATE = 1e-4     # Lower learning rate

print("✓ QLoRA Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  LoRA rank: {LORA_R} (higher than standard LoRA)")
print(f"  LoRA alpha: {LORA_ALPHA}")
print(f"  Target modules: {len(TARGET_MODULES)} (more than LoRA)")
print(f"  Epochs: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE} (smaller for efficiency)")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Output directory: {OUTPUT_DIR}")

Setting up QLoRA configuration...
✓ QLoRA Configuration:
  Model: meta-llama/Llama-2-7b-chat-hf
  LoRA rank: 64 (higher than standard LoRA)
  LoRA alpha: 16
  Target modules: 7 (more than LoRA)
  Epochs: 3
  Batch size: 2 (smaller for efficiency)
  Gradient accumulation: 1
  Learning rate: 0.0001
  Output directory: ./models/llama2-maternal-health-qlora


In [None]:
# ============================================================================
# Cell 5: Load Base Model with QLoRA-specific Quantization
# ============================================================================

print("Loading LLaMA 2 model with QLoRA quantization...")
print("(This will take 3-5 minutes)\n")

# QLoRA-specific quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # bfloat16 for QLoRA (more stable)
    bnb_4bit_use_double_quant=True,         # Critical for QLoRA efficiency
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Prepare for k-bit training
model = prepare_model_for_kbit_training(model)

print("✓ Model loaded successfully with QLoRA settings!")
print(f"✓ Model size in memory: ~{torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
print("✓ Using bfloat16 compute dtype for better numerical stability")

Loading LLaMA 2 model with QLoRA quantization...
(This will take 3-5 minutes)



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

✓ Model loaded successfully with QLoRA settings!
✓ Model size in memory: ~4.39 GB
✓ Using bfloat16 compute dtype for better numerical stability


In [None]:
# ============================================================================
# Cell 6: Load Tokenizer
# ============================================================================

print("Loading tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("✓ Tokenizer loaded")
print(f"✓ Vocabulary size: {len(tokenizer):,}")

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

✓ Tokenizer loaded
✓ Vocabulary size: 32,000


In [None]:
# ============================================================================
# Cell 7: Configure QLoRA Adapter
# ============================================================================

print("Configuring QLoRA adapter...")

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply QLoRA to model
model = get_peft_model(model, lora_config)

# Calculate trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
trainable_percentage = 100 * trainable_params / total_params

print("✓ QLoRA configured")
print(f"✓ Trainable parameters: {trainable_params:,}")
print(f"✓ Total parameters: {total_params:,}")
print(f"✓ Trainable percentage: {trainable_percentage:.4f}%")
print(f"\n✨ Training {trainable_percentage:.4f}% of parameters with QLoRA!")
print(f"   (More than LoRA's ~0.06%, but still very efficient)")

Configuring QLoRA adapter...
✓ QLoRA configured
✓ Trainable parameters: 159,907,840
✓ Total parameters: 3,660,320,768
✓ Trainable percentage: 4.3687%

✨ Training 4.3687% of parameters with QLoRA!
   (More than LoRA's ~0.06%, but still very efficient)


In [None]:
# ============================================================================
# Cell 8: Load Training Dataset
# ============================================================================

print("Loading dataset...")

dataset = load_from_disk("data/maternal_health_dataset")

print("✓ Dataset loaded")
print(f"✓ Training samples: {len(dataset['train'])}")
print(f"✓ Validation samples: {len(dataset['validation'])}")

# Show example
print("\nExample training sample (first 400 chars):")
print("-" * 80)
print(dataset['train'][0]['text'][:400] + "...")
print("-" * 80)

Loading dataset...
✓ Dataset loaded
✓ Training samples: 93
✓ Validation samples: 24

Example training sample (first 400 chars):
--------------------------------------------------------------------------------
<s>[INST] <<SYS>>
You are a knowledgeable, empathetic maternal health assistant. Provide accurate, evidence-based information from trusted sources like WHO, CDC, ACOG, and NIH. Be supportive and encouraging. If a question requires urgent medical attention, advise seeking immediate care.
<</SYS>>

How often should I have prenatal checkups? [/INST] WHO recommends a minimum of eight antenatal contact...
--------------------------------------------------------------------------------


In [None]:
# ============================================================================
# Cell 9: Configure Training Arguments for QLoRA
# ============================================================================

print("Configuring training arguments...")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    bf16=True,  # Use bfloat16 for QLoRA (instead of fp16)
    optim="paged_adamw_32bit",  # QLoRA uses 32-bit optimizer
    report_to="none",
    save_total_limit=2,
    load_best_model_at_end=True,
)

print("✓ Training arguments configured")
print(f"  Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION}")
print(f"  Total optimization steps: ~{len(dataset['train']) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * EPOCHS}")
print(f"  Using bf16 precision (better for QLoRA)")
print(f"  Using 32-bit optimizer (QLoRA-specific)")

Configuring training arguments...
✓ Training arguments configured
  Effective batch size: 2
  Total optimization steps: ~138
  Using bf16 precision (better for QLoRA)
  Using 32-bit optimizer (QLoRA-specific)


In [None]:
# ============================================================================
# Cell 10: Create SFT Trainer (FINAL WORKING VERSION)
# ============================================================================

print("Creating trainer...")

from trl import SFTConfig

# Create SFT config without max_seq_length
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    bf16=True,  # QLoRA uses bf16
    optim="paged_adamw_32bit",  # QLoRA uses 32bit
    report_to="none",
    save_total_limit=2,
    dataset_text_field="text",
)

# Create trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    args=sft_config,
)

print("✓ Trainer created successfully!")
print("\n" + "="*80)
print("⚠️  READY TO START QLORA TRAINING")
print("="*80)
print("\n⏱️  Estimated time: 60-75 minutes")
print("📊 Training will happen in the next cell")
print("🚨 Do NOT close this browser tab during training!")
print("\n" + "="*80)

Creating trainer...
✓ Trainer created successfully!

⚠️  READY TO START QLORA TRAINING

⏱️  Estimated time: 60-75 minutes
📊 Training will happen in the next cell
🚨 Do NOT close this browser tab during training!



In [None]:
# ============================================================================
# Cell 11: Train the Model with QLoRA
# ============================================================================
# IMPORTANT: This takes 60-75 minutes!
# Don't close the browser while this is running

print("="*80)
print("STARTING QLORA TRAINING")
print("="*80)
print("\n⏱️  Estimated time: 60-75 minutes")
print("📊 Watch the progress bars and loss values")
print("📉 Loss should decrease over time (good sign!)")
print("💾 Model will be saved automatically after each epoch")
print("\n" + "="*80 + "\n")

# Record start time
start_time = time.time()

# Start training
trainer.train()

# Calculate training time
training_time = time.time() - start_time
training_minutes = training_time / 60

print("\n" + "="*80)
print("✅ QLORA TRAINING COMPLETE!")
print("="*80)
print(f"\n⏱️  Total training time: {training_minutes:.2f} minutes")

# Safely extract metrics from log history
if trainer.state.log_history:
    # Find the last entry with 'loss' key
    final_train_loss = None
    final_eval_loss = None

    for entry in reversed(trainer.state.log_history):
        if 'loss' in entry and final_train_loss is None:
            final_train_loss = entry['loss']
        if 'eval_loss' in entry and final_eval_loss is None:
            final_eval_loss = entry['eval_loss']
        if final_train_loss is not None and final_eval_loss is not None:
            break

    if final_train_loss is not None:
        print(f"📊 Final training loss: {final_train_loss:.4f}")
    if final_eval_loss is not None:
        print(f"📊 Final validation loss: {final_eval_loss:.4f}")
else:
    print("ℹ️  Training completed but detailed metrics not available in log history")

print("\n" + "="*80)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': None}.


STARTING QLORA TRAINING

⏱️  Estimated time: 60-75 minutes
📊 Watch the progress bars and loss values
📉 Loss should decrease over time (good sign!)
💾 Model will be saved automatically after each epoch




Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.9884,0.870745,0.899338,19166.0,0.78895
2,0.7327,0.806044,0.767431,38332.0,0.799318
3,0.5871,0.814042,0.669788,57498.0,0.799145



✅ QLORA TRAINING COMPLETE!

⏱️  Total training time: 25.58 minutes
📊 Final training loss: 0.5871
📊 Final validation loss: 0.8140



In [None]:
# ============================================================================
# Cell 12: Save Trained QLoRA Model
# ============================================================================

print("Saving QLoRA model and training metrics...")

# Save model
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✓ Model saved to: {OUTPUT_DIR}")

# Safely extract metrics from log history
final_train_loss = None
final_eval_loss = None

if trainer.state.log_history:
    for entry in reversed(trainer.state.log_history):
        if 'loss' in entry and final_train_loss is None:
            final_train_loss = entry['loss']
        if 'eval_loss' in entry and final_eval_loss is None:
            final_eval_loss = entry['eval_loss']
        if final_train_loss is not None and final_eval_loss is not None:
            break

# Prepare training metrics
metrics = {
    "method": "QLoRA",
    "model": MODEL_NAME,
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "lora_dropout": LORA_DROPOUT,
    "target_modules": TARGET_MODULES,
    "trainable_params": int(trainable_params),
    "total_params": int(total_params),
    "trainable_percentage": float(trainable_percentage),
    "training_time_minutes": float(training_minutes),
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "compute_dtype": "bfloat16",
    "optimizer": "paged_adamw_32bit",
}

# Add loss metrics only if available
if final_train_loss is not None:
    metrics["final_train_loss"] = float(final_train_loss)
if final_eval_loss is not None:
    metrics["final_eval_loss"] = float(final_eval_loss)

# Save metrics
with open(f"{OUTPUT_DIR}/training_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print("✓ Training metrics saved")

# Display summary
print("\n" + "="*80)
print("QLORA TRAINING SUMMARY")
print("="*80)
for key, value in metrics.items():
    print(f"  {key}: {value}")
print("="*80)

Saving QLoRA model and training metrics...
✓ Model saved to: ./models/llama2-maternal-health-qlora
✓ Training metrics saved

QLORA TRAINING SUMMARY
  method: QLoRA
  model: meta-llama/Llama-2-7b-chat-hf
  lora_r: 64
  lora_alpha: 16
  lora_dropout: 0.1
  target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
  trainable_params: 159907840
  total_params: 3660320768
  trainable_percentage: 4.368683788535114
  training_time_minutes: 25.58257213830948
  epochs: 3
  batch_size: 2
  learning_rate: 0.0001
  compute_dtype: bfloat16
  optimizer: paged_adamw_32bit
  final_train_loss: 0.5871
  final_eval_loss: 0.8140416145324707


In [None]:
# ============================================================================
# Cell 13: Test the Trained QLoRA Model
# ============================================================================

print("Testing QLoRA model generation...\n")

# Prepare model for inference - ensure consistent dtype
model.eval()

# Option 1: Convert entire model to float32 (more compatible)
model = model.float()

def generate_response(prompt, max_new_tokens=256):
    """Generate response from trained model"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("[/INST]")[-1].strip()

# Test prompt
test_prompt = """<s>[INST] <<SYS>>
You are a knowledgeable, empathetic maternal health assistant. Provide accurate, evidence-based information from trusted sources like WHO, CDC, ACOG, and NIH. Be supportive and encouraging.
<</SYS>>

What are the signs of preeclampsia I should watch for? [/INST]"""

print("="*80)
print("TEST GENERATION - QLORA MODEL")
print("="*80)
print("\n📝 Question: What are the signs of preeclampsia I should watch for?")
print("\n🤖 AI Response:")
print("-" * 80)

response = generate_response(test_prompt)
print(response)

print("-" * 80)
print("\n✅ QLoRA model is working!")
print("\n✓ Check if the response:")
print("  • Mentions high blood pressure")
print("  • Lists warning signs (headache, vision changes, swelling)")
print("  • Sounds natural and helpful")
print("  • Is empathetic and supportive")

Testing QLoRA model generation...

TEST GENERATION - QLORA MODEL

📝 Question: What are the signs of preeclampsia I should watch for?

🤖 AI Response:
--------------------------------------------------------------------------------
Signs of preeclampsia can include sudden headache, severe abdominal pain, visual changes (blurriness or flashes), severe swelling, rapid weight gain, nausea, vomiting, fever, and changes in urine output; these symptoms often occur after 20 weeks but can occur earlier in certain cases. If you have any of these symptoms, contact your provider immediately because prompt evaluation is important to prevent complications such as seizures or stroke.
--------------------------------------------------------------------------------

✅ QLoRA model is working!

✓ Check if the response:
  • Mentions high blood pressure
  • Sounds natural and helpful
  • Is empathetic and supportive


In [None]:
# ============================================================================
# Cell 14: Verify All Files Were Saved Correctly
# ============================================================================

print("Verifying saved files...\n")

import os

# Check if output directory exists
if not os.path.exists(OUTPUT_DIR):
    print(f"❌ ERROR: Directory {OUTPUT_DIR} not found!")
else:
    print(f"✓ Directory exists: {OUTPUT_DIR}\n")

    # List all files
    model_files = os.listdir(OUTPUT_DIR)

    print(f"📁 Files in {OUTPUT_DIR}:")
    print("-" * 80)
    for file in sorted(model_files):
        file_path = os.path.join(OUTPUT_DIR, file)
        if os.path.isfile(file_path):
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"  ✓ {file:<40} {size_mb:>8.2f} MB")
        else:
            print(f"  ✓ {file}/ (folder)")
    print("-" * 80)

    # Check critical files
    print("\n✅ Critical files check:")
    critical_files = [
        "adapter_config.json",
        "adapter_model.bin",
        "training_metrics.json",
        "tokenizer_config.json"
    ]

    all_good = True
    for file in critical_files:
        exists = file in model_files
        status = "✓" if exists else "❌ MISSING"
        print(f"  {status} {file}")
        if not exists:
            all_good = False

    if all_good:
        print("\n" + "="*80)
        print("✅ ALL FILES SAVED SUCCESSFULLY!")
        print("="*80)
        print(f"\n📂 Model location: {OUTPUT_DIR}")
        print("✅ QLoRA model is ready for evaluation")
        print("✅ You now have 2 models to compare: LoRA and QLoRA")
        print("\n➡️  Next steps:")
        print("   • Option 1: Train Adapter (Notebook 4) for 3-way comparison")
        print("   • Option 2: Go to Evaluation (Notebook 5) to compare models")
    else:
        print("\n⚠️  WARNING: Some files are missing!")
        print("   Please check if training completed successfully")

Verifying saved files...

✓ Directory exists: ./models/llama2-maternal-health-qlora

📁 Files in ./models/llama2-maternal-health-qlora:
--------------------------------------------------------------------------------
  ✓ README.md                                    0.00 MB
  ✓ adapter_config.json                          0.00 MB
  ✓ adapter_model.safetensors                  610.06 MB
  ✓ chat_template.jinja                          0.00 MB
  ✓ checkpoint-141/ (folder)
  ✓ checkpoint-94/ (folder)
  ✓ special_tokens_map.json                      0.00 MB
  ✓ tokenizer.json                               3.45 MB
  ✓ tokenizer.model                              0.48 MB
  ✓ tokenizer_config.json                        0.00 MB
  ✓ training_metrics.json                        0.00 MB
--------------------------------------------------------------------------------

✅ Critical files check:
  ✓ adapter_config.json
  ❌ MISSING adapter_model.bin
  ✓ training_metrics.json
  ✓ tokenizer_config.json

 

In [None]:
# ============================================================================
# Cell: Deploy Streamlit App in Colab (FIXED)
# ============================================================================

print("🚀 Setting up Streamlit deployment for Colab...\n")

# 1. Install required packages
print("📦 Installing dependencies...")
!pip install -q streamlit pyngrok

# 2. Get the model path from your training
# Make sure these variables exist from your training cells
try:
    model_path = OUTPUT_DIR
    base_model = MODEL_NAME
    print(f"✓ Using model: {model_path}")
except NameError:
    # If variables don't exist, set them manually
    model_path = "./models/llama2-maternal-health-qlora"  # CHANGE THIS to your actual path
    base_model = "meta-llama/Llama-2-7b-chat-hf"
    print(f"⚠️  Using manual path: {model_path}")

# 3. Create the Streamlit app file
print("📝 Creating app.py file...")

app_code = f'''
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Configuration
BEST_MODEL_PATH = "{model_path}"
BASE_MODEL = "{base_model}"

st.set_page_config(
    page_title="Maternal Health AI Assistant",
    page_icon="🤰",
    layout="wide"
)

@st.cache_resource
def load_model():
    with st.spinner("Loading AI model..."):
        tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_PATH)
        tokenizer.pad_token = tokenizer.eos_token

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )

        model = PeftModel.from_pretrained(base_model, BEST_MODEL_PATH)
        model.eval()

    return model, tokenizer

def generate_response(query, model, tokenizer):
    prompt = f"""<s>[INST] <<SYS>>
You are a knowledgeable, empathetic maternal health assistant. Provide accurate, evidence-based information.
<</SYS>>

{{query}} [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("[/INST]")[-1].strip()

# Main UI
st.title("🤰 Maternal Health AI Assistant")
st.caption("Evidence-based support for your pregnancy journey")

# Sidebar
with st.sidebar:
    st.header("ℹ️ About")
    st.write("AI-powered maternal health information from trusted sources (WHO, CDC, ACOG, NIH)")

    st.markdown("---")
    st.error("**⚠️ Emergency Care**: Call 911 for severe bleeding, chest pain, or other emergencies")

# Load model
try:
    model, tokenizer = load_model()
    st.success("✅ AI Assistant Ready!")
except Exception as e:
    st.error(f"Error loading model: {{e}}")
    st.stop()

# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Example questions (only show when no messages)
if len(st.session_state.messages) == 0:
    st.markdown("### 💡 Try asking:")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("How much folic acid should I take?"):
            user_input = "How much folic acid should I take during pregnancy?"
            st.session_state.messages.append({{"role": "user", "content": user_input}})
            st.rerun()

    with col2:
        if st.button("What foods should I avoid?"):
            user_input = "What foods should I avoid during pregnancy?"
            st.session_state.messages.append({{"role": "user", "content": user_input}})
            st.rerun()

# Chat input
if prompt := st.chat_input("Ask me anything about maternal health..."):
    # Add user message
    st.session_state.messages.append({{"role": "user", "content": prompt}})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = generate_response(prompt, model, tokenizer)
            st.markdown(response)
            st.caption("_This is AI-generated information. Always consult your healthcare provider._")

    st.session_state.messages.append({{"role": "assistant", "content": response}})

# Footer
st.markdown("---")
st.caption("💝 Built for maternal health education | Not a substitute for professional medical advice")
'''

# Write to file
with open('maternal_health_app.py', 'w') as f:
    f.write(app_code)

print("✅ App file created: maternal_health_app.py\n")

# 4. Run Streamlit with localtunnel
print("🌐 Starting Streamlit server with public URL...\n")
print("⏳ This will take 30-60 seconds to start...\n")

# Start streamlit in background
get_ipython().system_raw('streamlit run maternal_health_app.py --server.port 8501 --server.headless true &')

# Wait for streamlit to start
import time
time.sleep(15)

# Start localtunnel
print("🔗 Creating public URL...\n")
!npx localtunnel --port 8501

print("\n" + "="*80)
print("✅ LOOK FOR THE URL ABOVE!")
print("="*80)
print("\n📍 Find the line that says: 'your url is: https://xxxxx.loca.lt'")
print("🌐 CLICK THAT URL to access your chatbot!")
print("\n⚠️  Note: You may see a warning page - click 'Continue'")
print("="*80)

Checking saved model files...

Files in ./models/llama2-maternal-health-qlora:
  README.md                                      0.00 MB
  checkpoint-94                                  0.00 MB
  checkpoint-141                                 0.00 MB
  adapter_model.safetensors                    610.06 MB
  adapter_config.json                            0.00 MB
  chat_template.jinja                            0.00 MB
  tokenizer_config.json                          0.00 MB
  special_tokens_map.json                        0.00 MB
  tokenizer.model                                0.48 MB
  tokenizer.json                                 3.45 MB
  training_metrics.json                          0.00 MB

📊 Weight files found: 1
  ✓ adapter_model.safetensors

✅ Model weights exist - you can proceed with testing!
