In [2]:
!pip install pytesseract pdf2image pillow pdfplumber pandas





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
import re
import json
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from typing import Dict, Any, List

# ==========================================
# CONFIGURATION: Set the file path here
# ==========================================
# Change this path to the specific PDF you want to process right now.
PDF_PATH = r"C:\Users\saipr\Downloads\satish-patnaik-report-17-Jan-2026-1768654292841.pdf" 

# Point to Tesseract (only needed if PDF has no selectable text)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# ==========================================
# UNIVERSAL PATTERN MAP
# ==========================================
# Each key has a LIST of patterns. The script tries them in order.
EXTRACTORS = {
    "Glucose": [
        # Format 1: "Glucose fasting ... 146" (Satish)
        r"Glucose\s*[-\s]*fasting\b.*?\s+(\d{2,3})",
        # Format 2: "Fasting Blood Sugar ... H 141.0" (Lyubochka)
        r"Fasting\s+Blood\s+Sugar\b.*?\s+(?:H|L)?\s*(\d{2,3}(?:\.\d+)?)"
    ],
    "HbA1c": [
        # Format 1: "Glyco Hb (HbA1C) ... 8.2"
        r"Glyco\s+Hb\s*\(HbA1C\)\b.*?\s+(\d{1,2}\.\d{1,2})",
        # Format 2: "HbA1c ... H 7.10"
        r"HbA1c\b.*?\s+(?:H|L)?\s*(\d{1,2}\.\d{1,2})"
    ],
    "Creatinine": [
        # Format 1 & 2: "Creatinine" or "Creatinine, Serum"
        r"Creatinine(?:-serum|,\s*Serum)?\b.*?\s+(\d{1,2}\.\d{1,2})"
    ],
    "Urea": [
        # Format 1: "UREA* ... 18"
        r"UREA\s*\*?\s+(\d{2,3}(?:\.\d+)?)",
        # Format 2: "Urea ... L 18.0" (Handle L/H flags)
        r"Urea\b.*?\s+(?:H|L)?\s*(\d{2,3}(?:\.\d+)?)"
    ],
    "Hemoglobin": [
        # Format 1 (British spelling): "Haemoglobin"
        r"Haemoglobin\b.*?\s+(\d{1,2}\.\d{1,2})",
        # Format 2 (American spelling): "Hemoglobin"
        r"Hemoglobin\b.*?\s+(\d{1,2}\.\d{1,2})"
    ],
    "WBC": [
        # Format 1: "Total WBC Count"
        r"Total\s+WBC\s+Count\b.*?\s+(\d{4,6})",
        # Format 2: "WBC Count"
        r"WBC\s+Count\b.*?\s+(\d{4,6})"
    ],
    "Cholesterol": [
        # Format 1: "Cholesterol-Total"
        r"Cholesterol-Total\b.*?\s+(\d{2,3})",
        # Format 2: "Cholesterol" (Negative lookahead avoids matching HDL/LDL lines)
        r"Cholesterol\b(?!.*HDL).*?\s+(\d{2,3}(?:\.\d+)?)"
    ],
    "Triglycerides": [
        # Universal: Matches "Triglyceride" or "Triglycerides" + ignores H/L flags
        r"Triglycerides?\b.*?\s+(?:H|L)?\s*(\d{2,4}(?:\.\d+)?)"
    ],
    "TSH": [
        # Format 1: "TSH(THYROID...)"
        r"TSH\s*\(THYROID\s+STATIMULATING\s+HORMONE\)\s*(\d+\.\d+)",
        # Format 2: "TSH - Thyroid..."
        r"(?:TSH|Thyroid\s+Stimulating\s+Hormone)\b.*?\s+(\d{1,2}\.\d{2,4})"
    ],
    "Vitamin D": [
        # Format 2 specific
        r"25\(OH\)\s+Vitamin\s+D\b.*?\s+(\d{1,3}\.\d{2})"
    ],
    "Vitamin B12": [
        # Format 2 specific: Handles "< 148" or numbers
        r"Vitamin\s+B12\b.*?\s+(?:H|L|<|>)?\s*(\d{2,4})"
    ],
    "Age": [
        # Format 1: "Age/Gender: 54 years"
        r"Age\s*/\s*Gender\s*:\s*(\d{1,3})\s*years",
        # Format 2: "Sex/Age : Male / 41 Y"
        r"Sex\s*/\s*Age\s*:\s*\w+\s*/\s*(\d{1,3})\s*Y"
    ]
}

UNIT_MAP = {
    "Glucose": "mg/dL", "Creatinine": "mg/dL", "Urea": "mg/dL", 
    "Hemoglobin": "g/dL", "WBC": "/cmm", "Age": "Years", 
    "Cholesterol": "mg/dL", "Triglycerides": "mg/dL", "HbA1c": "%", 
    "TSH": "¬µIU/mL", "Vitamin D": "ng/mL", "Vitamin B12": "pg/mL"
}

def get_text_from_pdf(pdf_path: str) -> str:
    print(f"[INFO] Processing file: {pdf_path}")
    text_pages = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # x_tolerance=2 helps preserve spaces between columns in tables
                text = page.extract_text(x_tolerance=2)
                if text:
                    text_pages.append(text)
    except Exception as e:
        print(f"[WARN] pdfplumber error: {e}")

    full_text = "\n".join(text_pages)

    # Fallback to OCR if text is missing or extremely short
    if len(full_text) < 50:
        print("[INFO] Text not found. Attempting OCR...")
        try:
            images = convert_from_path(pdf_path, dpi=300)
            ocr_text = []
            for img in images:
                ocr_text.append(pytesseract.image_to_string(img))
            full_text = "\n".join(ocr_text)
        except Exception as e:
            print(f"[ERROR] OCR failed: {e}")
    
    return full_text

def extract_data(text: str) -> Dict[str, Any]:
    results = {}
    
    for param_name, patterns in EXTRACTORS.items():
        found_value = None
        match_source = None
        
        # Try every pattern in the list until one works
        for pattern in patterns:
            # re.DOTALL allows the dot (.) to match newlines if layout is multi-line
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                # We assume the actual value is the LAST capture group in the regex
                found_value = match.groups()[-1].strip()
                match_source = match.group(0).strip()
                break # Stop looking for this parameter
        
        results[param_name] = {
            "value": found_value,
            "unit": UNIT_MAP.get(param_name, ""),
            "raw_match": match_source if found_value else "Not Found"
        }
        
    return results

def main():
    # 1. Extract text from the single file defined in PDF_PATH
    raw_text = get_text_from_pdf(PDF_PATH)
    
    # 2. Extract structured data
    data = extract_data(raw_text)
    
    # 3. Print Results
    print("\n" + "="*40)
    print("       EXTRACTION RESULTS       ")
    print("="*40)
    
    for key, info in data.items():
        val = info['value']
        unit = info['unit']
        display_val = f"{val} {unit}" if val else "---"
        print(f"{key:15}: {display_val}")

    # 4. Save to JSON
    out_file = "patient_vitals.json"
    with open(out_file, "w") as f:
        json.dump(data, f, indent=4)
    print(f"\n[SUCCESS] Results saved to '{out_file}'")

if __name__ == "__main__":
    main()

[INFO] Processing file: C:\Users\saipr\Downloads\satish-patnaik-report-17-Jan-2026-1768654292841.pdf

       EXTRACTION RESULTS       
Glucose        : 146 mg/dL
HbA1c          : 8.2 %
Creatinine     : 8.41 mg/dL
Urea           : 18 mg/dL
Hemoglobin     : 14.7 g/dL
WBC            : 5520 /cmm
Cholesterol    : 263 mg/dL
Triglycerides  : 293 mg/dL
TSH            : 7.415 ¬µIU/mL
Vitamin D      : ---
Vitamin B12    : ---
Age            : 54 Years

[SUCCESS] Results saved to 'patient_vitals.json'


In [26]:
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer
import os

# ==========================================
# 1. Configuration & Paths
# ==========================================
# Ensure these paths match your local setup
JSON_INPUT = "patient_vitals.json"
MODEL_PATH = r"C:\AINutriCare\Notebooks\Milestone_2\LSTM\attention_lstm.h5"
SCALER_PATH = r"C:\AINutriCare\Data\Transformed\X_final.npy"

# The 17 Features the Model was trained on (Must be in this exact order)
MODEL_FEATURES = [
    "Heart Rate", "MAP", "Respiratory Rate", "Temperature", 
    "Glucose", "Creatinine", "BUN", "Sodium", "Potassium", "Hemoglobin", "WBC", "Lactate",
    "Fluid Balance", "Vasopressors", "Sedatives", "Antibiotics", "Insulin"
]

# Defaults for values NOT in the PDF (Assumes resting/stable state for missing vitals)
DEFAULTS = {
    'Heart Rate': 75, 'MAP': 90, 'Respiratory Rate': 16, 'Temperature': 98.4,
    'Lactate': 1.0, 'Fluid Balance': 0, 'Vasopressors': 0, 'Sedatives': 0, 
    'Antibiotics': 0, 'Insulin': 0
}

# ==========================================
# 2. Define Custom Layer (Required to load model)
# ==========================================
@tf.keras.utils.register_keras_serializable()
class SimpleAttention(Layer):
    def __init__(self, units=64, **kwargs):
        super(SimpleAttention, self).__init__(**kwargs)
        self.units = units
    def get_config(self):
        config = super(SimpleAttention, self).get_config()
        config.update({"units": self.units})
        return config
    def build(self, input_shape):
        self.W1 = self.add_weight(name='att_w1', shape=(input_shape[-1], self.units), initializer='glorot_uniform')
        self.W2 = self.add_weight(name='att_w2', shape=(self.units, 1), initializer='glorot_uniform')
        self.b1 = self.add_weight(name='att_b1', shape=(self.units,), initializer='zeros')
        super(SimpleAttention, self).build(input_shape)
    def call(self, x):
        h = tf.nn.tanh(tf.matmul(x, self.W1) + self.b1)
        e = tf.squeeze(tf.matmul(h, self.W2), -1)
        alpha = tf.nn.softmax(e)
        context = x * tf.expand_dims(alpha, -1)
        context = tf.reduce_sum(context, axis=1)
        return context, alpha

# ==========================================
# 3. Load Resources
# ==========================================
def load_ai_resources():
    print("Loading AI Model & Scaler...")
    try:
        # 1. Load Model
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
        model = load_model(MODEL_PATH, custom_objects={'SimpleAttention': SimpleAttention})
        
        # 2. Load Scaler Statistics (Mean/Std from Training)
        if os.path.exists(SCALER_PATH):
            X_ref = np.load(SCALER_PATH)
            X_flat = X_ref.reshape(-1, X_ref.shape[2])
            means = np.mean(X_flat, axis=0)
            stds = np.std(X_flat, axis=0)
            stds[stds == 0] = 1.0 # Prevent divide by zero
        else:
            print("‚ö†Ô∏è Scaler file (X_final.npy) not found. Using raw unscaled values (Results may be inaccurate).")
            means = np.zeros(17)
            stds = np.ones(17)
            
        print("‚úÖ AI Resources Loaded Successfully.")
        return model, means, stds
        
    except Exception as e:
        print(f"‚ùå Error Loading AI Resources: {e}")
        return None, None, None

# ==========================================
# 4. Data Processing (JSON -> Tensor)
# ==========================================
def preprocess_patient_data(json_file, means, stds):
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    # helper to safely get float values
    def get_val(key):
        item = data.get(key)
        if isinstance(item, dict):
            val = item.get('value')
        else:
            val = item
            
        if val in [None, "N/A", "Not Found"]: return None
        try:
            # Clean string like "H 141.0" -> 141.0
            clean = str(val).replace('H', '').replace('L', '').strip()
            return float(clean)
        except:
            return None

    # 1. Build Feature Vector (17,)
    vector = []
    extracted_values = {} # For reporting
    
    for feature in MODEL_FEATURES:
        # Map Model Feature Names to JSON Keys
        json_key = feature
        if feature == "Cholesterol": json_key = "Cholestrol" # Handle typo in json if present
        
        val = get_val(json_key)
        
        # Fallback Logic
        if val is None:
            val = DEFAULTS.get(feature, 0)
        
        # Scaling fixes (e.g. WBC 10570 -> 10.57)
        if feature == "WBC" and val > 1000:
            val = val / 1000.0
            
        vector.append(val)
        extracted_values[feature] = val

    # 2. Create Time Series (Repeat static data for 24 hours)
    # Shape: (1, 24, 17)
    patient_matrix = np.tile(vector, (24, 1))
    
    # 3. Normalize
    normalized_matrix = (patient_matrix - means) / stds
    input_tensor = normalized_matrix.reshape(1, 24, 17)
    
    return input_tensor, extracted_values, data

# ==========================================
# 5. Diet Logic (Post-Prediction)
# ==========================================
def generate_clinical_report(risk_score, vitals, raw_json):
    print(f"\n{'='*60}")
    print(f" AI CLINICAL ANALYSIS REPORT")
    print(f"{'='*60}")
    
    # --- 1. Risk Interpretation ---
    if risk_score > 0.60:
        status = "HIGH RISK (Critical)"
        action = "Immediate Metabolic Intervention"
    elif risk_score > 0.30:
        status = "MODERATE RISK"
        action = "Dietary Management & Monitoring"
    else:
        status = "STABLE"
        action = "Routine Maintenance"
        
    print(f"\n[1] MODEL PREDICTION")
    print(f"    Mortality/ICU Risk: {risk_score:.2%}")
    print(f"    Clinical Status:    {status}")
    print(f"    Recommended Action: {action}")

    # --- 2. Key Drivers ---
    print(f"\n[2] BIOMARKER ANALYSIS")
    
    # Check Diabetes
    gluc = vitals['Glucose']
    hba1c_val = raw_json.get('HbA1c', {}).get('value', 'N/A')
    print(f"    - Glucose: {gluc} mg/dL", end="")
    if gluc > 140: print(" (HIGH - Driver for Risk)")
    else: print(" (Normal)")
    
    print(f"    - HbA1c:   {hba1c_val} %", end="")
    try:
        if float(str(hba1c_val).replace('H','')) > 6.5: print(" (DIABETIC RANGE)")
        else: print("")
    except: print("")

    # Check Renal
    creat = vitals['Creatinine']
    print(f"    - Creatinine: {creat} mg/dL", end="")
    if creat > 1.2: print(" (RENAL STRESS)")
    else: print(" (Normal)")

    # --- 3. Diet Plan ---
    print(f"\n[3] AI-GENERATED NUTRITION PLAN")
    
    if gluc > 126 or (hba1c_val != 'N/A' and float(str(hba1c_val).replace('H','')) > 6.5):
        print("    Protocol: DIABETIC / LOW-GLYCEMIC INDEX")
        print("    - Carbohydrates: Restricted to 40% of total calories.")
        print("    - Focus: Complex carbs only (Fiber > 30g/day).")
        print("    - Avoid: Fruit juices, white bread, processed sugars.")
    elif risk_score > 0.5:
        print("    Protocol: CRITICAL CARE SUPPORT (High Protein)")
        print("    - Focus: Preventing muscle wasting (Catabolism).")
    else:
        print("    Protocol: STANDARD BALANCED DIET")
        print("    - Maintain current nutritional intake.")

    print("-" * 60)

def generate_clinical_report(risk_score, vitals, raw_json):
    """
    Analyzes prediction & vitals to create a JSON for the LLM.
    """
    print(f"\nProcessing AI Clinical Analysis...")

    # --- 1. Initialize Structure for LLM ---
    llm_context = {
        "patient_metrics": {
            "mortality_risk": float(risk_score),  # JSON needs native float, not numpy
            "glucose": float(vitals['Glucose']),
            "creatinine": float(vitals['Creatinine'])
        },
        "conditions": [],
        "avoid": [],
        "recommend": [],
        "summary": ""
    }

    # --- 2. Risk Interpretation ---
    if risk_score > 0.60:
        llm_context['conditions'].append("Critical Stability Risk")
        llm_context['summary'] = "Patient is at HIGH RISK. Immediate metabolic intervention required."
    elif risk_score > 0.30:
        llm_context['conditions'].append("Moderate Clinical Risk")
        llm_context['summary'] = "Patient requires dietary management and monitoring."
    else:
        llm_context['summary'] = "Patient is stable. Routine maintenance diet recommended."

    # --- 3. Biomarker Analysis (Logic -> Rules) ---
    
    # Check Diabetes / Glucose
    glucose = vitals['Glucose']
    hba1c_val = raw_json.get('HbA1c', {}).get('value', 'N/A')
    
    is_diabetic = False
    if glucose > 126:
        is_diabetic = True
    # Handle H141 type strings if present in raw json
    if hba1c_val != 'N/A':
        try:
            val = float(str(hba1c_val).replace('H','').replace('L',''))
            if val > 6.5: is_diabetic = True
        except: pass

    if is_diabetic:
        llm_context['conditions'].append("Diabetes (Type 2 / Hyperglycemia)")
        llm_context['avoid'].extend(["Fruit juices", "White bread", "Processed sugars", "High-GI foods"])
        llm_context['recommend'].extend(["Complex carbohydrates", "High fiber foods (>30g/day)", "Leafy greens"])

    # Check Renal (Kidneys)
    creatinine = vitals['Creatinine']
    if creatinine > 1.2:
        llm_context['conditions'].append("Renal Stress / Kidney Strain")
        llm_context['avoid'].extend(["High sodium foods", "Excessive red meat", "Processed deli meats"])
        llm_context['recommend'].extend(["Low-potassium vegetables", "Cauliflower", "Berries"])

    # Check Hypertension (using MAP as proxy if BP not split)
    # MAP > 100 often correlates with high BP
    if vitals['MAP'] > 100: 
        llm_context['conditions'].append("Hypertension Risk")
        llm_context['avoid'].append("Salt/Sodium")
        llm_context['recommend'].append("DASH diet principles")

    # If no specific conditions found, add general healthy advice
    if not llm_context['conditions']:
        llm_context['conditions'].append("General Health Maintenance")
        llm_context['recommend'].append("Balanced diet with lean proteins and vegetables")

    return llm_context

# ==========================================
# 6. Main Execution Loop
# ==========================================
if __name__ == "__main__":
    # 1. Load Model
    model, means, stds = load_ai_resources()
    
    if model:
        # 2. Process Data
        if os.path.exists(JSON_INPUT):
            input_tensor, vitals_dict, raw_data = preprocess_patient_data(JSON_INPUT, means, stds)
            
            # 3. Predict
            print("Running LSTM Prediction...")
            prediction = model.predict(input_tensor, verbose=0)[0][0]
            
            # 4. Generate & Save Report
            ai_output = generate_clinical_report(prediction, vitals_dict, raw_data)
            
            # Output Filename
            OUTPUT_FILE = "clinical_output.json"
            
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(ai_output, f, indent=4)
                
            print(f"‚úÖ Success! Analysis saved to: {OUTPUT_FILE}")
            print(json.dumps(ai_output, indent=2)) # Print preview
            
        else:
            print(f"‚ùå Error: {JSON_INPUT} not found. Run the extraction step first.")



Loading AI Model & Scaler...
‚úÖ AI Resources Loaded Successfully.
Running LSTM Prediction...

Processing AI Clinical Analysis...
‚úÖ Success! Analysis saved to: clinical_output.json
{
  "patient_metrics": {
    "mortality_risk": 0.32739484310150146,
    "glucose": 146.0,
    "creatinine": 8.41
  },
  "conditions": [
    "Moderate Clinical Risk",
    "Diabetes (Type 2 / Hyperglycemia)",
    "Renal Stress / Kidney Strain"
  ],
  "avoid": [
    "Fruit juices",
    "White bread",
    "Processed sugars",
    "High-GI foods",
    "High sodium foods",
    "Excessive red meat",
    "Processed deli meats"
  ],
  "recommend": [
    "Complex carbohydrates",
    "High fiber foods (>30g/day)",
    "Leafy greens",
    "Low-potassium vegetables",
    "Cauliflower",
    "Berries"
  ],
  "summary": "Patient requires dietary management and monitoring."
}


In [10]:
!pip install google-genai

Collecting google-genai
  Downloading google_genai-1.58.0-py3-none-any.whl.metadata (53 kB)
Collecting tenacity<9.2.0,>=8.2.3 (from google-genai)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Using cached websockets-15.0.1-cp313-cp313-win_amd64.whl.metadata (7.0 kB)
Collecting distro<2,>=1.7.0 (from google-genai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting sniffio (from google-genai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Downloading google_genai-1.58.0-py3-none-any.whl (718 kB)
   ---------------------------------------- 0.0/718.4 kB ? eta -:--:--
   ----------------------------- ---------- 524.3/718.4 kB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 718.4/718.4 kB 3.0 MB/s  0:00:00
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Using cached websockets-15.0.1-cp313-cp313-win_amd64


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


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
AIzaSyD_lIMzf-pDJhSOZoEo8o5PtZUik6Yit6c

In [28]:
import json
import os
import pandas as pd

# ==========================================
# 1. CONFIGURATION (GOOGLE GEMINI)
# ==========================================
from google import genai
from google.genai import types

GEMINI_API_KEY = "AIzaSyD_lIMzf-pDJhSOZoEo8o5PtZUik6Yit6c"  # or set env var GEMINI_API_KEY
MODEL_ID = "gemini-2.5-flash"  # choose your Gemini model

# File Paths
CLINICAL_INPUT = "clinical_output.json"
FOOD_KB_FILE = "diet_kb.json"

client = genai.Client(api_key=GEMINI_API_KEY)


# ==========================================
# 2. AUTO-TAGGING & FILTERING SYSTEM
# ==========================================
def load_and_tag_data(filepath):
    """
    Loads diet_kb.json and adds medical tags based on macros.
    """
    if not os.path.exists(filepath):
        return pd.DataFrame()  # Return empty if missing

    with open(filepath, "r") as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    # --- AUTO-TAGGING LOGIC ---
    def get_tags(row):
        tags = []
        # Diabetic Friendly: Low Carb (<30g) OR Low Sugar (ingredients check)
        if row["Carbohydrate (g)"] < 30 and "sugar" not in str(row["ingredients"]).lower():
            tags.append("diabetic_friendly")
            tags.append("low_sugar")

        # High Protein: > 10g
        if row["Protein (g)"] > 10:
            tags.append("high_protein")

        # Low Fat: < 8g
        if row["Total Fat (g)"] < 8:
            tags.append("low_fat")

        # Renal Safe (Simplified): Moderate Protein (5‚Äì15g)
        if 5 < row["Protein (g)"] < 15:
            tags.append("renal_safe")

        return tags

    df["medical_tags"] = df.apply(get_tags, axis=1)
    return df


def get_smart_candidates(df, clinical_insights):
    """
    Filters the tagged dataframe to find the best candidates for the patient.
    """
    if df.empty:
        return {}

    # 1. Parse Constraints
    conditions = " ".join(clinical_insights.get("conditions", [])).lower()
    avoid = " ".join(clinical_insights.get("avoid", [])).lower()

    # 2. Apply Filters
    candidates = df.copy()

    # DIABETES FILTER
    if "diabetes" in conditions or "sugar" in avoid:
        candidates = candidates[candidates["medical_tags"].apply(lambda x: "diabetic_friendly" in x)]

    # RENAL FILTER
    if "renal" in conditions or "kidney" in conditions:
        candidates = candidates[candidates["medical_tags"].apply(lambda x: "renal_safe" in x)]

    # 3. Categorize
    breakfast_keywords = "idli|dosa|upma|poha|paratha|oats|porridge"
    breakfast_df = candidates[candidates["name"].str.contains(breakfast_keywords, case=False, na=False)]
    if len(breakfast_df) < 2:
        breakfast_df = candidates.sample(n=min(5, len(candidates)))

    # Lunch/Dinner: high calorie mains
    mains_df = candidates[candidates["Energy (kcal)"] > 150]

    # Snacks: low calorie
    snacks_df = candidates[candidates["Energy (kcal)"] < 150]

    def serialize(sub_df, count=5):
        return sub_df.sample(n=min(count, len(sub_df))).to_dict(orient="records")

    return {
        "breakfast": serialize(breakfast_df),
        "lunch": serialize(mains_df),
        "dinner": serialize(mains_df),
        "snacks": serialize(snacks_df),
    }


# ==========================================
# 3. LLM GENERATOR (Structure Enforcer ‚Äì GEMINI)
# ==========================================
def generate_structured_plan(patient_profile, clinical_data, food_candidates):
    summary = clinical_data.get("summary", "Healthy Diet")
    options_preview = json.dumps(food_candidates, indent=2)

    prompt = f"""
    You are an AI Clinical Dietitian.

    PATIENT: {patient_profile['name']}
    CONDITION: {summary}

    TASK:
    Select items from the PROVIDED CANDIDATE LIST below to create a 1-day meal plan.
    You must output the result in STRICT JSON format matching the user's required schema.

    CANDIDATE FOODS (Pick from these):
    {options_preview}

    REQUIRED OUTPUT FORMAT (JSON):
    {{
      "day_plan": {{
          "breakfast": [
              {{ "item": "Name", "calories": 100, "protein": 5, "fat": 2, "carbs": 10, "tags": ["tag1", "tag2"] }}
          ],
          "lunch": [],
          "dinner": [],
          "snacks": []
      }},
      "total_nutrition": {{
          "calories": 0,
          "protein": 0,
          "carbs": 0,
          "fat": 0
      }},
      "medical_reasoning": "Brief explanation..."
    }}

    RULES:
    1. Use the EXACT nutritional values from the candidate list. Do not invent numbers.
    2. Include the 'medical_tags' provided in the candidate list.
    3. Calculate the 'total_nutrition' sum correctly.
    4. Output JSON ONLY. No text before or after.
    """

    # JSON schema ‚Äì fully specified for Gemini
    schema = {
        "type": "object",
        "properties": {
            "day_plan": {
                "type": "object",
                "properties": {
                    "breakfast": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "item": {"type": "string"},
                                "calories": {"type": "number"},
                                "protein": {"type": "number"},
                                "fat": {"type": "number"},
                                "carbs": {"type": "number"},
                                "tags": {
                                    "type": "array",
                                    "items": {"type": "string"},
                                },
                            },
                            "required": [
                                "item",
                                "calories",
                                "protein",
                                "fat",
                                "carbs",
                                "tags",
                            ],
                        },
                    },
                    "lunch": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "item": {"type": "string"},
                                "calories": {"type": "number"},
                                "protein": {"type": "number"},
                                "fat": {"type": "number"},
                                "carbs": {"type": "number"},
                                "tags": {
                                    "type": "array",
                                    "items": {"type": "string"},
                                },
                            },
                            "required": [
                                "item",
                                "calories",
                                "protein",
                                "fat",
                                "carbs",
                                "tags",
                            ],
                        },
                    },
                    "dinner": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "item": {"type": "string"},
                                "calories": {"type": "number"},
                                "protein": {"type": "number"},
                                "fat": {"type": "number"},
                                "carbs": {"type": "number"},
                                "tags": {
                                    "type": "array",
                                    "items": {"type": "string"},
                                },
                            },
                            "required": [
                                "item",
                                "calories",
                                "protein",
                                "fat",
                                "carbs",
                                "tags",
                            ],
                        },
                    },
                    "snacks": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "item": {"type": "string"},
                                "calories": {"type": "number"},
                                "protein": {"type": "number"},
                                "fat": {"type": "number"},
                                "carbs": {"type": "number"},
                                "tags": {
                                    "type": "array",
                                    "items": {"type": "string"},
                                },
                            },
                            "required": [
                                "item",
                                "calories",
                                "protein",
                                "fat",
                                "carbs",
                                "tags",
                            ],
                        },
                    },
                },
                "required": ["breakfast", "lunch", "dinner", "snacks"],
            },
            "total_nutrition": {
                "type": "object",
                "properties": {
                    "calories": {"type": "number"},
                    "protein": {"type": "number"},
                    "fat": {"type": "number"},
                    "carbs": {"type": "number"},
                },
                "required": ["calories", "protein", "fat", "carbs"],
            },
            "medical_reasoning": {"type": "string"},
        },
        "required": ["day_plan", "total_nutrition", "medical_reasoning"],
    }

    try:
        print("ü§ñ AI is structuring the JSON plan...")
        resp = client.models.generate_content(
            model=MODEL_ID,
            contents=prompt,
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1,
            ),
        )

        # In JSON mode, resp.parsed is already a Python dict
        if hasattr(resp, "parsed") and resp.parsed is not None:
            return resp.parsed

        raw_text = resp.text.strip()
        return json.loads(raw_text)

    except Exception as e:
        raw_text = ""
        try:
            raw_text = resp.text  # may exist if request reached the model
        except:
            pass
        return {"error": str(e), "raw_output": raw_text}


# ==========================================
# 4. ROUNDING HELPER
# ==========================================
def round_plan(plan, ndigits=0):
    for meal in ["breakfast", "lunch", "dinner", "snacks"]:
        for dish in plan.get("day_plan", {}).get(meal, []):
            for key in ["calories", "protein", "fat", "carbs"]:
                val = dish.get(key)
                if isinstance(val, (int, float)):
                    dish[key] = round(val, ndigits)

    tn = plan.get("total_nutrition", {})
    for key in ["calories", "protein", "fat", "carbs"]:
        val = tn.get(key)
        if isinstance(val, (int, float)):
            tn[key] = round(val, ndigits)
    return plan


# ==========================================
# 5. MAIN EXECUTION
# ==========================================
if __name__ == "__main__":

    print("Loading Food DB...")
    df = load_and_tag_data(FOOD_KB_FILE)

    # Clinical insights
    if os.path.exists(CLINICAL_INPUT):
        with open(CLINICAL_INPUT, "r") as f:
            clinical_insights = json.load(f)
    else:
        clinical_insights = {
            "conditions": ["Diabetes"],
            "summary": "Patient requires low-sugar, low-carb diet.",
        }

    candidates = get_smart_candidates(df, clinical_insights)

    patient = {"name": "Rajesh Kumar", "age": 45}
    final_json = generate_structured_plan(patient, clinical_insights, candidates)
    final_json = round_plan(final_json, ndigits=0)

    if "error" not in final_json:
        print("\n‚úÖ GENERATED JSON:")
        print(json.dumps(final_json, indent=2))

        with open("final_structured_diet.json", "w") as f:
            json.dump(final_json, f, indent=2)
            print("\nSaved to 'final_structured_diet.json'")
    else:
        print("‚ùå Error:", final_json["error"])


Loading Food DB...
ü§ñ AI is structuring the JSON plan...

‚úÖ GENERATED JSON:
{
  "day_plan": {
    "breakfast": [
      {
        "item": "Theeyal",
        "calories": 155.0,
        "protein": 14.0,
        "fat": 31.0,
        "carbs": 16.0,
        "tags": [
          "diabetic_friendly",
          "low_sugar",
          "high_protein",
          "renal_safe"
        ]
      },
      {
        "item": "Brown Rice",
        "calories": 139.0,
        "protein": 14.0,
        "fat": 13.0,
        "carbs": 22.0,
        "tags": [
          "diabetic_friendly",
          "low_sugar",
          "high_protein",
          "renal_safe"
        ]
      }
    ],
    "lunch": [
      {
        "item": "Keerai kootu",
        "calories": 278.0,
        "protein": 8.0,
        "fat": 37.0,
        "carbs": 11.0,
        "tags": [
          "diabetic_friendly",
          "low_sugar",
          "renal_safe"
        ]
      }
    ],
    "dinner": [
      {
        "item": "Vindaloo",
        "c