# **Milestone-3: NLP/AI Text Interpretation**

In [1]:
# install all required dependencies
!apt-get update -qq
!apt-get install -y poppler-utils tesseract-ocr -qq
!pip install PyPDF2 pdf2image pdfplumber pytesseract pillow tensorflow -qq


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [

In [2]:
# import all libraries
import os
import re
import json
import numpy as np
from pathlib import Path

import PyPDF2
from pdf2image import convert_from_path
import pytesseract

from tensorflow.keras.models import load_model


In [3]:
# define the directories
BASE_DIR = Path("/content/drive/MyDrive/AINutriCare")

RAW_REPORTS = BASE_DIR / "/content/drive/MyDrive/AINutriCare/Data/Raw_Reports"
X_PATH = BASE_DIR / "/content/drive/MyDrive/AINutriCare/Data/Transformed_data/X_timeseries.npy"
MODEL_PATH = BASE_DIR / "/content/drive/MyDrive/AINutriCare/Data/Transformed_data/icu_outcome_lstm (1).h5"
OUT_DIR = BASE_DIR / "Data/Final_Output"

OUT_DIR.mkdir(parents=True, exist_ok=True)


In [4]:
# define trained model path
X_train = np.load(X_PATH)

TIME_STEPS = X_train.shape[1]
NUM_FEATURES = X_train.shape[2]

FEATURE_MEAN = X_train.mean(axis=(0,1))
FEATURE_STD = X_train.std(axis=(0,1)) + 1e-6


In [5]:
# load the model
model = load_model(MODEL_PATH)
model.summary()




In [6]:
# extract the text from the pdf
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            if page.extract_text():
                text += page.extract_text() + "\n"
    return text

def is_scanned_pdf(pdf_path):
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            if page.extract_text():
                return False
    return True

def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    return " ".join(pytesseract.image_to_string(img, config="--psm 6") for img in images)


In [7]:
# clean the text
def clean_text(text):
    text = text.replace("\x0c", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [8]:
# extract the vitals and necesaary information
def extract_float(text, pattern):
    m = re.search(pattern, text, re.I)
    return float(m.group(1)) if m else None

def extract_int(text, pattern):
    m = re.search(pattern, text, re.I)
    return int(m.group(1)) if m else None

def extract_age(text):
    m = re.search(r"\bage\s*[:\-]?\s*(\d{2,3})\b", text, re.I)
    if not m:
        return None
    age = int(m.group(1))
    return age if 18 <= age <= 100 else None

def extract_features(text):
    f = {}

    # VITALS
    f["heart_rate"] = extract_int(text, r"(?:heart rate|hr|pulse)\s*[:\-]?\s*(\d+)")
    f["respiratory_rate"] = extract_int(text, r"(?:respiratory rate|rr)\s*[:\-]?\s*(\d+)")
    f["spo2"] = extract_int(text, r"(?:spo2|o2 saturation)\s*[:\-]?\s*(\d+)")
    f["temperature_fahrenheit"] = extract_float(text, r"(?:temperature|temp)\s*[:\-]?\s*([0-9.]+)")
    f["mean_bp"] = extract_int(text, r"(?:mean bp|blood pressure mean|map)\s*[:\-]?\s*(\d+)")

    # LABS
    f["glucose"] = extract_float(text, r"(?:glucose|blood sugar)\s*[:\-]?\s*([0-9.]+)")
    f["creatinine"] = extract_float(text, r"creatinine\s*[:\-]?\s*([0-9.]+)")
    f["urea"] = extract_float(text, r"(?:urea|bun)\s*[:\-]?\s*([0-9.]+)")
    f["sodium"] = extract_float(text, r"sodium\s*[:\-]?\s*([0-9.]+)")
    f["potassium"] = extract_float(text, r"potassium\s*[:\-]?\s*([0-9.]+)")
    f["hemoglobin"] = extract_float(text, r"(?:hemoglobin|hgb)\s*[:\-]?\s*([0-9.]+)")
    f["wbc"] = extract_float(text, r"wbc\s*[:\-]?\s*([0-9.]+)")
    f["lactate"] = extract_float(text, r"lactate\s*[:\-]?\s*([0-9.]+)")
    f["ph"] = extract_float(text, r"\bph\s*[:\-]?\s*([0-9.]+)")
    f["cholesterol"] = extract_float(text, r"cholesterol\s*[:\-]?\s*([0-9.]+)")

    # INTERVENTIONS
    t = text.lower()
    f["fluid_balance"] = int("fluid" in t)
    f["vasopressors"] = int("vasopressor" in t)
    f["sedatives"] = int("sedative" in t)
    f["antibiotics"] = int("antibiotic" in t)
    f["insulin"] = int("insulin" in t)

    # DEMOGRAPHICS
    f["age"] = extract_age(text)

    gender = re.search(r"\b(sex|gender)\s*[:\-]?\s*(male|female)\b", text, re.I)
    f["gender"] = 1 if gender and gender.group(2).lower() == "male" else 0

    return f


In [9]:
# define features orders from the trained model
FEATURE_ORDER = [
    # VITALS
    "heart_rate",
    "respiratory_rate",
    "spo2",
    "temperature_fahrenheit",
    "mean_bp",

    # LABS
    "glucose",
    "creatinine",
    "urea",
    "sodium",
    "potassium",
    "hemoglobin",
    "wbc",
    "lactate",
    "ph",
    "cholesterol",

    # INTERVENTIONS
    "fluid_balance",
    "vasopressors",
    "sedatives",
    "antibiotics",
    "insulin",

    # DEMOGRAPHICS
    "age",
    "gender"
]


In [10]:
# build model
def build_model_input(features):
    x = []
    for i, key in enumerate(FEATURE_ORDER):
        val = features.get(key)
        if val is None:
            val = FEATURE_MEAN[i]
        val = (val - FEATURE_MEAN[i]) / FEATURE_STD[i]
        x.append(val)

    x = np.array(x, dtype=np.float32)
    x = np.tile(x, (TIME_STEPS, 1))
    return x.reshape(1, TIME_STEPS, NUM_FEATURES)


In [11]:
# calculate the mortality risk
clinical_output = {}

for file in os.listdir(RAW_REPORTS):
    path = RAW_REPORTS / file

    raw = ocr_pdf(path) if file.endswith(".pdf") and is_scanned_pdf(path) else extract_text_from_pdf(path)
    text = clean_text(raw)

    features = extract_features(text)
    x_input = build_model_input(features)

    risk = float(model.predict(x_input, verbose=0)[0][0])

    clinical_output[path.stem] = {
        "mortality_risk": round(risk, 4),
        "risk_category": "Low" if risk < 0.3 else "Moderate" if risk < 0.6 else "High",
        "features_used": features
    }


In [12]:
# INSTALL DEPENDENCIES
!pip install pdfplumber PyPDF2 pdf2image pytesseract pillow tensorflow numpy
!apt-get update
!apt-get install -y poppler-utils tesseract-ocr

Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the ne

In [13]:

# IMPORTS
import re
import json
import numpy as np
import tensorflow as tf
import pdfplumber
from pdf2image import convert_from_path
import pytesseract


# PATH CONFIG

PDF_PATH = "/content/drive/MyDrive/AINutriCare/Data/Raw_Reports/REPORT.pdf"   # INPUT REPORT
MODEL_PATH = "/content/drive/MyDrive/AINutriCare/Data/Transformed_data/icu_outcome_lstm (1).h5"
X_REF_PATH = "/content/drive/MyDrive/AINutriCare/Data/Transformed_data/X_timeseries.npy"
OUTPUT_JSON = "clinical_output.json"



# FEATURE ORDER (MUST MATCH TRAINING)

FEATURE_NAMES = [
    "Heart Rate", "Respiratory Rate", "O2 saturation pulseoxymetry",
    "Temperature Fahrenheit", "Non Invasive Blood Pressure mean",
    "Glucose", "Creatinine", "Urea (BUN)", "Sodium", "Potassium",
    "Hemoglobin", "WBC", "Lactate", "pH", "Cholesterol",
    "Fluid_Balance", "Vasopressors", "Sedatives", "Antibiotics",
    "Insulin", "Age", "Gender"
]


# PDF TEXT EXTRACTION

def extract_pdf_text(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                if page.extract_text():
                    text += page.extract_text() + "\n"
        if len(text.strip()) < 50:
            # OCR fallback
            images = convert_from_path(pdf_path)
            for img in images:
                text += pytesseract.image_to_string(img)
    except Exception as e:
        print("Error reading PDF:", e)
    return text


# CLEAN TEXT

def clean_text(text):
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# SAFE FLOAT EXTRACTION

def extract_float(text, pattern):
    m = re.search(pattern, text, re.I)
    if not m:
        return None
    for g in m.groups():
        try:
            return float(g)
        except:
            continue
    return None



# DYNAMIC FEATURE EXTRACTION

def extract_features(text):
    features = {}

    # ---- AGE & GENDER ----
    ag = re.search(r"(Male|Female)\s*/\s*(\d+)\s*Y", text, re.I)
    if ag:
        gender_str = ag.group(1)
        features["Gender"] = 1 if gender_str.lower() == "male" else 0
        features["Age"] = int(ag.group(2))
    else:
        features["Gender"] = None
        features["Age"] = None

    # ---- HEART RATE ----
    for p in [r"Heart Rate[:\s]+([\d.]+)",
        r"HR[:\s]+([\d.]+)",
        r"Pulse[:\s]+([\d.]+)",      # added Pulse
        r"Pulse Rate[:\s]+([\d.]+)",
        r"PR[:\s]+([\d.]+)"
    ]:
        val = extract_float(text, p)
        if val is not None:
            features["Heart Rate"] = val
            break
    else:
        features["Heart Rate"] = None

    # ---- TEMPERATURE ----
    for p in [r"Temperature[:\s]+([\d.]+)", r"Temp[:\s]+([\d.]+)"]:
        val = extract_float(text, p)
        if val is not None:
            features["Temperature Fahrenheit"] = val
            break
    else:
        features["Temperature Fahrenheit"] = None

    # ---- BLOOD PRESSURE ----
    for p in [r"BP[:\s]+([\d.]+)", r"Blood Pressure[:\s]+([\d.]+)", r"Mean BP[:\s]+([\d.]+)"]:
        val = extract_float(text, p)
        if val is not None:
            features["Non Invasive Blood Pressure mean"] = val
            break
    else:
        features["Non Invasive Blood Pressure mean"] = None

    # ---- LABS (DYNAMICALLY HANDLE ALIASES) ----
    lab_patterns = {
        "Glucose": [
        r"Glucose[:\s]+([\d.]+)",
        r"Blood Sugar[:\s]+([\d.]+)",
        r"Fasting Blood Sugar[:\s]+([\d.]+)",
        r"FBS[:\s]+([\d.]+)"
    ],

        "Creatinine": [r"Creatinine[:\s]+([\d.]+)"],
        "Urea (BUN)": [r"(Urea|BUN)[:\s]+([\d.]+)"],
        "Sodium": [r"Sodium[:\s]+([\d.]+)"],
        "Potassium": [r"Potassium[:\s]+([\d.]+)"],
        "Hemoglobin": [r"Hemoglobin[:\s]+([\d.]+)"],
        "WBC": [
        r"WBC[:\s]+([\d.]+)",
        r"White Blood Cells[:\s]+([\d.]+)",
        r"WBC count[:\s]+([\d.]+)"
    ],
        "Lactate": [r"Lactate[:\s]+([\d.]+)"],
        "pH": [r"pH[:\s]+([\d.]+)"],
        "Cholesterol": [r"Cholesterol[:\s]+([\d.]+)"]
    }

    for lab, patterns in lab_patterns.items():
        features[lab] = None
        for p in patterns:
            val = extract_float(text, p)
            if val is not None:
                features[lab] = val
                break

    return features


# BUILD MODEL INPUT

def build_model_input(extracted, x_reference):
    feature_vector = np.zeros(len(FEATURE_NAMES), dtype=np.float32)
    for i, fname in enumerate(FEATURE_NAMES):
        val = extracted.get(fname)
        if val is None:
            feature_vector[i] = np.nan
        else:
            feature_vector[i] = float(val)
    # NaN → 0 ONLY for model
    feature_vector_model = np.nan_to_num(feature_vector, nan=0.0)
    x_input = np.tile(feature_vector_model, (24,1))
    x_input = np.expand_dims(x_input, axis=0)
    return x_input


# CLINICAL INTERPRETATION

def clinical_interpretation(risk, vitals):
    conditions = []
    recommendations = []
    avoid = []

    if risk < 0.3:
        conditions.append("Low Clinical Risk")
    elif risk < 0.6:
        conditions.append("Moderate Clinical Risk")
    else:
        conditions.append("High Clinical Risk")

    if vitals["glucose"] and vitals["glucose"] >= 140:
        conditions.append("Hyperglycemia / Diabetes Risk")
        recommendations += ["Low-GI foods", "High-fiber diet"]
        avoid += ["Sugary foods", "Sweetened beverages"]

    if vitals["creatinine"] and vitals["creatinine"] >= 1.3:
        conditions.append("Possible Renal Impairment")
        recommendations += ["Renal-friendly diet"]
        avoid += ["High sodium foods"]

    if vitals["hemoglobin"] and vitals["hemoglobin"] < 10:
        conditions.append("Anemia")
        recommendations += ["Iron-rich foods"]

    if vitals["lactate"] and vitals["lactate"] >= 2.0:
        conditions.append("Possible Tissue Hypoperfusion")

    summary = (
        "Patient is clinically stable."
        if risk < 0.3 else
        "Patient requires monitoring and dietary management."
        if risk < 0.6 else
        "Patient requires urgent clinical and nutritional intervention."
    )

    return list(set(conditions)), list(set(recommendations)), list(set(avoid)), summary


# MAIN PIPELINE

def main():
    # Load model and reference data
    model = tf.keras.models.load_model(MODEL_PATH, compile=False)
    x_ref = np.load(X_REF_PATH)

    # Extract PDF text
    raw_text = extract_pdf_text(PDF_PATH)
    text = clean_text(raw_text)

    # Extract dynamic features
    features = extract_features(text)

    # Build input for model
    x_input = build_model_input(features, x_ref)

    # Predict mortality risk
    mortality_risk = float(model.predict(x_input)[0][0])

    # Build patient metrics output
    patient_metrics = {
    "mortality_risk": mortality_risk,
    "heart_rate": features.get("Heart Rate"),  # None if missing
    "temperature_f": features.get("Temperature Fahrenheit"),
    "blood_pressure_mean": features.get("Non Invasive Blood Pressure mean"),
    "glucose": features.get("Glucose"),
    "creatinine": features.get("Creatinine"),
    "urea": features.get("Urea (BUN)"),
    "sodium": features.get("Sodium"),
    "potassium": features.get("Potassium"),
    "hemoglobin": features.get("Hemoglobin"),
    "wbc": features.get("WBC"),
    "lactate": features.get("Lactate"),
    "ph": features.get("pH"),
    "cholesterol": features.get("Cholesterol"),
    "age": features.get("Age"),
    "gender": "Male" if features.get("Gender") == 1 else "Female"
}



    # Clinical interpretation
    conditions, recommendations, avoid, summary = clinical_interpretation(
        mortality_risk, patient_metrics
    )

    # Final JSON
    clinical_output = {
        "patient_metrics": patient_metrics,
        "conditions": conditions,
        "recommendations": recommendations,
        "avoid": avoid,
        "summary": summary
    }

    # Save JSON
    with open(OUTPUT_JSON, "w") as f:
        json.dump(clinical_output, f, indent=4)

    print(json.dumps(clinical_output, indent=4))


# RUN

if __name__ == "__main__":
    main()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step
{
    "patient_metrics": {
        "mortality_risk": 0.2999638617038727,
        "heart_rate": 0.0,
        "temperature_f": null,
        "blood_pressure_mean": null,
        "glucose": 157.07,
        "creatinine": null,
        "urea": null,
        "sodium": null,
        "potassium": null,
        "hemoglobin": 14.5,
        "wbc": null,
        "lactate": null,
        "ph": 6.0,
        "cholesterol": 189.0,
        "age": 41,
        "gender": "Male"
    },
    "conditions": [
        "Low Clinical Risk",
        "Hyperglycemia / Diabetes Risk"
    ],
    "recommendations": [
        "High-fiber diet",
        "Low-GI foods"
    ],
    "avoid": [
        "Sugary foods",
        "Sweetened beverages"
    ],
    "summary": "Patient is clinically stable."
}


In [14]:
# building knowledge base
import pandas as pd
import json

# Load CSVs
ifct_df = pd.read_csv("/content/drive/MyDrive/AINutriCare/Data/Raw_data/ifct2017_compositions.csv")
indian_df = pd.read_csv("/content/drive/MyDrive/AINutriCare/Data/Raw_data/indian_food.csv")

# Inspect columns (for debugging)
print("IFCT columns:", ifct_df.columns.tolist())
print("Indian Food columns:", indian_df.columns.tolist())

# Merge on 'name' column (food name)
merged_df = pd.merge(ifct_df, indian_df, on="name", how="outer")  # outer join keeps all items

# Fill missing nutrition values with 0
nutrition_cols = ['enerc','protcnt','fatce','choavldf']  # calories, protein, fat, carbs
for col in nutrition_cols:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].fillna(0)

# Convert merged CSV to JSON KB
diet_kb = merged_df.to_dict(orient="records")

with open("diet_kb.json", "w") as f:
    json.dump(diet_kb, f, indent=4)

print("Merged diet knowledge base saved to diet_kb.json")



IFCT columns: ['code', 'name', 'scie', 'regn', 'water', 'water_e', 'protcnt', 'protcnt_e', 'ash', 'ash_e', 'fatce', 'fatce_e', 'fibtg', 'fibtg_e', 'fibins', 'fibins_e', 'fibsol', 'fibsol_e', 'choavldf', 'choavldf_e', 'enerc', 'enerc_e', 'dhbenzac34', 'dhbenzac34_e', 'hbenzal3', 'hbenzal3_e', 'pcathac', 'pcathac_e', 'vanlac', 'vanlac_e', 'gallac', 'gallac_e', 'cinmac', 'cinmac_e', 'coumaco', 'coumaco_e', 'coumacp', 'coumacp_e', 'caffac', 'caffac_e', 'chlrac', 'chlrac_e', 'ferac', 'ferac_e', 'apigen', 'apigen_e', 'apigen6cgls', 'apigen6cgls_e', 'apigen7onshps', 'apigen7onshps_e', 'luteol', 'luteol_e', 'kaemf', 'kaemf_e', 'querce', 'querce_e', 'querce3bdgls', 'querce3bdgls_e', 'querce3ortns', 'querce3ortns_e', 'querce3bgls', 'querce3bgls_e', 'isormt', 'isormt_e', 'myrct', 'myrct_e', 'rsvrtol', 'rsvrtol_e', 'hespt', 'hespt_e', 'narng', 'narng_e', 'hespd', 'hespd_e', 'daidzn', 'daidzn_e', 'gnstein', 'gnstein_e', 'epicatec', 'epicatec_e', 'epicategc', 'epicategc_e', 'epicatgc3gal', 'epicatgc

In [15]:
import json

with open("clinical_output.json", "r") as f:
    clinical_data = json.load(f)

print(json.dumps(clinical_data, indent=2))


{
  "patient_metrics": {
    "mortality_risk": 0.2999638617038727,
    "heart_rate": 0.0,
    "temperature_f": null,
    "blood_pressure_mean": null,
    "glucose": 157.07,
    "creatinine": null,
    "urea": null,
    "sodium": null,
    "potassium": null,
    "hemoglobin": 14.5,
    "wbc": null,
    "lactate": null,
    "ph": 6.0,
    "cholesterol": 189.0,
    "age": 41,
    "gender": "Male"
  },
  "conditions": [
    "Low Clinical Risk",
    "Hyperglycemia / Diabetes Risk"
  ],
  "recommendations": [
    "High-fiber diet",
    "Low-GI foods"
  ],
  "avoid": [
    "Sugary foods",
    "Sweetened beverages"
  ],
  "summary": "Patient is clinically stable."
}


In [16]:
# Load diet KB
with open("diet_kb.json", "r") as f:
    diet_kb = json.load(f)

# Construct the prompt
prompt = f"""
You are a clinical diet assistant.
Patient metrics:
{json.dumps(clinical_data['patient_metrics'], indent=2)}

Patient conditions:
{', '.join(clinical_data.get('conditions', []))}

Diet KB contains food items with nutrition info and tags: diabetic_friendly, low_sugar, high_protein, renal_safe.

Generate a 1-day meal plan JSON with the following structure:

{{
  "day_plan": {{
    "breakfast": [{{"item": "...", "calories": 0.0, "protein": 0.0, "fat": 0.0, "carbs": 0.0, "tags": []}}],
    "lunch": [],
    "dinner": [],
    "snacks": []
  }},
  "total_nutrition": {{"calories": 0.0, "protein": 0.0, "fat": 0.0, "carbs": 0.0}},
  "medical_reasoning": "Explain why these items were chosen for the patient."
}}

Use the KB to pick items suitable for the patient's conditions.
"""


In [17]:
# Install the official Gemini Python SDK
!pip install google-generativeai

import json
import os
import google.generativeai as genai




All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


In [18]:
! pip install google-genai




In [19]:
GEMINI_API_KEY = ""  # paste your Gemini API key from AI Studio

In [20]:
! pip install --upgrade google-generativeai




In [21]:
import json
from google import genai


# Configure Gemini API Key

# GEMINI_API_KEY = "YOUR_GEMINI_API_KEY"  # from AI Studio
client = genai.Client(api_key=GEMINI_API_KEY)


#  Load Clinical & Diet KB

with open("clinical_output.json", "r") as f:
    clinical_data = json.load(f)

with open("diet_kb.json", "r") as f:
    diet_kb = json.load(f)


#  Build Prompt

prompt_text = f"""
You are a clinical dietitian AI.

IMPORTANT RULES:
- The patient is an ADULT unless explicitly stated otherwise.
- Do NOT assume pediatric or toddler age.
- Use ONLY Indian food items or Indian-style meals.
- Meals must be realistic (e.g., "Oats with skim milk", "Grilled chicken salad").

Patient clinical metrics:
{json.dumps(clinical_data, indent=2)}

Diet knowledge base:
{json.dumps(diet_kb[:50], indent=2)}

Generate a ONE-DAY diet plan in EXACT JSON FORMAT below.
Do NOT add text outside JSON.

{{
  "day_plan": {{
    "breakfast": [
      {{
        "item": "",
        "calories": 0.0,
        "protein": 0.0,
        "fat": 0.0,
        "carbs": 0.0,
        "tags": []
      }}
    ],
    "lunch": [],
    "dinner": [],
    "snacks": []
  }},
  "total_nutrition": {{
    "calories": 0.0,
    "protein": 0.0,
    "fat": 0.0,
    "carbs": 0.0
  }},
  "medical_reasoning": ""
}}

STRICT TAGGING RULES (MANDATORY):
- If creatinine > 2.0 → EVERY meal item MUST include "renal_safe"
- If glucose > 180 → EVERY meal item MUST include "diabetic_friendly" AND "low_sugar"
- If sodium > 145 → EVERY meal item MUST include "heart_healthy"
- If cholesterol > 240 → AVOID fried foods AND include "low_fat"
- Tags must be added EVEN IF the food is naturally safe

MEAL STYLE REQUIREMENT:
- Meals must be described in natural form:
  Example:
  "Oats with skim milk"
  "Grilled chicken salad with olive oil"
  "Vegetable soup with carrot and bottle gourd"
- Avoid listing raw ingredients alone.
- Follow Indian eating patterns.


"""



#  Call Gemini Model

response = client.models.generate_content(
    model="gemini-2.5-flash",   # or another available Gemini model
    contents=prompt_text        # pass prompt as a plain string
)


#  Extract Text Directly

diet_plan_text = response.text  # this property is correct usage
print("Raw Gemini output:\n", diet_plan_text)


#  Parse JSON Safely

try:
    diet_plan_json = json.loads(diet_plan_text)
except json.JSONDecodeError:
    # Clean up if necessary
    cleaned = diet_plan_text.strip().strip("```json").strip("```")
    diet_plan_json = json.loads(cleaned)


#  Save Output

with open("daily_diet_plan.json", "w") as f:
    json.dump(diet_plan_json, f, indent=4)

print("Diet plan JSON saved as daily_diet_plan.json")


Raw Gemini output:
 ```json
{
  "day_plan": {
    "breakfast": [
      {
        "item": "Bajra Roti (2 medium)",
        "calories": 180.0,
        "protein": 7.0,
        "fat": 5.0,
        "carbs": 30.0,
        "tags": [
          "high_fiber",
          "low_GI",
          "diabetic_friendly",
          "low_sugar"
        ]
      },
      {
        "item": "Mixed Vegetable Sabzi (1 cup, prepared with Agathi leaves, carrots, and green beans with minimal oil)",
        "calories": 100.0,
        "protein": 4.0,
        "fat": 6.0,
        "carbs": 12.0,
        "tags": [
          "high_fiber",
          "low_GI",
          "diabetic_friendly",
          "low_sugar"
        ]
      }
    ],
    "lunch": [
      {
        "item": "Whole Wheat Roti (2 medium)",
        "calories": 200.0,
        "protein": 7.0,
        "fat": 5.0,
        "carbs": 35.0,
        "tags": [
          "high_fiber",
          "low_GI",
          "diabetic_friendly",
          "low_sugar"
        ]
      

In [22]:
import json
import time
from collections import defaultdict

In [23]:
import json
from google import genai

NUM_DAYS = 7  # frontend can change later

# Extend Prompt (NO rewrite)

weekly_prompt = prompt_text + f"""

ADDITIONAL INSTRUCTIONS (DO NOT IGNORE):

- Generate diet plans for {NUM_DAYS} DAYS.
- Each day MUST have meal VARIATION.
- Structure output EXACTLY as below.
- Compute weekly nutrition summary by summing all days.
- Generate a grocery list aggregated from all meals.

FINAL OUTPUT JSON FORMAT (STRICT):

{{
  "days_generated": {NUM_DAYS},
  "diet_plans": {{
    "day_1": {{ ... SAME STRUCTURE AS SINGLE DAY ... }},
    "day_2": {{ ... }},
    "...": {{ }}
  }},
  "weekly_nutrition_summary": {{
    "average_per_day": {{
      "calories": 0.0,
      "protein": 0.0,
      "fat": 0.0,
      "carbs": 0.0
    }},
    "weekly_total": {{
      "calories": 0.0,
      "protein": 0.0,
      "fat": 0.0,
      "carbs": 0.0
    }}
  }},
  "grocery_list": [
    {{
      "item": "",
      "times_used_in_week": 0
    }}
  ]
}}

RULES:
- Maintain ALL medical tagging rules.
- No forbidden foods.
- Indian meals only.
- No text outside JSON.
"""


# Single Gemini Call (Quota Safe)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=weekly_prompt
)

raw_output = response.text.strip()
print("Raw Weekly Output:\n", raw_output)

# Parse JSON Safely

try:
    weekly_diet_json = json.loads(raw_output)
except json.JSONDecodeError:
    cleaned = raw_output.strip("```json").strip("```")
    weekly_diet_json = json.loads(cleaned)


# Save Final Output

with open("weekly_diet_plan.json", "w") as f:
    json.dump(weekly_diet_json, f, indent=4)

print(" Weekly diet plan saved as weekly_diet_plan.json")


Raw Weekly Output:
 ```json
{
  "days_generated": 7,
  "diet_plans": {
    "day_1": {
      "breakfast": [
        {
          "item": "Bajra Porridge",
          "calories": 174.0,
          "protein": 5.48,
          "fat": 2.72,
          "carbs": 30.89,
          "tags": [
            "diabetic_friendly",
            "low_sugar",
            "high_fiber_diet",
            "low_gi_foods"
          ]
        },
        {
          "item": "Almonds",
          "calories": 60.92,
          "protein": 1.84,
          "fat": 5.85,
          "carbs": 0.3,
          "tags": [
            "diabetic_friendly",
            "low_sugar",
            "high_fiber_diet",
            "low_gi_foods"
          ]
        }
      ],
      "lunch": [
        {
          "item": "Barley Roti",
          "calories": 189.47,
          "protein": 6.56,
          "fat": 0.78,
          "carbs": 36.77,
          "tags": [
            "diabetic_friendly",
            "low_sugar",
            "high_fiber_diet",