In [1]:
import os
import pandas as pd
import json

In [2]:
from google import genai

In [9]:
import time

In [4]:
#Read files
data_path = "c:/Users/reema.alhenaki/Desktop/llama3_Data/data/cleaned"
patient_df = pd.read_csv(os.path.join(data_path, "HIS_Patient.csv"))
vitals_df = pd.read_csv(os.path.join(data_path, "HIS_PatientVitalSigns.csv"))
appointments_df = pd.read_csv(os.path.join(data_path, "HIS_Appointment.csv"))
docOrders_df = pd.read_csv(os.path.join(data_path, "HIS_DoctorOrder.csv"))

In [None]:
# Initialize client with API key and checking connection with Gemini API

client = genai.Client(api_key="API KEY")


response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Based on the data you read give a clear and a full human readable summary about each patient. If the value is 'nan' to include it in the summary. This is the data"
)
print(response.text)

Please provide the data you want me to summarize. Once you provide the data, I will give you a clear and full human-readable summary for each patient, including 'nan' values as requested.


In [None]:
#Function to remove Nan
def clean_nan_records(record):
    return {k: v for k, v in record.items() if pd.notna(v)}
 

## All Patients Summary Creation

In [11]:
# Prepare the list of Patient IDs
patient_ids = patient_df["PatientID"].unique().tolist()
all_patients_data = []
 
# The path of the file that stores the json data + summary
output_path = "C:/Users/reema.alhenaki/Desktop/llama3_Data/data/json/patient_summaries_GEMINI.json"
 
# Load progress if exists
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        all_patients_data = json.load(f)
    processed_ids = {p["PatientID"] for p in all_patients_data}
else:
    processed_ids = set()
 
# Retry configuration
max_retries = 3
retry_delay = 10  # seconds
 
for idx, pid in enumerate(patient_ids, start=1):
    if pid in processed_ids:
        print(f"Skipping already processed patient {pid}")
        continue
 
    print(f"Processing patient {idx}/{len(patient_ids)}: {pid}")
 
    # Extract filtered patient data
    patient_info = patient_df[patient_df["PatientID"] == pid].to_dict(orient="records")
    vital_signs = vitals_df[vitals_df["PatientID"] == pid].to_dict(orient="records")
    appointments = appointments_df[appointments_df["PatientID"] == pid].to_dict(orient="records")
    doctor_orders = docOrders_df[docOrders_df["PatientID"] == pid].to_dict(orient="records")
 
    # Clean nested records
    vital_signs = [clean_nan_records(record) for record in vital_signs]
    appointments = [clean_nan_records(record) for record in appointments]
    doctor_orders = [clean_nan_records(record) for record in doctor_orders]
 
    # Clean patient info and remove PatientID key
    if patient_info:
        patient_info_cleaned = clean_nan_records(patient_info[0])
        patient_info_cleaned.pop("PatientID", None)
    else:
        patient_info_cleaned = {}
 
    # Build patient data dictionary
    patient_data = {
        "PatientID": pid,
        "PatientInfo": patient_info_cleaned,
        "VitalSigns": vital_signs,
        "Appointments": appointments,
        "DoctorOrders": doctor_orders,
    }

    
    # Convert patient data to JSON string for the prompt
    patient_data_str = json.dumps(patient_data, indent=2, ensure_ascii=False)
 
    # Create your prompt as you are currently using
    prompt = (
        "Based on the data you read, give a clear and full human-readable summary about this patient. "
        "If any value is 'nan', please include it explicitly in the summary. If the patient gender is 1 which indicates male then do not mention pregnancy because he is definatly not. Don't include (Here is a clear and full human-readable summary about the patient), start with the summary immediately. If you face a Nan value ignore it and do not mention the info related to it in the summary Here is the data:\n\n"
    + patient_data_str
        + patient_data_str
    )
 
    # Retry logic with current API call
    retries = 0
    while retries < max_retries:
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt
            )
            full_summary = response.text
            break  # Success
        except Exception as e:
            retries += 1
            print(f"Error generating summary for PatientID {pid} (Attempt {retries}/{max_retries}): {e}")
            if retries == max_retries:
                print(f"❌ Max retries reached. Stopping safely.")
                # Save progress before exiting
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(all_patients_data, f, indent=2, ensure_ascii=False)
                exit()

            else:
                print(f"Retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
 
    # Add summary to patient data
    patient_data["Summary"] = full_summary
 
    # Append to master list
    all_patients_data.append(patient_data)
 
    # Save after each patient
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_patients_data, f, indent=2, ensure_ascii=False)
 
    # Optional: Small delay to respect API limits
    time.sleep(1)
 
print(f"✅ Summaries generated and saved to {output_path}")

 

Processing patient 1/12: 2677554
Processing patient 2/12: 1910035
Processing patient 3/12: 2573089
Processing patient 4/12: 2420011
Processing patient 5/12: 2339688
Processing patient 6/12: 2116903
Processing patient 7/12: 2080108
Processing patient 8/12: 1424125
Processing patient 9/12: 1354371
Processing patient 10/12: 1144414
Processing patient 11/12: 50172351
Processing patient 12/12: 50087049
✅ Summaries generated and saved to C:/Users/reema.alhenaki/Desktop/llama3_Data/data/json/patient_summaries_GEMINI.json


## Single Patient Summary per Json file

In [6]:
single_patient_id = 50172351 

# Extract patient data
patient_info = patient_df[patient_df["PatientID"] == single_patient_id].to_dict(orient="records")
vital_signs = vitals_df[vitals_df["PatientID"] == single_patient_id].to_dict(orient="records")
appointments = appointments_df[appointments_df["PatientID"] == single_patient_id].to_dict(orient="records")
doctor_orders = docOrders_df[docOrders_df["PatientID"] == single_patient_id].to_dict(orient="records")

# Function to remove NaN 
def clean_nan_records(record):
    """Remove keys with NaN values from a dictionary."""
    return {k: v for k, v in record.items() if pd.notna(v)}
 
# Clean nested records
vital_signs = [clean_nan_records(record) for record in vital_signs]
appointments = [clean_nan_records(record) for record in appointments]
doctor_orders = [clean_nan_records(record) for record in doctor_orders]
 
# Clean patient info and remove duplicated PatientID key
if patient_info:
    patient_info_cleaned = clean_nan_records(patient_info[0])
    patient_info_cleaned.pop("PatientID", None)
else:
    patient_info_cleaned = {}

# Build patient data dictionary
patient_data = {
    "PatientID": single_patient_id,
    "PatientInfo": patient_info[0] if patient_info else {},
    "VitalSigns": vital_signs,
    "Appointments": appointments,
    "DoctorOrders": doctor_orders,
}

# Print patient data before sending to AI model
print("=== Patient Data to send ===")
print(json.dumps(patient_data, indent=2, ensure_ascii=False))
print("===========================")


=== Patient Data to send ===
{
  "PatientID": 50172351,
  "PatientInfo": {
    "PatientID": 50172351,
    "RegistrationDate": "20/12/2011",
    "FirstName": "Yusuf",
    "MiddleName": "Abdullah",
    "LastName": "Abdullah",
    "Gender": 1,
    "DateofBirth": "20/12/1967 14:43",
    "NationalityID": "SAU",
    "FirstVisit": "20/12/2011 14:44",
    "LastVisit": "25/06/2018 17:17",
    "NoOfVisit": 19,
    "MobileNumber": 555333541,
    "EmailAddress": "yusuf@mail.com",
    "IsPregnant": 0,
    "BloodGroup": 0,
    "RHFactor": " ",
    "RegisteredDoctor": 9802,
    "EmergencyContactName": "NAWAL",
    "EmergencyContactNo": 555333542
  },
  "VitalSigns": [
    {
      "PatientID": 50172351,
      "WeightKg": 100.0,
      "HeightCm": 167.0,
      "BodyMassIndex": 0.0,
      "TemperatureCelcius": 36.9,
      "PulseBeatPerMinute": 86,
      "RespirationBeatPerMinute": 20,
      "BloodPressureLower": 83,
      "BloodPressureHigher": 190,
      "SAO2": 99,
      "FIO2": 0.0,
      "PainScore":

In [7]:
# Prepare prompt
patient_data_str = json.dumps(patient_data, indent=2, ensure_ascii=False)
prompt = (
    "Based on the data you read, give a clear and full human-readable summary about this patient. "
    "If any value is 'nan', please include it explicitly in the summary. If the patient gender is 1 which indicates male then do not mention pregnancy because he is definatly not. Don't include (Here is a clear and full human-readable summary about the patient), start with the summary immediately. If you face a Nan value ignore it and do not mention the info related to it in the summary Here is the data:\n\n"
    + patient_data_str
)

# Generate summary
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt
)

# Add summary to patient data
patient_data["Summary"] = response.text

print(response.text)

This patient, Yusuf Abdullah Abdullah (Patient ID: 50172351), born on December 20, 1967, is a male of Saudi Arabian nationality. He registered on December 20, 2011, and had his first visit on the same day. His last visit was on June 25, 2018, having accumulated 19 visits. His mobile number is 555333541 and his email address is yusuf@mail.com. His emergency contact is NAWAL, reachable at 555333542. The registered doctor for this patient is 9802. His blood group is O, and his RH Factor is not specified.

Latest vital signs recorded on July 18, 2017, at 17:59:00 include: Weight: 100.0 kg, Height: 167.0 cm, Body Mass Index: 0.0, Temperature: 36.9 Celcius, Pulse: 86 beats per minute, Respiration: 20 beats per minute, Blood Pressure: 190/83, SAO2: 99, FIO2: 0.0, Pain Score: 0.0, Triage Category: 0, and GC Score: 0.0.

An upcoming appointment (Appointment No: 14516303) is scheduled for June 24, 2025, from 17:16:00 to 17:21:00 at Clinic ID 8 with Doctor ID 163154. This visit is of type 3 and i

In [8]:
# Save patient data (with summary) to a JSON file
output_path = "c:/Users/reema.alhenaki/Desktop/llama3_Data/data/json/single_patient_GeminiSummary.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(patient_data, f, indent=2, ensure_ascii=False)

print(f"✅ Summary generated and saved to {output_path}")


✅ Summary generated and saved to c:/Users/reema.alhenaki/Desktop/llama3_Data/data/json/single_patient_GeminiSummary.json
