In [3]:
%pip install names

Note: you may need to restart the kernel to use updated packages.Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
     ---------------------------------------- 0.0/789.1 kB ? eta -:--:--
     -------------------------------------- 789.1/789.1 kB 8.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: names
  Building wheel for names (setup.py): started
  Building wheel for names (setup.py): finished with status 'done'
  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803711 sha256=54629312f92593308caa31a74770c8af954b4ebcfc885eae6fc30c095f1a8f2b
  Stored in directory: C:\Users\sayan\AppData\Local\Temp\pip-ephem-wheel-cache-dlbl5iwp\wheels\c7\f0\8f\de9f15941cd988c39b82703fa04cb2d550ba5867f13c6da052
Successfully built names
Installing collected packages: names
Successfully installed

DEPRECATION: Loading egg at c:\users\sayan\appdata\local\programs\python\python312\lib\site-packages\entfa-1.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# %% [markdown]
# # Healthcare Patient Analytics ETL & Star Schema (Improved)
#
# This notebook demonstrates:
# 1. Generating >=10 rows per table (Patients, Doctors, Admissions, Vitals, Treatments, Readmission_Risk).
# 2. Data cleaning & validation (primary key uniqueness, not-null checks, foreign key checks).
# 3. Star Schema with surrogate keys in dimension tables (DimPatients, DimDoctors) and three fact tables:
#    - FactAdmissions (merged with readmission risk),
#    - FactVitals,
#    - FactTreatments.
# 4. Loading the final tables into MySQL.

import os
import random
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import names
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# For demonstration only; suppress warnings
import warnings
warnings.filterwarnings("ignore")


# %% [markdown]
# ## 1. Generate Sample Data (>= 10 Rows per Table)

def random_date(start_year=2020, end_year=2025):
    """Return a random ``datetime`` (at midnight) within a range of years.

    The day is drawn uniformly from every calendar day between January 1 of
    ``start_year`` and December 31 of ``end_year``, both endpoints included.

    Args:
        start_year: First year of the range.
        end_year: Last year of the range.

    Returns:
        A ``datetime`` whose time component is 00:00:00.

    Raises:
        ValueError: If ``end_year`` is earlier than ``start_year`` (raised by
            ``random.randrange`` because the range of days is empty).
    """
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)
    span_days = (end_date - start_date).days
    # randrange excludes its stop value, so add 1 to make the final day
    # (Dec 31 of end_year) reachable as well.
    random_days = random.randrange(span_days + 1)
    return start_date + timedelta(days=random_days)

def random_phone():
    """Return a random phone number as a string of exactly ten digits.

    The first digit is never 0, because the number is drawn from the
    integer range 1,000,000,000 .. 9,999,999,999.
    """
    lowest = 1_000_000_000
    highest = 9_999_999_999
    number = random.randint(lowest, highest)
    return f"{number}"

def random_diagnosis():
    """Pick one diagnosis label uniformly at random from a fixed list."""
    diagnoses = [
        "Pneumonia",
        "Hypertension",
        "Asthma Attack",
        "Diabetes Complications",
        "Heart Failure",
        "Sepsis",
        "Kidney Stones",
        "Migraine",
        "COVID-19",
        "Fracture",
    ]
    return random.choice(diagnoses)

def random_specialization():
    """Pick one medical specialization uniformly at random from a fixed list."""
    specializations = [
        "Cardiologist",
        "Pulmonologist",
        "General Physician",
        "Neurologist",
        "Orthopedic",
        "Endocrinologist",
        "Gastroenterologist",
    ]
    return random.choice(specializations)

def random_chronic_condition():
    """Pick a chronic-condition label at random.

    "None" is listed three times out of seven entries, so it is drawn more
    often than any single named condition.
    """
    weighted_conditions = [
        "None",
        "Hypertension",
        "Asthma",
        "Diabetes",
        "Heart Disease",
        "None",
        "None",
    ]
    return random.choice(weighted_conditions)

# 1.1 Patients (>= 10 rows)
num_patients = 10
patient_ids = list(range(101, 101 + num_patients))


def _make_patient_row(pid):
    """Build one synthetic patient record, in the column order of patients_df."""
    first = names.get_first_name()
    last = names.get_last_name()
    birth_date = random_date(1950, 2000).date()  # dob
    gender = random.choice(["Male", "Female"])
    phone = random_phone()
    street_address = f"{random.randint(100,999)} Main St"
    condition = random_chronic_condition()
    return [pid, first, last, birth_date, gender, phone, street_address, condition]


patients_data = [_make_patient_row(pid) for pid in patient_ids]

patients_df = pd.DataFrame(
    patients_data,
    columns=[
        "patient_id",
        "first_name",
        "last_name",
        "dob",
        "gender",
        "contact_no",
        "address",
        "chronic_conditions",
    ],
)

# 1.2 Doctors (>= 10 rows)
num_doctors = 10
doctor_ids = list(range(301, 301 + num_doctors))


def _make_doctor_row(did):
    """Build one synthetic doctor record, in the column order of doctors_df."""
    first = names.get_first_name()
    last = names.get_last_name()
    specialty = random_specialization()
    phone = random_phone()
    return [did, first, last, specialty, phone]


doctors_data = [_make_doctor_row(did) for did in doctor_ids]

doctors_df = pd.DataFrame(
    doctors_data,
    columns=["doctor_id", "first_name", "last_name", "specialization", "contact_no"],
)

# 1.3 Admissions (>= 10 rows)
# Every admission picks its patient and doctor from the ID lists generated above,
# so the references always exist.
num_admissions = 10
admission_ids = list(range(2001, 2001 + num_admissions))


def _make_admission_row(aid):
    """Build one synthetic admission record, in the column order of admissions_df."""
    chosen_patient = random.choice(patient_ids)
    chosen_doctor = random.choice(doctor_ids)
    admitted = random_date(2024, 2025)
    # About 30% of admissions stay open (no discharge date yet);
    # the rest are discharged 1-10 days after admission.
    if random.random() > 0.3:
        discharged = admitted + timedelta(days=random.randint(1, 10))
    else:
        discharged = None
    diag = random_diagnosis()
    room = random.choice(["A101","A102","B210","C305","B405","ICU1","ICU2","D110","D120","E201"])
    return [
        aid,
        chosen_patient,
        admitted.date(),
        discharged.date() if discharged else None,
        diag,
        chosen_doctor,
        room,
    ]


admissions_data = [_make_admission_row(aid) for aid in admission_ids]

admissions_df = pd.DataFrame(
    admissions_data,
    columns=[
        "admission_id",
        "patient_id",
        "admission_date",
        "discharge_date",
        "diagnosis",
        "doctor_id",
        "room_no",
    ],
)

# 1.4 Vitals (>= 10 rows)
# Every vitals row is attached to one of the generated admissions.
num_vitals = 10
vital_ids = list(range(5001, 5001 + num_vitals))


def _make_vitals_row(vid):
    """Build one synthetic vitals reading, in the column order of vitals_df."""
    admission_id = random.choice(admission_ids)
    # The reading is timestamped 0-100 hours after midnight of the admission date.
    is_this_admission = admissions_df["admission_id"] == admission_id
    admitted_on = pd.to_datetime(admissions_df.loc[is_this_admission, "admission_date"].iloc[0])
    recorded_time = admitted_on + timedelta(hours=random.randint(0, 100))
    heart_rate = random.randint(60, 120)
    systolic = random.randint(100, 160)
    diastolic = random.randint(70, 100)
    oxygen_level = random.randint(88, 100)
    temperature = round(random.uniform(97.0, 103.0), 1)
    return [
        vid,
        admission_id,
        recorded_time,
        heart_rate,
        f"{systolic}/{diastolic}",  # blood_pressure as "systolic/diastolic"
        oxygen_level,
        temperature,
    ]


vitals_data = [_make_vitals_row(vid) for vid in vital_ids]

vitals_df = pd.DataFrame(
    vitals_data,
    columns=[
        "vital_id",
        "admission_id",
        "recorded_time",
        "heart_rate",
        "blood_pressure",
        "oxygen_level",
        "temperature",
    ],
)

# 1.5 Treatments (>= 10 rows)
num_treatments = 10
treatment_ids = list(range(7001, 7001 + num_treatments))
possible_procedures = [
    "Nebulization",
    "Blood Pressure Monitoring",
    "ECG",
    "X-Ray",
    "MRI Scan",
    "IV Fluid Therapy",
    "Physical Therapy",
    "Vaccination",
]
possible_meds = [
    "Amoxicillin 500mg",
    "Prednisone 10mg",
    "Metoprolol 50mg",
    "Ibuprofen 400mg",
    "Acetaminophen 500mg",
    "Atorvastatin 20mg",
    "Insulin 10units",
]


def _make_treatment_row(tid):
    """Build one synthetic treatment record, in the column order of treatments_df."""
    admission_id = random.choice(admission_ids)
    # The treatment date falls 0-5 days after the admission date.
    is_this_admission = admissions_df["admission_id"] == admission_id
    admitted_on = pd.to_datetime(admissions_df.loc[is_this_admission, "admission_date"].iloc[0])
    treated_on = admitted_on + timedelta(days=random.randint(0, 5))
    procedure = random.choice(possible_procedures)
    medication = random.choice(possible_meds)
    dosage = random.choice(["1x daily", "2x daily", "3x daily", "As needed"])
    return [tid, admission_id, treated_on.date(), procedure, medication, dosage]


treatments_data = [_make_treatment_row(tid) for tid in treatment_ids]

treatments_df = pd.DataFrame(
    treatments_data,
    columns=["treatment_id", "admission_id", "treatment_date", "procedure", "medication", "dosage"],
)

# 1.6 Readmission_Risk (>= 10 rows)
# One risk row per admission (1-to-1), for demonstration.
risk_ids = list(range(9001, 9001 + num_admissions))


def _risk_level_for(score):
    """Bucket a risk score: below 0.4 is Low, below 0.7 is Medium, otherwise High."""
    if score >= 0.7:
        return "High"
    if score >= 0.4:
        return "Medium"
    return "Low"


risk_data = []
for risk_id, aid in zip(risk_ids, admission_ids):
    # The prediction is dated 0-2 days after the admission date.
    is_this_admission = admissions_df["admission_id"] == aid
    admitted_on = pd.to_datetime(admissions_df.loc[is_this_admission, "admission_date"].iloc[0])
    pred_date = admitted_on + timedelta(days=random.randint(0,2))
    risk_score = round(random.uniform(0.2, 0.9), 2)
    risk_data.append([risk_id, aid, pred_date.date(), risk_score, _risk_level_for(risk_score)])

risk_df = pd.DataFrame(
    risk_data,
    columns=["risk_id", "admission_id", "prediction_date", "risk_score", "risk_level"],
)

# Write dataframes to CSV (simulate source files); the row index is not written.
for csv_name, frame in [
    ("patients.csv", patients_df),
    ("doctors.csv", doctors_df),
    ("admissions.csv", admissions_df),
    ("vitals.csv", vitals_df),
    ("treatments.csv", treatments_df),
    ("readmission_risk.csv", risk_df),
]:
    frame.to_csv(csv_name, index=False)

print("Sample CSV files created with >= 10 rows each.")


# %% [markdown]
# ## 2. Read CSV Files (Extract Phase)

# Patients and doctors carry free-text columns, so two read_csv defaults are overridden:
# - contact_no is forced to str; otherwise the all-digit phone numbers are inferred
#   as integers (they are generated as strings).
# - keep_default_na=False with na_values=[""] makes only empty fields count as missing.
#   pandas' default NA markers include strings such as "None" (see the read_csv
#   `na_values` docs), which would silently turn the "None" chronic_conditions
#   category into NaN.
patients = pd.read_csv(
    "patients.csv",
    parse_dates=["dob"],
    dtype={"contact_no": str},
    keep_default_na=False,
    na_values=[""],
)
doctors = pd.read_csv(
    "doctors.csv",
    dtype={"contact_no": str},
    keep_default_na=False,
    na_values=[""],
)
# discharge_date may be empty for admissions that are still open; it is parsed as a date column.
admissions = pd.read_csv("admissions.csv", parse_dates=["admission_date", "discharge_date"])
vitals = pd.read_csv("vitals.csv", parse_dates=["recorded_time"])
treatments = pd.read_csv("treatments.csv", parse_dates=["treatment_date"])
readmission_risk = pd.read_csv("readmission_risk.csv", parse_dates=["prediction_date"])

print("CSV files loaded into DataFrames.")


# %% [markdown]
# ## 3. Data Cleaning & Validation

# ### 3.1 Check Primary Key Uniqueness & Drop Duplicates

def check_and_drop_duplicates(df, pk_col, table_name):
    """Remove rows that repeat a primary-key value, keeping the first occurrence.

    Args:
        df: DataFrame to check.
        pk_col: Name of the primary-key column.
        table_name: Label used in the printed message.

    Returns:
        ``df`` itself when no key is repeated; otherwise a de-duplicated
        DataFrame (a message with the number of dropped rows is printed).
    """
    repeated = df.duplicated(subset=[pk_col])
    dup_count = repeated.sum()
    if dup_count <= 0:
        return df
    print(f"{table_name}: Dropping {dup_count} duplicate rows based on primary key {pk_col}.")
    return df.drop_duplicates(subset=[pk_col])

# De-duplicate every table on its primary key, in a fixed order so any messages
# are printed patients-first. The input tuple is built before the names are rebound.
(patients, doctors, admissions, vitals, treatments, readmission_risk) = [
    check_and_drop_duplicates(frame, pk_col, table_name)
    for frame, pk_col, table_name in (
        (patients, "patient_id", "Patients"),
        (doctors, "doctor_id", "Doctors"),
        (admissions, "admission_id", "Admissions"),
        (vitals, "vital_id", "Vitals"),
        (treatments, "treatment_id", "Treatments"),
        (readmission_risk, "risk_id", "Readmission_Risk"),
    )
]

# ### 3.2 Check Required (NOT NULL) Columns

def drop_missing_required(df, required_cols, table_name):
    """Remove rows that have a null in any of the required columns.

    Args:
        df: DataFrame to check.
        required_cols: List of column names that must not be null.
        table_name: Label used in the printed message.

    Returns:
        ``df`` itself when every required value is present; otherwise a
        filtered DataFrame (a message with the number of dropped rows is printed).
    """
    is_complete = df[required_cols].notnull().all(axis=1)
    missing_count = (~is_complete).sum()
    if missing_count <= 0:
        return df
    print(f"{table_name}: Dropping {missing_count} rows with missing data in required columns {required_cols}.")
    return df[is_complete]

# Drop rows with nulls in each table's required columns. Optional columns
# (e.g. address, chronic_conditions, discharge_date, room_no, procedure, dosage) are not listed.
# The input tuple is built before the names are rebound.
(patients, doctors, admissions, vitals, treatments, readmission_risk) = [
    drop_missing_required(frame, required, table_name)
    for frame, required, table_name in (
        (patients, ["patient_id", "first_name", "last_name", "dob", "gender", "contact_no"], "Patients"),
        (doctors, ["doctor_id", "first_name", "last_name", "specialization", "contact_no"], "Doctors"),
        (admissions, ["admission_id", "patient_id", "admission_date", "diagnosis", "doctor_id"], "Admissions"),
        (vitals, ["vital_id", "admission_id", "recorded_time", "heart_rate", "blood_pressure", "oxygen_level", "temperature"], "Vitals"),
        (treatments, ["treatment_id", "admission_id", "treatment_date", "medication"], "Treatments"),
        (readmission_risk, ["risk_id", "admission_id", "prediction_date", "risk_score", "risk_level"], "Readmission_Risk"),
    )
]

# ### 3.3 Validate Foreign Keys

def _drop_orphan_rows(child, parent, key_col, child_name):
    """Return `child` without the rows whose `key_col` value is absent from `parent[key_col]`.

    Prints how many rows were dropped; returns `child` untouched when every key is found.
    """
    is_orphan = ~child[key_col].isin(parent[key_col])
    orphan_count = is_orphan.sum()
    if orphan_count > 0:
        print(f"{child_name}: Dropping {orphan_count} rows with invalid {key_col}.")
        child = child[~is_orphan]
    return child


# Admissions -> Patients, then Admissions -> Doctors
admissions = _drop_orphan_rows(admissions, patients, "patient_id", "Admissions")
admissions = _drop_orphan_rows(admissions, doctors, "doctor_id", "Admissions")

# Vitals / Treatments / Readmission_Risk -> Admissions
# (checked against the admissions that survived the two checks above)
vitals = _drop_orphan_rows(vitals, admissions, "admission_id", "Vitals")
treatments = _drop_orphan_rows(treatments, admissions, "admission_id", "Treatments")
readmission_risk = _drop_orphan_rows(readmission_risk, admissions, "admission_id", "Readmission_Risk")

print("Data cleaning & validation complete.")


# %% [markdown]
# ## 4. Build Star Schema with Surrogate Keys in Dimension Tables
#
# **Improvements**:
# - We introduce `dim_patients` and `dim_doctors` with **surrogate keys** (`patient_key` and `doctor_key`) instead of using `patient_id` and `doctor_id` directly as PK.
# - We create `fact_admissions` by merging `admissions` and `readmission_risk`.
# - We keep `fact_vitals` and `fact_treatments` referencing `admission_id`.
# - You could also replace `admission_id` with a surrogate key in `fact_admissions` and reference that from `fact_vitals`/`fact_treatments`. For simplicity, we’ll keep `admission_id` as the fact PK.

# ### 4.1 Create DimPatients with Surrogate Key
# patient_key is a 1-based sequential number; it is listed first in the column order.
dim_patients = patients.assign(patient_key=range(1, len(patients) + 1))[[
    "patient_key",
    "patient_id",
    "first_name",
    "last_name",
    "dob",
    "gender",
    "contact_no",
    "address",
    "chronic_conditions",
]]

# ### 4.2 Create DimDoctors with Surrogate Key
dim_doctors = doctors.assign(doctor_key=range(1, len(doctors) + 1))[[
    "doctor_key",
    "doctor_id",
    "first_name",
    "last_name",
    "specialization",
    "contact_no",
]]

# ### 4.3 Create FactAdmissions (merge with readmission_risk)
# Left joins keep every admission. The natural keys are used only to look up the
# surrogate keys and are dropped afterwards, so the fact table carries
# patient_key / doctor_key instead of patient_id / doctor_id.
patient_key_lookup = dim_patients[["patient_id", "patient_key"]]
doctor_key_lookup = dim_doctors[["doctor_id", "doctor_key"]]
fact_admissions = (
    admissions
    .merge(readmission_risk, on="admission_id", how="left")
    .merge(patient_key_lookup, on="patient_id", how="left")
    .merge(doctor_key_lookup, on="doctor_id", how="left")
    .drop(columns=["patient_id", "doctor_id"])
)

# ### 4.4 FactVitals & FactTreatments remain referencing admission_id
fact_vitals = vitals.copy()
fact_treatments = treatments.copy()

print("Star schema created with surrogate keys in dimension tables.")


# %% [markdown]
# ## 5. Load to MySQL

# Connection settings are read from environment variables so real credentials
# never have to be written into the notebook. The fallback values are the local
# demo defaults; set MYSQL_PASSWORD (and the others as needed) for any real server.
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "12345")
host = os.environ.get("MYSQL_HOST", "localhost")
port = os.environ.get("MYSQL_PORT", "3306")
database = os.environ.get("MYSQL_DATABASE", "case8")

# User and password are URL-quoted so characters such as "@", ":" or "/" in a
# credential cannot corrupt the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}"
)

# For demonstration: write dimension/fact tables.
# if_exists='replace' drops and recreates each table on every run.
tables_to_load = [
    ("dim_patients", dim_patients),
    ("dim_doctors", dim_doctors),
    ("fact_admissions", fact_admissions),
    ("fact_vitals", fact_vitals),
    ("fact_treatments", fact_treatments),
]
for table_name, frame in tables_to_load:
    frame.to_sql(table_name, engine, if_exists='replace', index=False)

print("Data successfully loaded to MySQL with improved star schema.")


Sample CSV files created with >= 10 rows each.
CSV files loaded into DataFrames.
Data cleaning & validation complete.
Star schema created with surrogate keys in dimension tables.
Data successfully loaded to MySQL with improved star schema.
