In [4]:
pip install faker --break-system-packages

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.0 MB 4.2 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/2.0 MB 3.5 MB/s eta 0:00:01
   ------------------------------- -------- 1.6/2.0 MB 2.7 MB/s eta 0:00:01
   ------------------------------- -------- 1.6/2.0 MB 2.7 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 2.1 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-40.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

In [3]:
# One seed shared by both random sources so their draws repeat across runs.
SEED = 42

fake = Faker("en_IN")  # Faker instance created with the "en_IN" locale
random.seed(SEED)
np.random.seed(SEED)


In [7]:
# Reporting window: the 90 days ending at the moment this cell runs.
# The clock is read once so the window is exactly 90 days wide; reading
# datetime.now() separately for each bound leaves the two bounds slightly
# out of step with each other.
END_DATE = datetime.now()
START_DATE = END_DATE - timedelta(days=90)

In [9]:
# 1. Branch (dimension): one row per hospital branch with its bed counts.
branch_df = pd.DataFrame(
    {
        "branch_id": ["B001", "B002", "B003"],
        "branch_name": ["Delhi Central", "Mumbai West", "Bangalore North"],
        "total_beds": [400, 350, 300],
        "icu_beds": [60, 50, 40],
    }
)

In [15]:
# 2. Department (dimension): every branch gets the same six departments.
departments = [
    "Cardiology",
    "Oncology",
    "Orthopedics",
    "Pediatrics",
    "Emergency",
    "General Medicine",
]

# department_id is "D" + the last character of the branch id + a two-digit
# department number, e.g. "D101" for the first department of branch "B001".
dept_rows = [
    (f"D{branch_code[-1]}{dept_no:02}", dept_name, branch_code)
    for branch_code in branch_df["branch_id"]
    for dept_no, dept_name in enumerate(departments, start=1)
]

department_df = pd.DataFrame(
    dept_rows,
    columns=["department_id", "department_name", "branch_id"],
)


In [19]:
# 3. Patient (dimension): id, age, gender and insurance type for each patient.
PATIENT_COUNT = 8000

# The NumPy draws are made in the order age -> gender -> insurance_type,
# which is the order the columns appear in the table.
patient_ids = [f"P{100000 + offset}" for offset in range(PATIENT_COUNT)]
patient_ages = np.random.randint(1, 85, PATIENT_COUNT)  # upper bound is exclusive: 1..84
patient_genders = np.random.choice(
    ["Male", "Female"], PATIENT_COUNT, p=[0.52, 0.48]
)
patient_insurance = np.random.choice(
    ["Cash", "Ayushman Bharat", "Private", "Corporate"],
    PATIENT_COUNT,
    p=[0.45, 0.25, 0.20, 0.10],
)

patient_df = pd.DataFrame(
    {
        "patient_id": patient_ids,
        "age": patient_ages,
        "gender": patient_genders,
        "insurance_type": patient_insurance,
    }
)

In [21]:
# 4. Doctor (dimension): each doctor is assigned one department and a daily hour cap.
DOCTOR_COUNT = 120

# Draw order is department first, then hour cap.
doctor_departments = np.random.choice(department_df["department_id"], DOCTOR_COUNT)
doctor_hour_caps = np.random.choice([8, 9, 10], DOCTOR_COUNT, p=[0.7, 0.2, 0.1])

doctor_df = pd.DataFrame(
    {
        "doctor_id": [f"DR{2000 + offset}" for offset in range(DOCTOR_COUNT)],
        "department_id": doctor_departments,
        "max_daily_hours": doctor_hour_caps,
    }
)

In [23]:
## 5. Admission (primary fact)
# Number of admission rows generated by the loop further down in this cell.
ADMISSION_COUNT = 6500

def random_timestamp(start=None, end=None):
    """Return a random datetime between ``start`` and ``end``, inclusive.

    The result is ``start`` plus a whole number of minutes, drawn uniformly
    with ``random.randint`` (one draw per call).

    Args:
        start: Lower bound of the window. Defaults to the module-level
            ``START_DATE`` when omitted.
        end: Upper bound of the window. Defaults to the module-level
            ``END_DATE`` when omitted.

    Returns:
        A ``datetime`` no earlier than ``start`` and no later than ``end``.
    """
    window_start = START_DATE if start is None else start
    window_end = END_DATE if end is None else end
    # Whole minutes in the window; any fractional minute is truncated.
    span_minutes = int((window_end - window_start).total_seconds() / 60)
    return window_start + timedelta(minutes=random.randint(0, span_minutes))

# Length-of-stay bounds per department, stored as (min_days, max_days).
los_map = {
    department: (min_days, max_days)
    for department, min_days, max_days in [
        ("Emergency", 1, 3),
        ("General Medicine", 2, 5),
        ("Cardiology", 3, 7),
        ("Orthopedics", 3, 8),
        ("Pediatrics", 1, 4),
        ("Oncology", 5, 12),
    ]
}

# Build the admission rows one at a time. Two random streams are used
# (`random` and `np.random`); the statements below keep the draw order within
# each stream: department sample -> timestamp -> stay length ->
# admission type -> bed type -> patient sample.
admissions = []

for seq in range(ADMISSION_COUNT):
    dept_row = department_df.sample(1).iloc[0]
    dept_label = dept_row["department_name"]

    admitted_at = random_timestamp()
    min_days, max_days = los_map[dept_label]
    stay_days = random.randint(min_days, max_days)
    discharged_at = admitted_at + timedelta(days=stay_days)

    # The Emergency department only admits emergencies; elsewhere 70% are scheduled.
    if dept_label == "Emergency":
        admission_kind = "Emergency"
    else:
        admission_kind = np.random.choice(["Scheduled", "Emergency"], p=[0.7, 0.3])

    # Only these three departments can place a patient in ICU (25% chance).
    # The random draw happens only for them, because `and` short-circuits.
    icu_capable = dept_label in ["Cardiology", "Oncology", "Emergency"]
    if icu_capable and random.random() < 0.25:
        bed_kind = "ICU"
    else:
        bed_kind = "General"

    chosen_patient = patient_df.sample(1)["patient_id"].values[0]

    admissions.append((
        f"A{300000+seq}",
        chosen_patient,
        dept_row["department_id"],
        dept_row["branch_id"],
        admitted_at,
        discharged_at,
        admission_kind,
        bed_kind,
    ))

admission_columns = [
    "admission_id", "patient_id", "department_id", "branch_id",
    "admission_time", "discharge_time",
    "admission_type", "bed_type",
]
admission_df = pd.DataFrame(admissions, columns=admission_columns)


In [25]:
## 6. Procedure (fact): one to four procedures per admission.
procedure_types = [
    "CT Scan", "MRI", "X-Ray", "Blood Test",
    "Surgery", "ECG", "Chemotherapy"
]

procedure_rows = []

for _, row in admission_df.iterrows():
    # Whole hours between admission and discharge; computed once per admission
    # because it is the same for every procedure of that admission.
    stay_hours = int(
        (row["discharge_time"] - row["admission_time"]).total_seconds() / 3600
    )
    for _ in range(random.randint(1, 4)):
        # Procedure happens 1..stay_hours hours after admission.
        proc_time = row["admission_time"] + timedelta(
            hours=random.randint(1, stay_hours)
        )
        procedure_rows.append((
            # Sequential ids, like the other id columns in this notebook.
            # Drawing the id at random from 100000-999999 can hand the same
            # id to more than one procedure.
            f"PR{100000 + len(procedure_rows)}",
            row["admission_id"],
            random.choice(procedure_types),
            proc_time,
            row["admission_type"] == "Emergency"
        ))

procedure_df = pd.DataFrame(procedure_rows, columns=[
    "procedure_id", "admission_id",
    "procedure_type", "procedure_time",
    "emergency_flag"
])

# Guard the key column: every procedure must have its own id.
assert procedure_df["procedure_id"].is_unique

In [27]:
# 7. Billing (fact): one bill per admission.

# Fraction of the bill covered by each insurance type. Types that are not
# listed here (e.g. "Cash") get no cover.
coverage_rates = {
    "Ayushman Bharat": 0.8,
    "Private": 0.6,
    "Corporate": 0.7,
}

# Built once so that each admission's insurance type is a dictionary lookup
# rather than a scan of the whole patient table on every iteration.
insurance_by_patient = dict(
    zip(patient_df["patient_id"], patient_df["insurance_type"])
)

billing_rows = []

for _, row in admission_df.iterrows():
    # Cost = whole days of stay x a random daily rate between 3000 and 7000.
    los = (row["discharge_time"] - row["admission_time"]).days
    base_cost = los * random.randint(3000, 7000)
    if row["bed_type"] == "ICU":
        base_cost *= 1.8  # ICU beds are billed at 1.8x

    patient_ins = insurance_by_patient[row["patient_id"]]
    insurance_cover = base_cost * coverage_rates.get(patient_ins, 0)

    billing_rows.append((
        row["admission_id"],
        round(base_cost, 2),
        round(insurance_cover, 2)
    ))

billing_df = pd.DataFrame(billing_rows, columns=[
    "admission_id", "total_cost", "insurance_covered"
])

In [31]:
# 8. Outcome (fact): one row per admission.
# Draw order is outcome status first, then the readmission flag.
outcome_draws = np.random.choice(
    ["Recovered", "Transferred", "Deceased"],
    len(admission_df),
    p=[0.93, 0.05, 0.02]
)
readmission_draws = np.random.choice(
    [True, False],
    len(admission_df),
    p=[0.12, 0.88]
)

outcome_df = pd.DataFrame({
    "admission_id": admission_df["admission_id"],
    "outcome_status": outcome_draws,
    # A deceased patient cannot be readmitted, so the flag is forced to False
    # for those rows; independent draws would otherwise produce rows that are
    # both "Deceased" and readmitted.
    "readmitted_30d": readmission_draws & (outcome_draws != "Deceased")
})

In [35]:
# 9. Doctor workload (utilization fact): one row per doctor per calendar day.
work_days = pd.date_range(START_DATE.date(), END_DATE.date())

# Booked hours are 40%-100% of the doctor's daily cap, rounded to 2 decimals.
# Iteration is doctor-major, then day, with one random draw per row.
workload_rows = [
    (doctor_id, work_day, round(random.uniform(0.4, 1.0) * hour_cap, 2))
    for doctor_id, hour_cap in zip(
        doctor_df["doctor_id"], doctor_df["max_daily_hours"]
    )
    for work_day in work_days
]

doctor_workload_df = pd.DataFrame(
    workload_rows,
    columns=["doctor_id", "work_date", "hours_booked"],
)

In [38]:
# 10. Bed occupancy (hourly snapshot): one row per department per hour.

# The hourly grid is identical for every department, so it is built once
# instead of once per department.
snapshot_hours = pd.date_range(START_DATE, END_DATE, freq="h")

snapshot_rows = []

for _, dept in department_df.iterrows():
    # Narrow to this department's admissions once; the hourly check below then
    # only compares this subset instead of re-filtering the full admission
    # table for every hour.
    dept_admissions = admission_df[
        admission_df["department_id"] == dept["department_id"]
    ]
    admitted = dept_admissions["admission_time"]
    discharged = dept_admissions["discharge_time"]

    for hour in snapshot_hours:
        # Admissions whose stay covers this hour (both bounds inclusive).
        active_count = int(((admitted <= hour) & (discharged >= hour)).sum())
        # Add -2..+3 beds of random noise, never dropping below zero.
        occupied = max(0, active_count + random.randint(-2, 3))

        snapshot_rows.append((
            hour,
            dept["department_id"],
            dept["branch_id"],
            occupied
        ))

bed_occupancy_df = pd.DataFrame(snapshot_rows, columns=[
    "snapshot_time", "department_id", "branch_id", "occupied_beds"
])

In [39]:
# Export: write every table to its own CSV in the working directory,
# without the DataFrame index.
for csv_name, frame in [
    ("branch.csv", branch_df),
    ("department.csv", department_df),
    ("patient.csv", patient_df),
    ("doctor.csv", doctor_df),
    ("admission.csv", admission_df),
    ("procedure.csv", procedure_df),
    ("billing.csv", billing_df),
    ("outcome.csv", outcome_df),
    ("doctor_workload.csv", doctor_workload_df),
    ("bed_occupancy.csv", bed_occupancy_df),
]:
    frame.to_csv(csv_name, index=False)

In [44]:
import os

# Write the same tables again, this time under the hospital_data/ folder.
os.makedirs("hospital_data", exist_ok=True)

# Table name -> DataFrame; the name becomes the CSV file stem.
tables = dict(
    branch=branch_df,
    department=department_df,
    patient=patient_df,
    doctor=doctor_df,
    admission=admission_df,
    procedure=procedure_df,
    billing=billing_df,
    outcome=outcome_df,
    doctor_workload=doctor_workload_df,
    bed_occupancy=bed_occupancy_df,
)

# `name` and `df` are module-level loop variables: after the loop they stay
# bound to the last entry of `tables`.
for name, df in tables.items():
    df.to_csv(f"hospital_data/{name}.csv", index=False)


In [46]:
# Write the branch table to hospital_data/branch.csv.
# `branch_df` is named explicitly: `df` is the variable left over from the
# export loop and is still bound to the last table in `tables`
# (bed_occupancy_df), so `df.to_csv(...)` here would overwrite branch.csv
# with bed-occupancy rows.
os.makedirs("hospital_data", exist_ok=True)
branch_df.to_csv("hospital_data/branch.csv", index=False)
