In [1]:
!pip install pandas numpy scikit-learn matplotlib statsmodels xgboost pyyaml
import os
base_dir = "/content/smart-hospital-analytics"
os.makedirs(base_dir, exist_ok=True)
os.chdir(base_dir)
for d in ["data/raw","data/processed","data/external","data/synthetic","notebooks","src","reports"]:
    os.makedirs(d, exist_ok=True)
print("✅ Folder structure created at:", base_dir)


✅ Folder structure created at: /content/smart-hospital-analytics


In [2]:
import numpy as np, pandas as pd, os
from datetime import datetime, timedelta

OUT_DIR = "data/raw"
os.makedirs(OUT_DIR, exist_ok=True)
np.random.seed(42)

# --- 1) No-show appointments ---
dates = pd.date_range("2024-01-01", periods=400, freq="D")
rows=[]
for d in dates:
    for _ in range(np.random.randint(20,50)):
        age = int(np.clip(np.random.normal(45,18),0,90))
        lead = np.random.randint(0,30)
        appt_day = d + pd.Timedelta(days=lead)
        gender = np.random.choice(["F","M"])
        no_show = np.random.rand() < (0.15 + 0.1*(lead>14) + 0.05*(appt_day.weekday() in [0,5]))
        rows.append({"Gender":gender,"Age":age,"ScheduledDay":d,"AppointmentDay":appt_day,"No-show":"Yes" if no_show else "No"})
pd.DataFrame(rows).to_csv(f"{OUT_DIR}/noshowappointments.csv",index=False)

# --- 2) Hospital resources ---
dates = pd.date_range("2024-04-01", periods=200, freq="D")
depts=["General","Cardiology","Pediatrics","ICU"]
res=[]
for dept in depts:
    total_beds = 40 if dept!="ICU" else 12
    total_icu = 0 if dept!="ICU" else 12
    base = np.clip(0.6 + 0.2*np.sin(np.linspace(0,6*np.pi,len(dates))) + 0.05*np.random.randn(len(dates)),0.3,0.98)
    for i,d in enumerate(dates):
        occ = int(total_beds*base[i]) if total_beds>0 else 0
        icu = int(total_icu*np.clip(base[i]+0.05*np.random.randn(),0.3,0.98)) if total_icu>0 else 0
        res.append({"date":d,"dept":dept,"occupied_beds":occ,"total_beds":total_beds,"icu_occupied":icu,"total_icu":total_icu})
pd.DataFrame(res).to_csv(f"{OUT_DIR}/hospital_admissions.csv",index=False)

# --- 3) Disease & weather ---
dates = pd.date_range("2024-01-01", periods=300, freq="D")
cities=["Bengaluru","Hyderabad","Chennai"]
w,c=[],[]
for city in cities:
    temp=20+8*np.sin(np.linspace(0,2*np.pi,len(dates)))+np.random.randn(len(dates))
    rain=np.clip(5+10*np.sin(np.linspace(0,4*np.pi,len(dates)))+2*np.random.randn(len(dates)),0,None)
    hum=np.clip(60+15*np.sin(np.linspace(0,2*np.pi,len(dates)))+5*np.random.randn(len(dates)),30,100)
    cases=np.clip(15+5*np.sin(np.linspace(0,2*np.pi,len(dates)))+0.08*rain+np.random.randn(len(dates))*2,0,None).astype(int)
    for i,d in enumerate(dates):
        w.append({"date":d,"city":city,"temp_c":temp[i],"rain_mm":rain[i],"humidity":hum[i]})
        c.append({"date":d,"city":city,"cases":cases[i]})
pd.DataFrame(w).to_csv(f"{OUT_DIR}/weather_daily.csv",index=False)
pd.DataFrame(c).to_csv(f"{OUT_DIR}/disease_cases.csv",index=False)
print("✅ Synthetic datasets created under data/raw/")


✅ Synthetic datasets created under data/raw/


In [5]:
import pandas as pd, os
PROC = "data/processed"
os.makedirs(PROC, exist_ok=True)

# --- No-show Cleaning ---
df = pd.read_csv("data/raw/noshowappointments.csv")

# 🔍 ensure correct datetime parsing (force conversion)
df["ScheduledDay"] = pd.to_datetime(df["ScheduledDay"], errors="coerce", utc=True)
df["AppointmentDay"] = pd.to_datetime(df["AppointmentDay"], errors="coerce", utc=True)

# Drop rows where datetime conversion failed
df = df.dropna(subset=["ScheduledDay", "AppointmentDay"])

# ✅ Compute features safely
df["lead_time_days"] = (df["AppointmentDay"].dt.date - df["ScheduledDay"].dt.date).apply(lambda x: x.days)
df["weekday"] = df["AppointmentDay"].dt.day_name()
df["target"] = df["No-show"].astype(str).str.upper().str.startswith("Y").astype(int)

df.to_csv(f"{PROC}/noshow_clean.csv", index=False)
print("✅ Cleaned noshow data:", df.shape)

# --- Resource Cleaning ---
r = pd.read_csv("data/raw/hospital_admissions.csv")
r["date"] = pd.to_datetime(r["date"], errors="coerce")
r["util_beds"] = (r["occupied_beds"] / r["total_beds"]).clip(0, 1)
r["util_icu"] = (r["icu_occupied"] / r["total_icu"].replace(0, 1)).clip(0, 1)
r.to_csv(f"{PROC}/resource_daily.csv", index=False)
print("✅ Cleaned resource data:", r.shape)

# --- Outbreak Cleaning ---
w = pd.read_csv("data/raw/weather_daily.csv")
c = pd.read_csv("data/raw/disease_cases.csv")
w["date"] = pd.to_datetime(w["date"], errors="coerce")
c["date"] = pd.to_datetime(c["date"], errors="coerce")
merged = c.merge(w, on=["date", "city"], how="left")
merged.to_csv(f"{PROC}/cases_weather_daily.csv", index=False)
print("✅ Cleaned outbreak data:", merged.shape)


✅ Cleaned noshow data: (13678, 8)
✅ Cleaned resource data: (800, 8)
✅ Cleaned outbreak data: (900, 6)


In [7]:
import pandas as pd
df = pd.read_csv("data/processed/noshow_clean.csv")
print(df.columns)


Index(['Gender', 'Age', 'ScheduledDay', 'AppointmentDay', 'No-show',
       'lead_time_days', 'weekday', 'target'],
      dtype='object')


In [8]:
X = df[["Age", "Gender", "lead_time_days", "weekday"]]
y = df["target"]

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["Gender", "weekday"]),
    ("num", "passthrough", ["Age", "lead_time_days"])
])


In [11]:
df.rename(columns={"Gender": "gender"}, inplace=True)


In [12]:
import pandas as pd, json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# --- No-show model ---
df = pd.read_csv("data/processed/noshow_clean.csv")
print("🧾 Columns:", df.columns.tolist())

# Match correct column name (use 'Gender' not 'gender')
X = df[["Age", "Gender", "lead_time_days", "weekday"]]
y = df["target"]

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["Gender", "weekday"]),
    ("num", "passthrough", ["Age", "lead_time_days"])
])
pipe = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=200))])

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pipe.fit(Xtr, ytr)
rpt = classification_report(yte, pipe.predict(Xte), output_dict=True)
print("📊 No-show accuracy:", round(rpt["accuracy"], 3))

# --- Resource forecast (ICU/Beds) ---
r = pd.read_csv("data/processed/resource_daily.csv", parse_dates=["date"])
dept = "ICU"
rd = r[r["dept"] == dept].set_index("date").asfreq("D")
m = SARIMAX(rd["util_beds"], order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
res = m.fit(disp=False)
f = res.get_forecast(14).predicted_mean
print(f"🛏️ Resource forecast ({dept}) next 2 weeks mean util:", round(f.mean(), 3))

# --- Outbreak regression ---
o = pd.read_csv("data/processed/cases_weather_daily.csv", parse_dates=["date"])
o = o.sort_values(["city", "date"])
o["cases_lag7"] = o.groupby("city")["cases"].shift(7)
o = o.dropna()
X = o[["rain_mm", "temp_c", "humidity", "cases_lag7"]]
y = o["cases"]

Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(Xt, yt)
yp = rf.predict(Xv)
print("🦠 Outbreak MAE:", round(mean_absolute_error(yv, yp), 2),
      " R²:", round(r2_score(yv, yp), 3))
print("✅ All modules executed successfully!")


🧾 Columns: ['Gender', 'Age', 'ScheduledDay', 'AppointmentDay', 'No-show', 'lead_time_days', 'weekday', 'target']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 No-show accuracy: 0.784




🛏️ Resource forecast (ICU) next 2 weeks mean util: 0.487
🦠 Outbreak MAE: 1.74  R²: 0.735
✅ All modules executed successfully!


In [13]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# -------------------------------
# Load data
# -------------------------------
noshow = pd.read_csv("data/processed/noshow_clean.csv")
resources = pd.read_csv("data/processed/resource_daily.csv", parse_dates=["date"])
outbreak = pd.read_csv("data/processed/cases_weather_daily.csv", parse_dates=["date"])

# -------------------------------
# 1️⃣ No-show analysis
# -------------------------------
fig1 = px.pie(
    noshow,
    names="target",
    title="Patient No-Show Distribution (0 = Attended, 1 = Missed)",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig1.show()

fig2 = px.box(
    noshow,
    x="weekday",
    y="lead_time_days",
    color="target",
    title="Lead Time vs Weekday (Impact on No-Show Rate)",
)
fig2.show()

# -------------------------------
# 2️⃣ Hospital Resource Utilization
# -------------------------------
icu = resources[resources["dept"] == "ICU"].set_index("date")
fig3 = px.line(
    icu,
    y=["util_beds", "util_icu"],
    title="ICU Utilization Over Time",
    labels={"value": "Utilization Rate", "date": "Date"},
)
fig3.show()

# -------------------------------
# 3️⃣ Disease Outbreak Trend
# -------------------------------
city = outbreak["city"].unique()[0]
df_city = outbreak[outbreak["city"] == city]
fig4 = px.line(
    df_city,
    x="date",
    y="cases",
    title=f"{city}: Daily Disease Cases Trend",
)
fig4.show()

fig5 = px.scatter(
    outbreak,
    x="rain_mm",
    y="cases",
    color="city",
    trendline="ols",
    title="Correlation between Rainfall and Disease Cases",
)
fig5.show()

print("✅ Dashboard visualizations created successfully!")


✅ Dashboard visualizations created successfully!


In [14]:
noshow.to_csv("reports/noshow_analysis.csv", index=False)
resources.to_csv("reports/resource_analysis.csv", index=False)
outbreak.to_csv("reports/outbreak_analysis.csv", index=False)
print("📁 Reports saved under /content/smart-hospital-analytics/reports/")


📁 Reports saved under /content/smart-hospital-analytics/reports/


In [15]:

import shutil, os

os.makedirs("reports", exist_ok=True)

!cp data/processed/noshow_clean.csv reports/noshow_clean.csv
!cp data/processed/resource_daily.csv reports/resource_daily.csv
!cp data/processed/cases_weather_daily.csv reports/cases_weather_daily.csv

print("✅ Exported to /content/smart-hospital-analytics/reports")
!ls reports


✅ Exported to /content/smart-hospital-analytics/reports
cases_weather_daily.csv  noshow_clean.csv	resource_analysis.csv
noshow_analysis.csv	 outbreak_analysis.csv	resource_daily.csv
