# Hospital Readmission Analysis
This notebook walks through data preparation, exploratory analysis, and a simple baseline predictive model for 30-day hospital readmissions.


## 1) Setup
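We load the readmissions dataset, parse `admit_date` into a datetime, derive a `year_month` feature for monthly grouping, and check the share of missing values per column.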


## 2) Exploratory Data Analysis (EDA)
We compute the overall 30-day readmission rate, break it down by primary diagnosis, and plot the monthly trend.


## 3) Simple Predictive Model
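We fit a baseline logistic regression on categorical (one-hot encoded) and numeric features to demonstrate risk modeling, then evaluate it on a held-out 20% test split using ROC AUC and a classification report at the 0.5 threshold.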


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw readmissions extract from its relative path and preview the first rows.
DATA_PATH = "../data/hospital_readmissions.csv"
df = pd.read_csv(DATA_PATH)
df.head(5)

In [None]:
# Feature engineering: parse the admission date once, then reuse the parsed
# series to derive a monthly period label (stored as a string) for grouping.
admit_ts = pd.to_datetime(df["admit_date"])
df["admit_date"] = admit_ts
df["year_month"] = admit_ts.dt.to_period("M").astype(str)

# Quick check: fraction of missing values per column, ten worst columns first.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).head(10)

In [None]:
# Overall readmission rate: the mean of the 30-day readmission column
# across every row of the dataset.
readmit_flags = df["readmitted_within_30_days"]
readmit_rate = readmit_flags.mean()
readmit_rate

In [None]:
# Readmission rate by diagnosis: average the readmission column within each
# primary diagnosis, then order the groups from highest to lowest rate.
rate_by_diag = df.groupby("primary_diagnosis")["readmitted_within_30_days"].mean()
diag = rate_by_diag.sort_values(ascending=False)

# Draw on an explicit figure/axes pair rather than the implicit pyplot state.
fig, ax = plt.subplots()
diag.plot(kind="bar", ax=ax)
ax.set_ylabel("Readmission Rate (30d)")
ax.set_title("Readmission Rate by Primary Diagnosis")
fig.tight_layout()
plt.show()

In [None]:
# Trend over time: mean of the readmission column per year_month label.
monthly_groups = df.groupby("year_month")
trend = monthly_groups["readmitted_within_30_days"].mean()

# Draw on an explicit figure/axes pair rather than the implicit pyplot state.
fig, ax = plt.subplots()
trend.plot(ax=ax)
ax.set_ylabel("Readmission Rate (30d)")
ax.set_title("Monthly Readmission Trend")
# Tilt the month labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Simple model: logistic regression (sklearn)
#
# Builds a preprocessing + classifier pipeline, fits it on a stratified 80/20
# split, and reports ROC AUC on the held-out 20%.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

target = "readmitted_within_30_days"
# Drop the label itself and the raw admit_date (year_month, derived from it,
# is used as a feature instead).
# NOTE(review): days_to_readmission is presumably only known once the outcome
# is known, i.e. it would leak the label -- confirm against the data dictionary.
X = df.drop(columns=[target, "days_to_readmission", "admit_date"])
y = df[target]

cat_cols = ["sex","race","insurance","admission_type","primary_diagnosis","discharge_destination","year_month"]
num_cols = ["age","comorbidity_count","length_of_stay_days"]

# Fail early with a readable message if the dataset lacks an expected feature.
missing_cols = sorted(set(cat_cols + num_cols) - set(X.columns))
assert not missing_cols, f"missing expected feature columns: {missing_cols}"

# Numeric features: LogisticRegression rejects NaN inputs, so impute with the
# (training-fold) median, then standardize so the default L2 penalty treats
# all features on a comparable scale and the solver converges more easily.
numeric_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Columns not listed in cat_cols / num_cols are dropped by ColumnTransformer's
# default remainder="drop".
preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit encode as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", numeric_steps, num_cols),
    ]
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=1000))
])

# stratify=y keeps the class proportions the same in the train and test splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# All preprocessing statistics (medians, scales, categories) are learned from
# the training split only, because they live inside the fitted pipeline.
model.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (the positive class for a 0/1 or False/True target).
proba = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, proba)
auc

In [None]:
# Classification report at default threshold 0.5: turn the predicted
# probabilities into hard 0/1 labels, then print per-class metrics.
threshold = 0.5
pred = (proba >= threshold).astype(int)
report = classification_report(y_test, pred)
print(report)