# Data Realizations
This file contains code to create various visualizations exploring relationships between the target column ("length_of_stay") and the 11 features. 

The main purpose of this file is to understand the relationships in the dataset and to use graphs to see whether certain features have a larger impact on length of stay than others.

In [None]:
# Load the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px

In [None]:
# Read hospital_cleaned.csv into the DataFrame that every later cell plots from
df = pd.read_csv(filepath_or_buffer="hospital_cleaned.csv")

In [None]:
# Missingness matrix of the loaded data, used to confirm that no missing
# values remain after data cleaning
msno.matrix(df=df)

In [None]:
# Work on the gender column once instead of re-indexing df for every print
gender_col = df["gender"]

# Distinct values present in the gender column
print("Unique gender groups:")
print(gender_col.unique(), "\n")

# Number of rows per gender value (missing values get their own row)
print("Counts per gender group:")
print(gender_col.value_counts(dropna=False))

In [None]:
# Box plot of length of stay per gender, with one colour per gender
fig = px.box(
    data_frame=df,
    x="gender",
    y="length_of_stay",
    color="gender",
)
fig.show()

In [None]:
# Work on the type_of_admission column once instead of re-indexing df
admission_col = df["type_of_admission"]

# Distinct values present in the type_of_admission column
print("Unique type_of_admission groups:")
print(admission_col.unique(), "\n")

# Number of rows per admission type (missing values get their own row)
print("Counts per type_of_admission group:")
print(admission_col.value_counts(dropna=False))

In [None]:
# Box plot of length of stay per admission type; individual points are hidden
fig = px.box(
    data_frame=df,
    x="type_of_admission",
    y="length_of_stay",
    points=False,
)
fig.show()

# Numeric companion to the plot: count, mean, standard deviation, min,
# quartiles and max of length of stay for each admission type
los_by_admission = df.groupby("type_of_admission")["length_of_stay"]
summary = los_by_admission.describe()
print(summary)

In [None]:
# Box plot of length of stay per risk-of-mortality level.
# The x axis follows the explicit low-to-high order below; individual points
# are hidden.
risk_mortality_order = ["Minor", "Moderate", "Major", "Extreme"]

fig = px.box(
    data_frame=df,
    x="apr_risk_of_mortality",
    y="length_of_stay",
    points=False,
    category_orders=dict(apr_risk_of_mortality=risk_mortality_order),
)
fig.show()

In [None]:
# Box plot of length of stay per severity-of-illness level.
# The x axis follows the explicit low-to-high order below; individual points
# are hidden.
severity_illness_order = ["Minor", "Moderate", "Major", "Extreme"]

fig = px.box(
    data_frame=df,
    x="apr_severity_of_illness_description",
    y="length_of_stay",
    points=False,
    category_orders=dict(
        apr_severity_of_illness_description=severity_illness_order
    ),
)
fig.show()

In [None]:
# Bar chart of the average length of stay for each medical/surgical description
# (histfunc="avg" aggregates y as a mean per x category)
fig = px.histogram(
    data_frame=df,
    x="apr_medical_surgical_description",
    y="length_of_stay",
    histfunc="avg",
)
fig.show()

In [None]:
# Visualization showing, for each age group, the proportion of stays that fall
# into each length-of-stay bucket (each stacked bar sums to 1)
age_order = ['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']

# Open-ended top edge: keeps the bin edges strictly increasing even when the
# longest stay in the data is 7 days or less (pd.cut rejects bins that are not
# monotonically increasing).
bins = [0, 1, 3, 7, float("inf")]
labels = ["0–1 day", "2–3 days", "4–7 days", "> 7 days"]

# include_lowest=True places a 0-day stay in the first bucket; without it the
# first interval is (0, 1] and such stays would silently become NaN.
# NOTE: this adds a "los_bucket" column to df for the rest of the notebook.
df["los_bucket"] = pd.cut(
    df["length_of_stay"],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Count stays per (age group, bucket). observed=False is spelled out so that
# buckets with no stays still appear as zero-count columns and the result does
# not depend on the changing pandas default for categorical groupers.
prop = (
    df.groupby(["age_group", "los_bucket"], observed=False)
      .size()
      .unstack("los_bucket")
      .loc[age_order]          # ensure correct order
)
# Convert counts to row-wise proportions
prop = prop.div(prop.sum(axis=1), axis=0)

prop.plot(
    kind="bar",
    stacked=True,
    figsize=(10, 6),
    rot=30
)
plt.xlabel("Age Group")
plt.ylabel("Proportion of Stays")
plt.title("Length-of-Stay Buckets by Age Group")
# Anchor the legend just outside the right edge of the axes
plt.legend(title="LOS Bucket", bbox_to_anchor=(1.02, 1))
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the average length of stay for each gender
# (histfunc="avg" aggregates y as a mean per x category)
fig = px.histogram(
    data_frame=df,
    x="gender",
    y="length_of_stay",
    histfunc="avg",
)
fig.show()

In [None]:
# Bar chart of the average length of stay for each type of admission
# (histfunc="avg" aggregates y as a mean per x category)
fig = px.histogram(
    data_frame=df,
    x="type_of_admission",
    y="length_of_stay",
    histfunc="avg",
)
fig.show()

In [None]:
# Visualization showing the average length of stay based on the severity of illness.
# Severity is ordinal, so the bars are laid out from least to most severe
# (same order as the severity box plot) instead of in order of appearance.
severity_illness_order = ["Minor", "Moderate", "Major", "Extreme"]
fig = px.histogram(
    df,
    x="apr_severity_of_illness_description",
    y="length_of_stay",
    histfunc="avg",
    category_orders={
        "apr_severity_of_illness_description": severity_illness_order
    },
)
fig.show()

In [None]:
# Visualization showing the average length of stay based on the risk of mortality,
# organized by the medical/surgical description
risk_mortality_order = ["Minor", "Moderate", "Major", "Extreme"]
fig = px.histogram(
    df,
    x="apr_risk_of_mortality",
    y="length_of_stay",
    color="apr_medical_surgical_description",
    histfunc="avg",
    # px.histogram stacks the colour groups by default (barmode="relative"),
    # which would make each bar's height a sum of averages; draw them side by
    # side so every bar reads as one group's average.
    barmode="group",
    # Apply the ordinal order defined above to the x axis
    category_orders={"apr_risk_of_mortality": risk_mortality_order},
    labels={
        "apr_risk_of_mortality": "Risk of Mortality",
        "length_of_stay": "Average Length of Stay (days)",
        "apr_medical_surgical_description": "Medical/Surgical Type"
    }
)

fig.update_layout(
    title="Average Length of Stay by Risk of Mortality and Medical/Surgical Type",
    xaxis_title="Risk of Mortality",
    yaxis_title="Average Length of Stay (days)"
)

fig.show()


In [None]:
# Visualization showing the average length of stay based on the emergency
# department indicator, organized by gender
fig = px.histogram(
    df,
    x="emergency_department_indicator",
    y="length_of_stay",
    color="gender",
    histfunc="avg",
    # Default barmode ("relative") stacks the per-gender averages on top of
    # each other; grouping keeps each bar equal to one gender's average.
    barmode="group",
)
fig.show()

In [None]:
# Visualization showing the average length of stay based on the medical/surgical
# description, organized by risk of mortality
risk_mortality_order = ["Minor", "Moderate", "Major", "Extreme"]
fig = px.histogram(
    df,
    x="apr_medical_surgical_description",
    y="length_of_stay",
    color="apr_risk_of_mortality",
    histfunc="avg",
    # Default barmode ("relative") stacks the per-risk averages into one bar
    # whose height is a sum of averages; show them side by side instead.
    barmode="group",
    # Order the colour groups (and legend) from lowest to highest risk
    category_orders={"apr_risk_of_mortality": risk_mortality_order},
    labels={
        "apr_medical_surgical_description": "Medical/Surgical Type",
        "length_of_stay": "Average Length of Stay (days)",
        "apr_risk_of_mortality": "Risk of Mortality"
    }
)

fig.update_layout(
    title="Average Length of Stay by Risk of Mortality and Medical/Surgical Type",
    xaxis_title="Medical/Surgical Type",
    yaxis_title="Average Length of Stay (days)"
)

fig.show()


In [None]:
# Visualization showing the average length of stay based on the risk of mortality,
# organized by age group
risk_mortality_order = ["Minor", "Moderate", "Major", "Extreme"]
age_order = ['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']
fig = px.histogram(
    df,
    x="apr_risk_of_mortality",
    y="length_of_stay",
    color="age_group",
    histfunc="avg",
    # Default barmode ("relative") stacks the per-age-group averages into one
    # bar whose height is a sum of averages; show them side by side instead.
    barmode="group",
    # Both dimensions are ordinal: order the x axis by risk and the colour
    # groups (and legend) by age.
    category_orders={
        "apr_risk_of_mortality": risk_mortality_order,
        "age_group": age_order,
    },
    labels={
        "apr_risk_of_mortality": "Risk of Mortality",
        "length_of_stay": "Average Length of Stay (days)",
        "age_group": "Age Group"
    }
)

fig.update_layout(
    title="Average Length of Stay by Risk of Mortality and Age Group",
    xaxis_title="Risk of Mortality",
    yaxis_title="Average Length of Stay (days)"
)

fig.show()

In [None]:
# Visualization showing the average length of stay based on the emergency
# department indicator, organized by age group
age_order = ['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']
fig = px.histogram(
    df,
    x="emergency_department_indicator",
    y="length_of_stay",
    color="age_group",
    histfunc="avg",
    # Default barmode ("relative") stacks the per-age-group averages on top of
    # each other; grouping keeps each bar equal to one age group's average.
    barmode="group",
    # Order the colour groups (and legend) from youngest to oldest
    category_orders={"age_group": age_order},
)
fig.show()