In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Apply seaborn's default theme to every figure in this notebook.
# `set_theme()` is the preferred spelling; `sns.set()` is only an alias for it.
sns.set_theme()

In [2]:
# Load the encounter table from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer="./edstay_encounters.csv")

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['subject_id', 'stay_id', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'chiefcomplaint', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'gender', 'age_on_adm', 'stay_id-2',
       'arrival_transport', 'disposition', 'gender-2', 'intime', 'outtime',
       'los'],
      dtype='object')

In [4]:
# Show (rows, columns) of the loaded frame.
df.shape

(425087, 23)

In [13]:
# Numeric columns used for the boxplots and summary statistics below.
vparams_df = df.loc[
    :,
    [
        "temperature",
        "heartrate",
        "resprate",
        "o2sat",
        "sbp",
        "dbp",
        "age_on_adm",
    ],
]

In [14]:
# Column key -> human-readable label, in the order the columns are plotted.
col_names_to_real_names = dict(
    temperature="Temperature",
    heartrate="Heart rate",
    resprate="Respiratory rate",
    o2sat="Oxygen saturation",
    sbp="Systolic blood pressure",
    dbp="Diastolic blood pressure",
    age_on_adm="Age",
)

In [None]:
fig, (ax, ax2) = plt.subplots(2, 1, figsize=(8, 10))

# Top panel: every value of every column, on a log axis so extreme values fit.
sns.boxplot(data=vparams_df, ax=ax)
ax.set_yscale("log")
ax.set_title("All values (log scale)")

# Trim each column independently to its own 2nd-98th percentile range.
# Values outside the range are masked to NaN rather than dropping the whole
# row, so an extreme value (or a missing value) in one column does not discard
# the other columns' values on that row. Bounds are inclusive.
lower_bounds = vparams_df.quantile(0.02)
upper_bounds = vparams_df.quantile(0.98)
within_bounds = vparams_df.ge(lower_bounds, axis="columns") & vparams_df.le(
    upper_bounds, axis="columns"
)
trimmed_df = vparams_df.where(within_bounds)

# Bottom panel: the trimmed values on a fixed linear axis.
sns.boxplot(data=trimmed_df, ax=ax2)
ax2.set_ylim(0, 200)
ax2.set_title("Values within each column's 2nd-98th percentile")

plt.tight_layout()
plt.show()

In [26]:
def make_three_plots(
    x: pd.Series, name: str, out_dir: str = "../graphs", dpi: int = 1200
):
    """Draw raw and trimmed boxplots of one series side by side and save them.

    Two panels are produced: the left one shows every value on a log scale,
    the right one shows only values inside the series' own 2nd-98th percentile
    range (bounds inclusive).

    Parameters
    ----------
    x : pd.Series
        Values to plot.
    name : str
        Display name used in the figure title, the y-axis labels and (with
        spaces removed) the output file name.
    out_dir : str, default "../graphs"
        Directory the PNG is written to; created if it does not exist.
    dpi : int, default 1200
        Resolution passed to ``savefig``.
    """
    # Compute both cut-offs in one pass instead of once per comparison.
    lower, upper = x.quantile([0.02, 0.98])
    filtered_series = x[(x >= lower) & (x <= upper)]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
    fig.suptitle(f"Boxplot distribution of feature: {name}")

    sns.boxplot(ax=ax1, y=x, log_scale=True)
    ax1.set_title("All values (log scale)")
    ax1.set_ylabel(name)

    sns.boxplot(ax=ax2, y=filtered_series)
    ax2.set_title("2nd-98th percentile")
    ax2.set_ylabel(name)

    # savefig does not create missing directories, so make sure the target exists.
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(
        target_dir / f"{name.replace(' ', '')}_box_dist.png", format="png", dpi=dpi
    )

In [64]:
def analyze_series(series: pd.Series, confidence: float = 0.95):
    """Print and return summary statistics for a numeric series.

    Missing values are dropped first. The confidence interval of the mean is
    a Student-t interval built from the sample mean and its standard error.

    Parameters
    ----------
    series : pd.Series
        Numeric values to summarise.
    confidence : float, default 0.95
        Confidence level of the interval around the mean.

    Returns
    -------
    dict
        Keys ``n``, ``mean``, ``sd``, ``ci_low``, ``ci_high``, ``q1``,
        ``median`` and ``q3``. The interval bounds are NaN when fewer than
        two non-missing observations are available.
    """
    series = series.dropna()
    n_obs = len(series)
    mean = series.mean()
    sd = series.std()

    if n_obs > 1:
        ci = stats.t.interval(
            confidence=confidence, df=n_obs - 1, loc=mean, scale=stats.sem(series)
        )
    else:
        # A t interval needs at least one degree of freedom.
        ci = (float("nan"), float("nan"))

    quartiles = series.quantile([0.25, 0.5, 0.75])

    print(f"Mean: {mean:.2f} ({confidence:.0%} CI: {ci[0]:.2f} to {ci[1]:.2f})")
    print(f"Standard Deviation: {sd:.2f}")
    print("\nQuartiles:")
    print(f"25th percentile (Q1): {quartiles[0.25]:.2f}")
    print(f"50th percentile (Median): {quartiles[0.5]:.2f}")
    print(f"75th percentile (Q3): {quartiles[0.75]:.2f}")

    return {
        "n": n_obs,
        "mean": float(mean),
        "sd": float(sd),
        "ci_low": float(ci[0]),
        "ci_high": float(ci[1]),
        "q1": float(quartiles[0.25]),
        "median": float(quartiles[0.5]),
        "q3": float(quartiles[0.75]),
    }

In [None]:
# One saved boxplot figure per numeric column, titled with its display label.
for column_key in col_names_to_real_names:
    make_three_plots(vparams_df[column_key], col_names_to_real_names[column_key])

In [None]:
# Print the summary statistics of each numeric column under its column key.
for column_key in col_names_to_real_names.keys():
    print(column_key)
    analyze_series(vparams_df[column_key])
    print("\n")

In [51]:
# Categorical column key -> human-readable label.
catcol_to_real_names = {
    "pain": "Pain scale",
    "acuity": "Acuity",
    "gender": "Gender",
}

In [52]:
# Take an independent copy of the categorical columns: a later cell rewrites
# the "pain" column, and assigning into a plain slice of `df` triggers
# SettingWithCopyWarning.
cat_frame = df[list(catcol_to_real_names)].copy()

In [None]:
# Keep all-digit pain entries as they are and collapse every other text entry
# into a single "non-numeric" category.
# `.ne(False)` keeps a row unless its digit check is explicitly False, so rows
# whose check comes back missing (NaN) are left untouched.
# NOTE(review): an unfinished second `np.where(cat_frame[])` statement used to
# follow here and made the whole cell a SyntaxError; the rule it was meant to
# apply cannot be recovered from the notebook and has been left out.
keep_as_is = cat_frame["pain"].str.isdigit().ne(False)
# `assign` builds a new frame, so nothing is written into a slice of `df`.
cat_frame = cat_frame.assign(
    pain=cat_frame["pain"].where(keep_as_is, "non-numeric")
)

In [70]:
def make_cat_plots(x: pd.Series, name: str):
    """Show a bar chart of how often each category occurs in ``x``.

    Parameters
    ----------
    x : pd.Series
        Categorical values to count.
    name : str
        Display name used for the plot title and the x-axis label.
    """
    fig, ax = plt.subplots()
    sns.countplot(x=x, ax=ax)
    # Label the figure so it can be read on its own.
    ax.set_title(f"Counts per category: {name}")
    ax.set_xlabel(name)
    plt.show()

In [None]:
# Plot each categorical column, passing its human-readable label (not the raw
# column key) as the display name.
for column_key, label in catcol_to_real_names.items():
    make_cat_plots(cat_frame[column_key], label)

In [61]:
def make_histplot(x: pd.Series, name: str):
    """Show a histogram of the values in ``x``.

    Parameters
    ----------
    x : pd.Series
        Values to bin.
    name : str
        Display name used for the plot title and the x-axis label.
    """
    fig, ax = plt.subplots()
    sns.histplot(x=x, ax=ax)
    # Label the figure so it can be read on its own.
    ax.set_title(f"Distribution of {name}")
    ax.set_xlabel(name)
    plt.show()

In [62]:
# Columns that get a histogram and a statistics summary.
hist_frame = df.loc[:, ["age_on_adm", "los"]]

In [None]:
# For each histogram column: print its key, its summary statistics, then plot it.
for column_key in hist_frame.columns:
    column_values = hist_frame[column_key]
    print(column_key)
    analyze_series(column_values)
    make_histplot(column_values, column_key)

In [None]:
# Frequency of each distinct value in the arrival_transport column.
df["arrival_transport"].value_counts()