# No Show EDA

Exploratory data analysis of the first no-show data dump.

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np

from noshow.preprocessing.load_data import process_appointments, process_postal_codes, load_appointment_csv
from noshow.visualisation.features_plots import feature_barplot, feature_scatter
from noshow.features.no_show_features import prev_no_show_features
from noshow.features.appointment_features import add_days_since_created, add_appointments_same_day, add_minutes_early, add_time_features, add_days_since_last_appointment, add_appointments_last_days
from noshow.features.patient_features import add_patient_features

## Read and clean data

In [None]:
# Read the raw appointment export and pass it straight through the project's
# cleaning step; the trailing expression shows the resulting shape as cell output.
appointments_df = process_appointments(
    load_appointment_csv("../data/raw/poliafspraken_no_show.csv")
)
appointments_df.shape

## Set policlinic name

In [None]:
# Clinic label -> values of the "name" column that belong to that clinic.
clinic_codes = {
    "Longziekten": ["UG", "AAV", "AZ", "BN", "BF", "ABG", "BE", "BO", "EG", "BG"],
    "Sport & Revalidatie": ["MA", "MB"],
    "WKZ": [
        "WQ",
        "WY",
        "WV",
        "WW",
        "WL",
        "W12",
        "WS",
        "K1",
        "K5",
        "W1",
        "WP",
        "W2",
        "KW",
        "WM",
        "W#",
        "W*",
        "WR",
        "W4",
    ],
}

# Label every matching row; rows whose "name" is in none of the lists are not
# assigned a clinic here.
for clinic_label, codes in clinic_codes.items():
    appointments_df.loc[appointments_df["name"].isin(codes), "clinic"] = clinic_label

appointments_df["clinic"]

## Plot no-shows

In [None]:
# Number of no-show appointments per cancelation reason.
cancelation_reason_count = (
    appointments_df[appointments_df["no_show"] == "no_show"]
    .groupby("cancelationReason_display")["no_show"]
    .count()
)

fig, ax = plt.subplots(figsize=(10, 8))

# One bar per reason, annotated with its count.
bar_container = ax.bar(cancelation_reason_count.index, cancelation_reason_count)
ax.bar_label(bar_container)
# Fix the tick positions before assigning the rotated labels.
ax.set_xticks(range(len(cancelation_reason_count.index)))
ax.set_xticklabels(cancelation_reason_count.index, rotation=45, ha="right")
ax.set_title("Cancelation reasons for no-shows")
# set_tight_layout() expects a bool or a dict of padding options; the string
# "h_pad" only worked because it is truthy. Request the tight layout directly.
fig.tight_layout()
plt.show()


In [None]:
# Appointment counts with one row per clinic and one column per no_show value.
# fill_value=0 prevents NaN for a clinic that lacks one of the outcomes; a NaN
# would otherwise end up in the running `bottom` and hide every later segment.
no_show_per_specialisation = (
    appointments_df.groupby("clinic")["no_show"]
    .value_counts()
    .unstack(fill_value=0)
)

# Stacked bars: each clinic's segment starts where the previous one ended.
# NOTE(review): the tick labels assume the columns are ordered (no_show, show)
# and that there are exactly two of them - confirm against the data.
bottom = np.zeros(2)
fig, ax = plt.subplots(figsize=(10, 4))
for clinic, counts in no_show_per_specialisation.iterrows():
    ax.bar(["no-show", "show"], counts, label=clinic, bottom=bottom)
    bottom += counts.to_numpy()
ax.legend()

plt.show()

In [None]:
# Share of each clinic within every no_show value: rows are clinics, columns
# are no_show values, and each column sums to 1.
# fill_value=0 prevents NaN for a clinic that is absent from one of the outcome
# groups; a NaN would otherwise end up in the running `bottom` and hide every
# later segment.
no_show_percent = (
    appointments_df.groupby("no_show")["clinic"]
    .value_counts(normalize=True)
    .unstack(level="no_show", fill_value=0)
)
fig, ax = plt.subplots(figsize=(10, 4))

# Stacked bars: each clinic's segment starts where the previous one ended.
# NOTE(review): the tick labels assume the columns are ordered (no_show, show)
# and that there are exactly two of them - confirm against the data.
bottom = np.zeros(2)
for clinic, shares in no_show_percent.iterrows():
    ax.bar(["no-show", "show"], shares, label=clinic, bottom=bottom)
    bottom += shares.to_numpy()

ax.legend()

plt.show()

In [None]:
# Per clinic and calendar month: the fraction of appointments for each no_show
# value. "start" is moved out of the index and truncated to the first day of
# its month before grouping.
no_show_over_time = (
    appointments_df.reset_index()
    .assign(start=lambda df: df["start"].dt.to_period("M").dt.to_timestamp())
    .groupby(["clinic", "start"])["no_show"]
    .value_counts(normalize=True)
    .unstack(level="no_show")
)

fig, ax = plt.subplots(figsize=(12, 5))
# One line per clinic showing its monthly "no_show" fraction.
for clinic in no_show_over_time.index.unique(level="clinic"):
    clinic_rates = no_show_over_time.loc[clinic, "no_show"]
    ax.plot(clinic_rates.index, clinic_rates, label=clinic)
ax.legend()
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.set_title("Percentage of no-shows over time")

plt.show()

In [None]:
# Number of appointments per clinic and calendar month. "start" is moved out of
# the index and truncated to the first day of its month before grouping.
no_show_over_time = appointments_df.copy().reset_index()
no_show_over_time["start"] = (
    no_show_over_time["start"].dt.to_period("M").dt.to_timestamp()
)
# .count() returns a Series indexed by (clinic, start); there is no "no_show"
# column left to select from.
no_show_over_time = no_show_over_time.groupby(["clinic", "start"])["no_show"].count()

fig, ax = plt.subplots(figsize=(12, 5))
for clinic in no_show_over_time.index.unique(level="clinic"):
    # Selecting on the first index level leaves a Series indexed by month.
    # The earlier `.loc[idx, "no_show"]` looked "no_show" up in the datetime
    # level of the index and failed.
    clinic_counts = no_show_over_time.loc[clinic]
    ax.plot(clinic_counts.index, clinic_counts, label=clinic)
ax.legend()
ax.spines.top.set_visible(False)
ax.spines.right.set_visible(False)
ax.set_title("number of appointments over time")

plt.show()

In [None]:
# Month number (1-12) of each appointment, taken from the "start" index level.
appointments_df["month"] = appointments_df.index.get_level_values("start").month
# Dutch month abbreviations, used as tick labels in calendar order.
month_names = [
    "jan",
    "feb",
    "mrt",
    "apr",
    "mei",
    "jun",
    "jul",
    "aug",
    "sep",
    "okt",
    "nov",
    "dec",
]

# Per month and clinic: the fraction of appointments for each no_show value.
no_show_month = (
    appointments_df.groupby(["month", "clinic"])["no_show"]
    .value_counts(normalize=True)
    .unstack(level="no_show")
)

# One subplot per clinic actually present, instead of a hard-coded three.
# squeeze=False keeps the axes indexable even when there is only one subplot.
clinics = no_show_month.index.unique(level="clinic")
fig, ax = plt.subplots(max(len(clinics), 1), figsize=(10, 4), squeeze=False)
ax = ax.ravel()

for idx, clinic in enumerate(clinics):
    # Align on months 1..12 so the values always match the 12 labels, even
    # when a clinic has no appointments in some month (that bar stays empty).
    plot_data = no_show_month.xs(clinic, level="clinic")["no_show"].reindex(
        range(1, len(month_names) + 1)
    )
    ax[idx].bar(month_names, plot_data, label=clinic)
    ax[idx].legend()

ax[0].set_title("Average percentage of no-show per month")
# set_tight_layout() expects a bool or a dict of padding options; the string
# "rect" only worked because it is truthy. Request the tight layout directly.
fig.tight_layout()
plt.show()

## Calculate earlier no-shows

In [None]:
# Start the feature table from the cleaned appointments.
# NOTE(review): presumably this adds the "prev_no_show", "prev_no_show_perc" and
# "earlier_appointments" columns plotted in the next cells - confirm in
# noshow.features.no_show_features.
appointments_features = prev_no_show_features(appointments_df)


In [None]:
# Bar plot for the "prev_no_show" column, labelled "previous no-shows".
ax = feature_barplot(
    appointments_features, "prev_no_show", feature_name="previous no-shows"
)
# Rotate the existing tick labels in place. Re-assigning them through
# set_xticklabels(get_xticklabels()) without fixed tick positions can misplace
# or blank the labels; tick_params is also what the specialty_code plot uses.
ax.tick_params("x", labelrotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bar plot for the "prev_no_show_perc" column.
# NOTE(review): perc_feature / round_decimals presumably bucket the percentage
# to one decimal before plotting - confirm in
# noshow.visualisation.features_plots.
feature_barplot(
    appointments_features,
    "prev_no_show_perc",
    feature_name="previous no-show percentage",
    perc_feature=True,
    round_decimals=1,
)
plt.show()


In [None]:
# Scatter plot for the "earlier_appointments" column (project helper; its exact
# axes are defined in noshow.visualisation.features_plots).
feature_scatter(appointments_features, "earlier_appointments")
plt.show()


### Calculate days since created

In [None]:
# Extend the feature table; the next cell plots the "days_since_created" column,
# presumably added here - confirm in noshow.features.appointment_features.
appointments_features = add_days_since_created(appointments_features)

In [None]:
# Scatter plot for the "days_since_created" column.
feature_scatter(appointments_features, "days_since_created")
plt.show()

### Calculate appointments on the same day

In [None]:
# Extend the feature table; the next cell plots the "appointments_same_day"
# column, presumably added here - confirm in
# noshow.features.appointment_features.
appointments_features = add_appointments_same_day(appointments_features)

In [None]:
# Bar plot for the "appointments_same_day" column.
feature_barplot(appointments_features, "appointments_same_day")
plt.show()

In [None]:
# Extend the feature table with two more helpers; the next cells plot
# "days_since_last_appointment" and "appointments_last_days", presumably added
# here - confirm in noshow.features.appointment_features.
appointments_features = add_days_since_last_appointment(appointments_features)
appointments_features = add_appointments_last_days(appointments_features)

In [None]:
# Scatter plot for the "days_since_last_appointment" column.
feature_scatter(appointments_features, "days_since_last_appointment")
plt.show()

In [None]:
# Bar plot for the "appointments_last_days" column.
feature_barplot(appointments_features, "appointments_last_days")
plt.show()

## Calculate patient features

### Postal codes

Using the dump from https://download.geonames.org/export/dump/

In [None]:
# Load the postal-code table from the raw NL.txt file and use it to add patient
# features to the feature table.
# NOTE(review): the next cells plot "age" and "dist_umcu", presumably added
# here - confirm in noshow.features.patient_features.
all_postalcodes = process_postal_codes("../data/raw/NL.txt")
appointments_features = add_patient_features(appointments_features, all_postalcodes)


### Plot patient features

In [None]:
# Scatter plot for the "age" column.
feature_scatter(appointments_features, "age")
plt.show()

In [None]:
# Scatter plot for the "dist_umcu" column, labelled "distance to UMCU".
# NOTE(review): round_feature presumably rounds the values before plotting -
# confirm in noshow.visualisation.features_plots.
feature_scatter(
    appointments_features,
    "dist_umcu",
    feature_name="distance to UMCU",
    round_feature=True,
)
plt.show()


## Calculate patient punctuality

In [None]:
# Extend the feature table; the next cells use "minutes_early" and
# "prev_minutes_early", presumably added here - confirm in
# noshow.features.appointment_features.
appointments_features = add_minutes_early(appointments_features)

In [None]:
# Histogram of the "minutes_early" column in 10 bins.
appointments_features["minutes_early"].plot.hist(bins=10)
# Render explicitly, as every other plotting cell does; this also keeps the
# Axes repr out of the cell output.
plt.show()

### Plot punctuality vs no show

In [None]:
# Scatter plot for the "prev_minutes_early" column, labelled
# "previous minutes too early".
# NOTE(review): round_feature presumably rounds the values before plotting -
# confirm in noshow.visualisation.features_plots.
feature_scatter(
    appointments_features,
    "prev_minutes_early",
    feature_name="previous minutes too early",
    round_feature=True,
)
plt.show()


## Add time features

In [None]:
# Extend the feature table; the next cells plot "hour" and "weekday", presumably
# added here - confirm in noshow.features.appointment_features.
appointments_features = add_time_features(appointments_features)

In [None]:
# Bar plot for the "hour" column.
feature_barplot(appointments_features, "hour")
plt.show()

In [None]:
# Bar plot for the "weekday" column.
feature_barplot(appointments_features, "weekday")
plt.show()

## Plot categorical features

In [None]:
# Bar plot for the "specialty_code" column; the returned axes are kept so the
# x tick labels can be rotated by 45 degrees.
ax = feature_barplot(appointments_features, "specialty_code")
ax.tick_params("x", labelrotation=45)
plt.show()