# No Show EDA

Exploratory data analysis of the first no-show data dump.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from noshow.preprocessing.geo import haversine_distance
from noshow.visualisation.features_plots import feature_barplot, feature_scatter

## Read and clean data

In [None]:
# Load the raw outpatient appointment dump; only "created" is parsed as a date
# while reading.
appointments_df = pd.read_csv(
    "/mapr/nicu_ew/nicu_ew_onderzoeker/data/no_show/poliafspraken_no_show.csv",
    parse_dates=["created"],
)

# The other timestamp columns are parsed leniently: values that cannot be
# parsed become NaT instead of raising.
for timestamp_column in ("start", "end", "gearriveerd"):
    appointments_df[timestamp_column] = pd.to_datetime(
        appointments_df[timestamp_column], errors="coerce"
    )

# Label every appointment as "no_show" or "show" based on its cancelation code.
no_show_codes = ["M", "C2", "C3", "0000000010", "D1", "N", "E1"]
is_no_show = appointments_df["cancelationReason_code"].isin(no_show_codes)
appointments_df["no_show"] = is_no_show.replace({True: "no_show", False: "show"})

# Some patients have multiple postal codes: rows that are identical in every
# column except the postal code are collapsed to a single row.
columns_except_postal_code = appointments_df.columns.difference(
    ["address_postalCodeNumbersNL"]
)
appointments_df = appointments_df.drop_duplicates(subset=columns_except_postal_code)

# Keep only appointments with a known start (some are NaT) that are not phone
# consults.
has_start = appointments_df["start"].notna()
not_by_phone = appointments_df["soort_consult"] != "Telefonisch"
appointments_df = appointments_df.loc[has_start & not_by_phone]


In [None]:
# Display the (rows, columns) shape of the cleaned appointments table.
appointments_df.shape

## Plot no-shows

In [None]:
# Count the no-show appointments per cancelation reason (display text).
cancelation_reason_count = (
    appointments_df[appointments_df["no_show"] == "no_show"]
    .groupby("cancelationReason_display")["no_show"]
    .count()
)

fig, ax = plt.subplots(figsize=(10, 8))

# One bar per cancelation reason, annotated with its count.
bar_container = ax.bar(cancelation_reason_count.index, cancelation_reason_count)
ax.bar_label(bar_container)
# Rotate the (long) reason labels so they stay readable.
ax.set_xticks(range(len(cancelation_reason_count.index)))
ax.set_xticklabels(cancelation_reason_count.index, rotation=45, ha="right")
ax.set_title("Cancelation reasons for no-shows")
# The original passed the string "h_pad" to set_tight_layout, which expects a
# bool or a dict of padding values; the string only worked because it is
# truthy. Request the tight layout explicitly instead.
fig.tight_layout()
plt.show()

In [None]:
# Count show / no_show appointments per specialty: one row per specialty, one
# column per outcome. A specialty that lacks one of the outcomes gets 0 rather
# than NaN (NaN would otherwise propagate into the running `bottom` and hide
# every bar stacked on top of it), and the column order is pinned so that it
# always matches the bar labels used below.
no_show_per_specialisation = (
    appointments_df.groupby("specialty_code")["no_show"]
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=["no_show", "show"], fill_value=0)
)

# Stacked bar chart: every specialty is drawn on top of the previous ones.
bottom = np.zeros(2)
fig, ax = plt.subplots(figsize=(10, 4))
for idx, row in no_show_per_specialisation.iterrows():
    heights = row.to_numpy()
    ax.bar(["no-show", "show"], heights, label=idx, bottom=bottom)
    # Build a new array so the array already handed to matplotlib is untouched.
    bottom = bottom + heights
ax.legend()

plt.show()


In [None]:
# Share of every specialty within the "no_show" and within the "show"
# appointments: one row per specialty, one column per outcome. A specialty
# that is absent from one outcome gets a share of 0 rather than NaN (NaN would
# otherwise propagate into the running `bottom`), and the column order is
# pinned so that it always matches the bar labels used below.
no_show_percent = (
    appointments_df.groupby("no_show")["specialty_code"]
    .value_counts(normalize=True)
    .unstack(level="no_show", fill_value=0)
    .reindex(columns=["no_show", "show"], fill_value=0)
)
fig, ax = plt.subplots(figsize=(10, 4))

# Stacked bar chart: every specialty is drawn on top of the previous ones.
bottom = np.zeros(2)
for idx, row in no_show_percent.iterrows():
    heights = row.to_numpy()
    ax.bar(["no-show", "show"], heights, label=idx, bottom=bottom)
    # Build a new array so the array already handed to matplotlib is untouched.
    bottom = bottom + heights

ax.legend()

plt.show()


In [None]:
# Work on a copy and truncate every appointment start to the first timestamp
# of its calendar year.
no_show_over_time = appointments_df.copy()
start_of_year = no_show_over_time["start"].dt.to_period("Y").dt.to_timestamp()
no_show_over_time["start"] = start_of_year

# Count show / no_show appointments per (specialty, year); the outcomes become
# the columns.
counts_per_year = no_show_over_time.groupby(["specialty_code", "start"])[
    "no_show"
].value_counts(normalize=False)
no_show_over_time = counts_per_year.unstack(level="no_show")

fig, ax = plt.subplots(figsize=(12, 5))

# One line per specialty: number of no-shows against the year.
for idx in no_show_over_time.index.unique(level="specialty_code"):
    specialty_counts = no_show_over_time.loc[idx]
    ax.plot(specialty_counts.index, specialty_counts["no_show"], label=idx)
ax.legend()
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.set_title("Number of no-shows per specialty per year")

plt.show()


## Calculate earlier no-shows

In [None]:
# Index the appointments by patient and start time, ordered by start time, so
# that the group-wise shift / cumulative operations below run in chronological
# order within each patient.
appointments_features = appointments_df.set_index(["pseudo_id", "start"])
appointments_features = appointments_features.sort_index(level="start")

# Encode the outcome as 1 (no_show) / 0 (show). An explicit comparison is used
# instead of `replace` with a str -> int mapping, which relies on the implicit
# object -> int downcasting that pandas has deprecated.
appointments_features["prev_no_show"] = (
    appointments_features["no_show"].eq("no_show").astype(int)
)

# Number of no-shows of the same patient strictly before this appointment:
# shift by one within the patient (the first appointment gets 0), then
# accumulate.
appointments_features["prev_no_show"] = (
    appointments_features.groupby(level="pseudo_id")["prev_no_show"]
    .shift(1, fill_value=0)
    .groupby(level="pseudo_id")
    .cumsum()
)

# Number of earlier appointments of the same patient (0 for the first one).
appointments_features["earlier_appointments"] = appointments_features.groupby(
    level="pseudo_id"
)["no_show"].cumcount()

# Fraction of earlier appointments that were no-shows. A first appointment
# yields 0 / 0 = NaN, which is defined as 0.
appointments_features["prev_no_show_perc"] = (
    appointments_features["prev_no_show"]
    / appointments_features["earlier_appointments"]
).fillna(0)

### Plot earlier no show features

In [None]:
# Plot the "prev_no_show" column with the project helper feature_barplot
# (imported from noshow.visualisation.features_plots).
# NOTE(review): presumably shows the no-show rate per feature value - confirm
# against the helper's implementation.
feature_barplot(appointments_features, "prev_no_show", feature_name="previous no-shows")
plt.show()

In [None]:
# Plot the "prev_no_show_perc" column with the project helper feature_barplot.
# NOTE(review): perc_feature / round_decimals presumably bin the fraction by
# rounding it to one decimal - confirm against the helper's implementation.
feature_barplot(
    appointments_features,
    "prev_no_show_perc",
    feature_name="previous no-show percentage",
    perc_feature=True,
    round_decimals=1,
)
plt.show()

## Calculate patient features

### Age

In [None]:
# Age at the appointment, approximated as the calendar year of the
# appointment start minus the year of birth.
appointment_year = appointments_features.index.get_level_values("start").year
appointments_features["age"] = appointment_year - appointments_features["BIRTH_YEAR"]

### Postal codes

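Postal code coordinates are read from the GeoNames dump for the Netherlands (`NL.txt`), noted as coming from https://download.geonames.org/export/dump/. The 12-column layout read below matches the postal code files published under https://download.geonames.org/export/zip/, so the source URL should be double-checked.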

In [None]:
# Column names for the header-less, tab-separated postal code file.
postalcode_columns = [
    "country",
    "postalcode",
    "city",
    "admin_name1",
    "admin_code1",
    "admin_name2",
    "admin_code2",
    "admin_name3",
    "admin_code3",
    "latitude",
    "longitude",
    "accuracy",
]
all_postalcodes = pd.read_table(
    "/mapr/nicu_ew/nicu_ew_onderzoeker/rpeters7/No_Show/data/raw/NL.txt",
    sep="\t",
    header=None,
    names=postalcode_columns,
)

# Keep only the coordinates, indexed by postal code; when a postal code occurs
# more than once, only its first row is kept.
coordinates = all_postalcodes.set_index("postalcode")[["latitude", "longitude"]]
repeated_postalcode = coordinates.index.duplicated(keep="first")
all_postalcodes = coordinates.loc[~repeated_postalcode]
all_postalcodes

In [None]:
# Attach latitude / longitude to every appointment via the patient's postal
# code. This is an inner join: appointments whose postal code is not present
# in the lookup table are dropped.
appointments_features = pd.merge(
    appointments_features,
    all_postalcodes,
    how="inner",
    left_on="address_postalCodeNumbersNL",
    right_index=True,
)


def _distance_for_row(appointment_row):
    # NOTE(review): haversine_distance is a project helper called with a single
    # point; presumably it returns the distance to the UMCU - confirm.
    return haversine_distance(
        appointment_row["latitude"], appointment_row["longitude"]
    )


appointments_features["dist_umcu"] = appointments_features.apply(
    _distance_for_row, axis=1
)

### Plot patient features

In [None]:
# Plot the "age" column with the project helper feature_scatter
# (imported from noshow.visualisation.features_plots).
# NOTE(review): presumably relates the feature to the no-show outcome -
# confirm against the helper's implementation.
feature_scatter(appointments_features, "age")
plt.show()


In [None]:
# Plot the "dist_umcu" column with the project helper feature_scatter.
# NOTE(review): round_feature presumably rounds the distance before
# aggregating - confirm against the helper's implementation.
feature_scatter(
    appointments_features,
    "dist_umcu",
    feature_name="distance to UMCU",
    round_feature=True,
)
plt.show()

## Calculate patient punctuality

In [None]:
# Minutes between the planned start and the registered arrival
# ("gearriveerd"): positive when the patient arrived before the start,
# negative when late.
planned_start = appointments_features.index.get_level_values(level="start")
time_before_start = planned_start - appointments_features["gearriveerd"]
minutes_before_start = time_before_start.dt.total_seconds() / 60

# Differences of more than an hour in either direction are reset to 0;
# missing arrival times stay NaN.
more_than_an_hour_off = minutes_before_start.abs() > 60
appointments_features["minutes_early"] = minutes_before_start.mask(
    more_than_an_hour_off, 0
)

appointments_features["minutes_early"].plot.hist(bins=10)


In [None]:
# Chronological order is required for the group-wise shift / cumulative sum.
appointments_features = appointments_features.sort_index(level="start")

# Appointments without an arrival time have NaN in "minutes_early". A grouped
# cumulative sum returns NaN at such positions, and that NaN was subsequently
# replaced by 0, wiping the patient's punctuality history for the appointment
# that follows a missing arrival. Counting a missing arrival as 0 minutes
# keeps the running sum intact.
minutes_early_filled = appointments_features["minutes_early"].fillna(0)

# Average minutes early over all earlier appointments of the same patient:
# running sum of the previous appointments divided by their number.
appointments_features["prev_minutes_early"] = (
    minutes_early_filled.groupby(level="pseudo_id")
    .shift(1, fill_value=0)
    .groupby(level="pseudo_id")
    .cumsum()
    / appointments_features["earlier_appointments"]
)

# A first appointment divides by zero earlier appointments (0 / 0 = NaN, or
# +/-inf for a non-zero numerator); define the feature as 0 there.
appointments_features["prev_minutes_early"] = appointments_features[
    "prev_minutes_early"
].replace([np.inf, -np.inf, np.nan], 0)


### Plot punctuality vs no show

In [None]:
# Plot the "prev_minutes_early" column with the project helper feature_scatter.
# NOTE(review): round_feature presumably rounds the feature before
# aggregating - confirm against the helper's implementation.
feature_scatter(
    appointments_features,
    "prev_minutes_early",
    feature_name="previous minutes too early",
    round_feature=True,
)
plt.show()

## Add time features

In [None]:
# Derive calendar features from the appointment start stored in the index:
# the day of the week (Monday = 0) and the hour of the day.
appointment_start = appointments_features.index.get_level_values("start")
appointments_features["weekday"] = appointment_start.weekday
appointments_features["hour"] = appointment_start.hour


In [None]:
# Plot the "hour" column with the project helper feature_barplot.
feature_barplot(appointments_features, "hour")
plt.show()


In [None]:
# Plot the "weekday" column with the project helper feature_barplot.
feature_barplot(appointments_features, "weekday")
plt.show()


## Plot categorical features

In [None]:
# Plot the "specialty_code" column with the project helper feature_barplot.
feature_barplot(appointments_features, "specialty_code")
plt.show()


In [None]:
# Plot the "appointmentType_code" column with the project helper
# feature_barplot. The figure is shown explicitly, like in every other
# plotting cell of this notebook.
feature_barplot(appointments_features, "appointmentType_code")
plt.show()


In [None]:
# Tabulate show / no_show counts per appointment type (missing outcomes are
# counted as well); the outcomes become the columns.
outcome_counts_per_type = appointments_features.groupby("appointmentType_code")[
    "no_show"
].value_counts(normalize=False, dropna=False)
outcome_counts_per_type.unstack(level="no_show")

## Save featuretable

In [None]:
# Columns that make up the feature table, in the order they are written.
feature_columns = [
    "hour",
    "weekday",
    "specialty_code",
    # "appointmentType_code", # Figure out what's going wrong here
    "minutesDuration",
    "no_show",
    "prev_no_show",
    "prev_no_show_perc",
    "age",
    "dist_umcu",
    "prev_minutes_early",
]
featuretable_path = "/mapr/nicu_ew/nicu_ew_onderzoeker/rpeters7/No_Show/data/processed/featuretable.parquet"

# Write the selected features (together with the frame's index) to parquet.
appointments_features[feature_columns].to_parquet(featuretable_path)