In [None]:
import os

import pandas as pd
import numpy as np
from tqdm.auto import tqdm  # for notebooks
import matplotlib.pyplot as plt
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [None]:
def choose_target_period(pat, mins_before_first_icp=60 * 24, target_name="ICP_Vital",
                         min_val=-10, max_val=105):
    """Restrict one patient's long-format rows to the period around the target measurements.

    The period starts ``mins_before_first_icp`` minutes before the first in-range
    target measurement and ends at the last in-range target measurement.
    ``rel_time`` of the returned rows is shifted so that the earliest kept row is 0.

    Args:
        pat: Long-format frame of a single patient with at least the columns
            "Maßnahme", "Wert" and "rel_time".
        mins_before_first_icp: Minutes to keep before the first valid target value.
        target_name: Value of "Maßnahme" that identifies the target measurements.
        min_val: Smallest target value (inclusive) that is not treated as an outlier.
        max_val: Largest target value (inclusive) that is not treated as an outlier.

    Returns:
        A new DataFrame (the input is never modified). It is empty, with the same
        columns, if the patient has no in-range target measurement.

    Note:
        Out-of-range target rows are only ignored when the period boundaries are
        determined; rows lying inside the period are returned unchanged.
    """
    target_steps = pat[pat["Maßnahme"] == target_name]
    target_steps = target_steps[target_steps["Wert"].between(min_val, max_val)]
    if target_steps.empty:
        return pat.iloc[0:0]

    # Series.min()/max() skip NaN, unlike the builtins whose result depends on order.
    first_target_time = target_steps["rel_time"].min()
    last_target_time = target_steps["rel_time"].max()
    earliest_allowed_time = first_target_time - mins_before_first_icp

    # Drop everything outside the region of interest. The explicit copy makes the
    # in-place shift below operate on an independent frame instead of a slice.
    in_period = (pat["rel_time"] >= earliest_allowed_time) & (pat["rel_time"] <= last_target_time)
    pat = pat[in_period].copy()

    # Re-anchor rel_time so that the kept period starts at 0.
    pat["rel_time"] -= pat["rel_time"].min()
    return pat

In [None]:
import sys
PYTHON_PATH = sys.executable
# install pyarrow
!{PYTHON_PATH} -m pip install pyarrow

In [None]:
load_icp = True

if load_icp:
    df_long = pd.read_parquet("data/Datenbank_icp.parquet", engine='pyarrow')
    if "Unnamed: 0" in df_long:
        df_long = df_long.drop(columns=["Unnamed: 0"])
else:
    if not os.path.exists("data/Datenbank_Werte.parquet"):
        df_long = pd.read_csv("data/Datenbank_Werte.csv")
        # drop where Pat_ID or Wert is NaN
        df_long = df_long.dropna(subset=["Pat_ID", "Wert"])
        # convert Pat_ID to int by removing "_" to save it more efficiently with parquet
        df_long["Pat_ID"] = df_long["Pat_ID"].apply(lambda x: int(str(x).replace("_", ""))).astype(int)
        df_long.to_parquet("data/Datenbank_Werte.parquet")
    df_long = pd.read_parquet("data/Datenbank_Werte.parquet", engine='pyarrow')
    print("Loaded data")
    # drop eICU
    #df_long = df_long[df_long["DB"] != "eICU"]
    
    # add type to Maßnahme
    df_long["Maßnahme"] += "_" + df_long["ID"]
    df_long = df_long.drop(columns="ID")

    # drop measurements where we have a "NAN"
    print("Drop NaN measurements")
    df_long = df_long.dropna(subset=["Wert"])
    
    # drop duplicates
    print("Dropping duplicates...")
    df_long = df_long.drop_duplicates(subset=["Pat_ID", "Maßnahme", "rel_time", "Wert"])
    
    # fix rel_times to ICP, drop steps after last ICP measurement, filter ICP outliers
    # use 0.999 quantile = -10 to 105
    print("Fixing ICP times...")
    df_long = df_long.groupby("Pat_ID").apply(lambda pat: 
                                choose_target_period(pat, mins_before_first_icp=60 * 24, 
                                                     target_name="ICP_Vital",
                                                     min_val=-10, max_val=105))
    df_long = df_long.reset_index(drop=True)

    # merge "NBD" (non-invasive ways of measuring blood pressure) with invasive ways as they measure the same thing
    print("Merging NBD...")
    def rename_nbd(name):
        """Collapse a measure name onto the shared systolic / diastolic / mean label.

        Names containing "syst", "diast" or "mittl" (checked in that order, first
        match wins) are replaced by "<fragment>_Vital"; all other names are
        returned unchanged.
        """
        for fragment in ("syst", "diast", "mittl"):
            if fragment in name:
                return fragment + "_Vital"
        return name
    df_long["Maßnahme"] = df_long["Maßnahme"].apply(rename_nbd)
    # save    
    df_long.to_parquet("data/Datenbank_icp.parquet", engine='pyarrow')
    
    # save maßnahme and maßnahme norm for meds
    measures_norm_groups = df_long.groupby("Maßnahme")
    mapping_dict = {}
    for group_name, group in measures_norm_groups:
        if "_Med" in group_name:
            measures = list(group["Maßnahme_norm"].unique())
            mapping_dict[group_name] = measures
    # save mapping dict
    import json
    with open("data/measure_norm_mapping_dict.json", "w") as f:
        json.dump(mapping_dict, f)
print("Data loaded!")

In [None]:
df_long.DB.unique()

In [None]:
# Determine measures that have few missing values - min, max and std are added for those.
# Count the rows per (DB, Maßnahme) pair and average these counts over the databases in
# which the measure occurs. Using size() avoids the nested groupby/apply, whose result
# shape depends on the data, and avoids averaging over the non-numeric "DB" column.
val_counts = (
    df_long.groupby(["DB", "Maßnahme"])
    .size()
    .groupby(level="Maßnahme")
    .mean()
    .rename(0)  # keep the Series name the previous implementation produced
)
val_counts

In [None]:
# select high count vitals 
quant = 200000 # chosen such that ICP and everything more frequent than it stays in #np.quantile(val_counts, 0.85)
print(quant)
mask = (val_counts > quant).astype(int) + val_counts.index.str.contains("_Vital").astype(int) #+  (~val_counts.index.str.contains("ICP")).astype(int)
mask = mask == 2
high_counts = val_counts[mask].sort_values(0)
print(high_counts)
high_counts = list(high_counts.index) + ["ICP_Vital"]

In [None]:
check_dupls = False

if check_dupls:
    for db in df_long.DB.unique():
        df_long_db = df_long[df_long["DB"] == db]
        first_5_vals = df_long_db.groupby("Pat_ID").apply(lambda pat: pat.sort_values("rel_time")["rel_time"].iloc[:5]).reset_index()
        arrays = []
        for pat_id in first_5_vals.Pat_ID.unique():
            array = first_5_vals[first_5_vals.Pat_ID == pat_id].values
            arrays.append(array)
            
        s = []
        for arr in arrays:
            s.append(tuple(arr.reshape(-1).astype(float).round(0)))
        s = set(s)
        
        if len(s) == len(arrays):
            print("No duplicates in " + db)

In [None]:
# Sort chronologically per patient first so that consecutive rows are consecutive
# measurements, then take the per-patient time difference. groupby().diff() keeps the
# index of icp_vals, so `diffs` can be used directly as a row mask on `icp_vals`.
icp_vals = (
    df_long[df_long["Maßnahme"] == "ICP_Vital"]
    .sort_values(["Pat_ID", "rel_time"])
    .reset_index()
)
diffs = icp_vals.groupby("Pat_ID")["rel_time"].diff()

In [None]:
max_diff = 24 * 60

large_diff_ids = icp_vals[diffs > max_diff].Pat_ID
len(large_diff_ids)

In [None]:
max_diff = 128 * 60

large_diff_ids = icp_vals[diffs > max_diff].Pat_ID.unique()
for pat_id in large_diff_ids:
    pat_vals = icp_vals[icp_vals["Pat_ID"] == pat_id]
    plt.figure()
    ax = plt.gca()
    pat_diffs = pat_vals.rel_time.diff()
    large_diffs = pat_diffs[pat_diffs > max_diff].reset_index()["rel_time"]
    times_of_diffs = pat_vals[pat_diffs > max_diff].reset_index()["rel_time"]
    for large_diff_time, difference in zip(times_of_diffs, large_diffs):
        # plot vertical line at time
        plt.axvline(large_diff_time, color="red")
        num_vals_above = sum(pat_vals["rel_time"] >= large_diff_time)
        num_vals_below = sum(pat_vals["rel_time"] < large_diff_time)
        print(pat_id, num_vals_above, num_vals_below)
    
    pat_vals.plot(x="rel_time", y="Wert", kind="scatter", ax=ax)
    plt.show()

In [None]:
# We see many regions where no ICP is measured at all. We can remove those and only take
# into account the regions where ICP is measured (+ 24 hours before).
max_time_diff = 24 * 60    # minutes; a larger gap between ICP measurements starts a new window
min_size = 2               # minimum number of ICP measurements a window must contain
time_before_icp = 24 * 60  # minutes of data kept before the first ICP measurement of a window

num_windows = []

def cut_into_windows(pat_data, icp_times, icp_time_diffs):
    """Split one patient's rows into windows separated by long gaps in ICP measurements.

    Reads the module-level settings ``max_time_diff``, ``min_size`` and
    ``time_before_icp``.

    Args:
        pat_data: All long-format rows of one patient (needs a "rel_time" column).
        icp_times: Chronologically sorted times of the patient's ICP measurements.
        icp_time_diffs: Difference of each ICP time to its predecessor, positionally
            aligned with ``icp_times`` (NaN for the first measurement).

    Returns:
        A list of DataFrames, one per window, each carrying a "window_id" column.
        Windows with fewer than ``min_size`` ICP measurements are omitted. The
        input frame is never modified.
    """
    icp_times = np.asarray(icp_times)
    # Positional boolean mask; NaN differences compare as False.
    is_large_gap = np.asarray(icp_time_diffs > max_time_diff)
    times_of_large_diffs = icp_times[is_large_gap]
    if len(times_of_large_diffs) == 0:
        # No gap: the whole patient is a single window.
        return [pat_data.assign(window_id=0)]

    # times_of_large_diffs holds the time of the first measurement AFTER each gap.
    # Each gap closes the current window at the last measurement before it and
    # opens the next window time_before_icp minutes ahead of the measurement after it.
    last_time = 0
    time_windows = []
    for time in times_of_large_diffs:
        # last ICP measurement before the gap
        measure_time_before = icp_times[icp_times < time][-1]
        # ICP measurements that fall into the window that ends here
        before = icp_times[(icp_times <= measure_time_before) & (icp_times >= last_time)]
        if len(before) >= min_size:
            time_windows.append((last_time, measure_time_before))
        last_time = time - time_before_icp

    # remainder after the last gap
    final_part = icp_times[icp_times >= last_time]
    if len(final_part) >= min_size:
        time_windows.append((last_time, final_part[-1]))

    windows = []
    for idx, (start, end) in enumerate(time_windows):
        in_window = (pat_data["rel_time"] >= start) & (pat_data["rel_time"] <= end)
        # assign() returns a new frame, so no column is written into a slice
        windows.append(pat_data[in_window].assign(window_id=idx))
    return windows

# NOTE(review): df_long_copy is not used by any visible cell; it is kept only in case a
# cell outside this view relies on it. It duplicates the full frame in memory.
df_long_copy = df_long.copy()
pats = []
pat_groups = df_long.groupby("Pat_ID")
for pat_id, pat_data in tqdm(pat_groups):
    # delete pat_mask'ed from df_long_copy to speed up processing for next pat_ids
    #df_long_copy = df_long_copy[~pat_mask]
    # Get the ICP measurements of this patient in chronological order. The gap
    # detection below (diff between consecutive rows) and cut_into_windows both
    # require sorted times, and the row order of df_long is not guaranteed.
    icp_measurements = pat_data[pat_data["Maßnahme"] == "ICP_Vital"].sort_values("rel_time")
    icp_times = icp_measurements.rel_time.values
    icp_time_diffs = icp_measurements.rel_time.diff()
    # cut into windows
    windows = cut_into_windows(pat_data, icp_times, icp_time_diffs)
    pats.extend(windows)

df_long_windows = pd.concat(pats)

In [None]:
#df_long_windows.to_parquet("data/df_long_windows.csv")

In [None]:
# compare df_long and df_long_windows 
print(len(df_long) - len(df_long_windows))
# compare number of ICP measurements in df_long and df_long_windows
print(len(df_long[df_long["Maßnahme"] == "ICP_Vital"]) - len(df_long_windows[df_long_windows["Maßnahme"] == "ICP_Vital"]))
print(df_long_windows.groupby("DB").apply(lambda db: len(db[db["Maßnahme"] == "ICP_Vital"])))

In [None]:
import pandas as pd

#df_long_windows = pd.read_csv("data/df_long_windows.csv")

In [None]:
# check windows lengths! longest should be at most 60 (minutes) * 128 (hours) = 7680 minutes
df_long_windows.groupby("Pat_ID").apply(lambda x: x.groupby("window_id").apply(lambda win: win[win["Maßnahme"] == "ICP_Vital"].rel_time.diff().max())).hist(bins=100)

In [None]:
df_long = df_long_windows

In [None]:
df_long_windows.groupby("DB").apply(lambda db: db.value_counts("window_id"))

In [None]:
# check distance between icp measurements per DB
for db in ["UKE", "MIMIC", "eICU"]:
    df_db = df_long[df_long["DB"] == db]
    
    plt.figure()
    diffs_db = df_db.groupby(["Pat_ID", "window_id"]).apply(lambda win: win[win["Maßnahme"] == "ICP_Vital"].sort_values("rel_time").rel_time.diff()).reset_index(drop=True)    
    diffs_db = diffs_db[diffs_db < 125]
    if db == "eICU":
        diffs_db = diffs_db[diffs_db < 10]
    # make normalized histogram
    diffs_db.hist(bins=100, alpha=1.0, label=db, density=True)
    plt.xlabel("Minutes")
    plt.ylabel("Count")
    plt.title(f"Minutes between ICP measurements for {db}")
    #plt.yscale("log")
    #plt.legend()
    os.makedirs("figures", exist_ok=True)
    plt.savefig(f"figures/{db}_icp_diffs.pdf")

In [None]:
from typing import Tuple

#import numba

minutes = 5

#@numba.jit()
def create_dict(values: np.ndarray, rel_time: np.ndarray, measure_name: str, high_counts: Tuple[str], minutes: int):
    """Aggregate consecutive measurements into bins of ``minutes`` length.

    Walks through ``values`` / ``rel_time`` in the given order. Each bin starts at
    the time of its first measurement rounded down to a multiple of ``minutes``
    and collects all directly following measurements that are earlier than the
    end of that bin.

    Returns:
        A tuple ``(means, stds, mins, maxs, rel_times)`` of lists with one entry
        per bin. ``stds``, ``mins`` and ``maxs`` are only filled when
        ``measure_name`` is contained in ``high_counts``; otherwise they stay empty.
    """
    bin_means, bin_stds, bin_mins, bin_maxs, bin_starts = [], [], [], [], []
    total = len(values)
    pos = 0
    while pos < total:
        # Snap the first measurement of the bin down onto the time grid.
        bin_start = rel_time[pos] - (rel_time[pos] % minutes)
        # Advance to the first measurement that no longer belongs to this bin.
        stop = pos
        while stop < total and rel_time[stop] < bin_start + minutes:
            stop += 1
        chunk = np.array([values[k] for k in range(pos, stop)])
        pos = stop

        bin_means.append(np.mean(chunk))
        bin_starts.append(bin_start)
        # Frequently measured features additionally get spread statistics.
        if measure_name in high_counts:
            bin_stds.append(np.std(chunk))
            bin_mins.append(np.min(chunk))
            bin_maxs.append(np.max(chunk))
    return bin_means, bin_stds, bin_mins, bin_maxs, bin_starts


def summarize_measure(measure, high_counts, minutes):
    """Condense all rows of one measure into one row per ``minutes``-long bin.

    ``measure`` holds the rows of a single "Maßnahme"; the binning itself is done
    by ``create_dict``. The identifying columns (Pat_ID, Maßnahme, Maßnahme_norm,
    DB, window_id) are copied from the first input row.

    Returns:
        A DataFrame with the per-bin mean in "Wert". If ``create_dict`` also
        returned spread statistics, the frame is stacked four times: once for the
        mean (original name) and once each for "<name>_std", "<name>_min" and
        "<name>_max".
    """
    name = measure["Maßnahme"].iloc[0]
    times = measure["rel_time"].to_numpy().astype(float)
    raw_values = measure["Wert"].to_numpy().astype(float)

    means, stds, mins, maxs, bin_times = create_dict(raw_values, times, name, high_counts, minutes)

    # Shorter frame: one row per bin, identifying columns taken from the first row.
    base = pd.DataFrame({"rel_time": bin_times, "Wert": means})
    for column in ("Pat_ID", "Maßnahme", "Maßnahme_norm", "DB", "window_id"):
        base[column] = measure[column].iloc[0]

    if not stds:
        return base

    # Extra statistics exist: emit one copy of the base frame per statistic.
    variants = {
        name: means,
        name + "_std": stds,
        name + "_min": mins,
        name + "_max": maxs,
    }
    stacked = [
        base.assign(**{"Maßnahme": label, "Wert": stat})
        for label, stat in variants.items()
    ]
    return pd.concat(stacked, axis=0)
            
def summarize_patient(pat, high_counts, minutes):
    """Apply ``summarize_measure`` to every "Maßnahme" group of ``pat`` and stack the results."""
    def _summarize(measure):
        return summarize_measure(measure, high_counts, minutes)

    per_measure = pat.groupby("Maßnahme").apply(_summarize)
    return per_measure.reset_index(drop=True)

df_long_summarized = df_long.groupby(["Pat_ID", "window_id"]).progress_apply(lambda pat: summarize_patient(pat.sort_values("rel_time", ascending=True), high_counts, minutes)).reset_index(drop=True)
# summarize per N minutes (5 min, 60 min) 
# take mean, min, max, std per Vital measure

In [None]:
print(df_long.groupby("DB").apply(lambda db: len(db[db["Maßnahme"] == "ICP_Vital"])))

In [None]:
print(df_long_summarized.groupby("DB").apply(lambda db: len(db[db["Maßnahme"] == "ICP_Vital"])))

In [None]:
count = 0
for _ in df_long.groupby(["Pat_ID", "window_id"]):
    count += 1
print("Total windows: ", count)

In [None]:
# norm med by maßnahme norm per db
meds = [feat for feat in df_long_summarized["Maßnahme"].unique() if "_Med" in feat]

meds

# not done anymore to avoid information leakage from test to train and also to avoid merging meds of different strengths into one
# instead we now replace the Maßnahme col by Maßnahme_norm for all meds 
def replace_measure(measure):
    """Rename a medication group to its normalized name.

    ``measure`` is one "Maßnahme" group. If the group's name is listed in the
    module-level ``meds``, a copy is returned whose "Maßnahme" column is
    "Maßnahme_norm" + "_Med"; every other group is returned unchanged.
    The passed group itself is never modified, because mutating the object handed
    to ``groupby.apply`` is not supported by pandas.
    """
    if measure["Maßnahme"].iloc[0] in meds:
        return measure.assign(**{"Maßnahme": measure["Maßnahme_norm"] + "_Med"})
    return measure
df_long_meds = df_long_summarized.groupby("Maßnahme").apply(lambda measure: replace_measure(measure)).reset_index(drop=True)

In [None]:
meds_now = [feat for feat in df_long_meds["Maßnahme"].unique() if "_Med" in feat]

meds, meds_now, len(meds), len(meds_now)

In [None]:
#icp_long = df_long[df_long["Maßnahme"] == "ICP_Vital"]
#print(icp_long.groupby("DB").apply(len))
#icp_long_summ = df_long_summarized[df_long_summarized["Maßnahme"] == "ICP_Vital"]
#print(icp_long_summ.groupby("DB").apply(lambda x: len(x)))
#icp_long_med_normed = df_long_normed_med[df_long_normed_med["Maßnahme"] == "ICP_Vital"]
#print(icp_long_med_normed.groupby("DB").apply(lambda x: len(x)))

In [None]:
# from long to wide
df = df_long_meds.pivot_table(index=["rel_time", "Pat_ID", "DB", "window_id"], columns="Maßnahme", values="Wert").reset_index()

In [None]:
df

In [None]:
#df.to_csv(f"data/df_wide_{minutes}.csv", index=False)

In [None]:
# Num ICP measurements per DB
print(len(df))
print(df.groupby("DB").apply(lambda x: len(x[~x["ICP_Vital"].isna()])))
print(df["ICP_Vital"].isna().mean())

In [None]:
# fill in rows with NAN where we have no measurement at all

#@numba.jit()
def add_nan_rows(arr: np.ndarray, rel_time: np.ndarray, minutes: int):
    """Insert all-NaN rows wherever consecutive time steps are more than ``minutes`` apart.

    Args:
        arr: 2-D array with one row per existing time step.
        rel_time: Time of each row of ``arr``, in ascending order.
        minutes: Expected spacing between consecutive rows.

    Returns:
        A 2-D array containing the rows of ``arr`` in order, with
        ``ceil(gap / minutes) - 1`` NaN rows inserted before each row that follows a gap.
    """
    num_feats = arr.shape[1]
    rows = []
    old_time = rel_time[0]
    for row, t in zip(arr, rel_time):
        # number of grid steps that are missing between the previous and the current row
        num_rows = int(np.ceil((t - old_time) / minutes)) - 1
        if num_rows > 0:
            rows.append(np.full((num_rows, num_feats), np.nan))
        rows.append(np.expand_dims(row, 0))
        old_time = t
    return np.concatenate(rows, axis=0)


# number of filler rows inserted per processed group (appended to by unroll_patient)
num_rows_added = []
def unroll_patient(pat, minutes):
    """Put one patient window onto a regular time grid, filling missing steps with NaN.

    Assumes ``pat`` holds a single patient window sorted by "rel_time". Appends the
    number of inserted rows to the module-level list ``num_rows_added``.

    Returns:
        A new DataFrame with one row per ``minutes`` step. "DB", "Pat_ID" and, if
        present, "window_id" are constant over all rows (including the inserted
        ones); all remaining inserted values are NaN.
    """
    # keep the unconverted times: they drive the gap detection and the new time grid
    rel_time = pat["rel_time"].to_numpy()
    db = pat["DB"].iloc[0]
    pat_id = pat["Pat_ID"].iloc[0]
    # DB and Pat_ID are taken out before the float conversion and re-attached below
    pat = pat.drop(columns=["DB", "Pat_ID"]).astype(np.float32)
    arr = pat.to_numpy()

    # array that contains NaN rows where time steps are missing
    new_array = add_nan_rows(arr, rel_time, minutes)
    num_rows_added.append(len(new_array) - len(arr))

    new_pat = pd.DataFrame(data=new_array, columns=pat.columns, index=np.arange(len(new_array)))
    new_pat["DB"] = db
    new_pat["Pat_ID"] = pat_id
    # Inserted rows are NaN in every column; restore the window id so that the
    # filler rows still belong to their window.
    if "window_id" in new_pat.columns:
        new_pat["window_id"] = pat["window_id"].iloc[0]
    # Rebuild the time axis from the first time step. Multiplying an integer range
    # always yields exactly len(new_pat) values, which a float-stepped arange does
    # not guarantee.
    new_pat["rel_time"] = rel_time[0] + minutes * np.arange(len(new_pat))
    return new_pat
    

df_filled = df.groupby(["Pat_ID", "window_id"]).progress_apply(lambda pat: unroll_patient(pat.sort_values("rel_time", ascending=True), minutes)).reset_index(drop=True)

In [None]:
print("Rows added: ", sum(num_rows_added))

In [None]:
import matplotlib.pyplot as plt
p = plt.hist(num_rows_added, bins=100)
plt.xlabel("Num consecutive missing steps")

In [None]:
len(num_rows_added)

In [None]:
#df_filled.to_csv(f"data/df_filled_{minutes}.csv", index=False)

In [None]:
#df_filled = pd.read_csv("data/df_filled_60.csv")

In [None]:
df_filled

In [None]:
# fill med missing with 0
df_med_filled = df_filled.copy()
med_col_names = [col for col in df_med_filled.columns if "_Med" in col]
df_med_filled[med_col_names] = df_med_filled[med_col_names].fillna(0)

In [None]:
# disabled for now as it reduces UKE performance a lot. Might be useful for better dataset transfer in the future
"""
# show values with highest std of NaNs between DBs
max_nan_frac_over_dbs = df_med_filled.groupby("DB").apply(lambda db: db.isna().mean()).std().sort_values().iloc[-10:]
max_nan_frac_over_dbs
# select highest ranking ones
drop_cols = ["PEEP_Vital", "Pmean_Vital", "Freq gesamt_Vital", "Ppeak_Vital", "FiO2_Vital", "Freq spontan_Vital"]
"""
pass

In [None]:
df_med_filled.groupby("DB").apply(lambda db: db.isna().mean())

In [None]:
# remove feature that have N% missing spots in at least one database
#threshold = 1 - (1 - 0.999) / (60 / minutes)  # make it such that it is 0.99 for 60 minutes and 0.999 for 5 minutes
#max_nan_frac_over_dbs = df_med_filled.groupby("DB").apply(lambda db: db.isna().mean()).max().sort_values()
#drop_cols = list(max_nan_frac_over_dbs[max_nan_frac_over_dbs > threshold].index)
#df_drop_too_missing = df_med_filled.drop(columns=drop_cols)
#print("Dropped: ", drop_cols, " - ", len(drop_cols))

In [None]:
# get patient data such as gender etc
df_static = pd.read_csv("data/Datenbank_Pat_ID.csv")
df_static = df_static[["Pat_ID", "Alter", "Diagnose_txt", "Geschlecht", "Größe", "Gewicht"]]
df_static = df_static.rename(columns={"Diagnose_txt": "Diagnose"})
# one-hot encode
df_static = pd.get_dummies(df_static, columns=["Diagnose", "Geschlecht"])
df_static = df_static.drop(columns=["Geschlecht_Weiblich"])
df_static = df_static.rename(columns={"Geschlecht_Männlich": "Geschlecht"})
# drop nan Pat_ID
df_static = df_static.dropna(subset=["Pat_ID"])

In [None]:
# merge
df_med_filled["Pat_ID"] = df_med_filled["Pat_ID"].astype(int)
df_static["Pat_ID"] = df_static["Pat_ID"].astype(int)
df_large = pd.merge(df_med_filled, df_static, on="Pat_ID", how="left")

In [None]:
# one-hot-encode DB
df_large = pd.get_dummies(df_large, columns=["DB"])

In [None]:
#path = f"data/df_final_{minutes}.csv"
#df_large.to_csv(path, index=False)
#print("Saved to: ", path)

In [None]:
#df_large = pd.read_csv(path)

In [None]:
#df_large.isna().mean().sort_values().iloc[-30:]

In [None]:
df_large

In [None]:
# Per database: measure the fraction of rows in which each medication column is zero
# and blank out medications that are (almost) never given in that database.
for db in ["DB_UKE", "DB_MIMIC", "DB_eICU"]:
    db_df = df_large[df_large[db] == 1]
    # fraction of zero entries per medication column, ascending (most used first)
    sorted_meds = (db_df[med_col_names] == 0).mean().sort_values()
    print(db)
    print(sorted_meds.iloc[:20])
    
    too_many_zero_cols = list(sorted_meds[sorted_meds >= 0.995].index)
    #print(too_many_zero_cols)
    # per DB, set med values to NaN if at least 99.5% of them are zero
    df_large.loc[df_large[db] == 1, too_many_zero_cols] = np.nan
    print()

In [None]:
df_large.isna().mean().sort_values()

In [None]:
df_large[df_large["DB_UKE"] == 0].isna().mean().sort_values()

In [None]:
# set feats completely to NaN per DB that now have too many NaNs
threshold = 0.995

for db in ["DB_UKE", "DB_MIMIC", "DB_eICU"]:
    db_df = df_large[df_large[db] == 1]
    missing_per_col = db_df.isna().mean().sort_values()
    # ignore completely missing ones
    missing_per_col = missing_per_col[missing_per_col < 1.0]
    print(db)
    print(missing_per_col.iloc[-5:])
    print()
    drop_cols = list(missing_per_col[missing_per_col > threshold].index)
    df_large.loc[df_large[db] == 1, drop_cols] = np.nan

In [None]:
# remove correlated feats per database. Set dropped values to NaN
df_corr = df_large.copy()

threshold = 0.90

for db in ["DB_UKE", "DB_MIMIC", "DB_eICU"]:
    db_df = df_corr[df_corr[db] == 1]
    corr_matrix = db_df.corr()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print(db)
    print(to_drop)
    print()
    df_corr.loc[df_corr[db] == 1, to_drop] = np.nan

# drop columns that are now completely NaN
# first print them
print(df_corr.isna().mean().sort_values().iloc[-10:])
# then drop them
df_corr = df_corr.dropna(axis=1, how="all")

In [None]:
def add_split_idcs(df, test_size=0.2):
    """Assign every patient to "train", "val" or "test" and store it in ``df["split"]``.

    Patients are first split into a development and a test part with
    ``make_split``; the development part is split again into train and
    validation. Both splits use ``test_size`` and ``seed=1``.

    Args:
        df: Frame with a "Pat_ID" column (plus whatever ``make_split`` needs).
            Rows with a missing "Pat_ID" are not part of any patient group and
            end up labelled "test".
        test_size: Fraction passed to both ``make_split`` calls.

    Returns:
        The same ``df`` object, with the added "split" column.
    """
    # One pass over the frame instead of one boolean scan per patient. groupby
    # yields the patients in ascending Pat_ID order.
    groups = list(df.groupby("Pat_ID", sort=True))
    pat_ids = np.array([pat_id for pat_id, _ in groups])
    seq_list = [pat_df for _, pat_df in groups]

    dev_data, _, dev_idcs, test_idcs = make_split(seq_list, test_size=test_size, seed=1)
    _, _, train_idcs, val_idcs = make_split(dev_data, test_size=test_size, seed=1)

    # The second split indexes into dev_data; map back to positions in seq_list
    # and from there to patient ids.
    train_pat_ids = set(pat_ids[dev_idcs[train_idcs]])
    val_pat_ids = set(pat_ids[dev_idcs[val_idcs]])

    def assign_split_name(pat_id):
        if pat_id in train_pat_ids:
            return "train"
        elif pat_id in val_pat_ids:
            return "val"
        else:
            # everything that is neither train nor val belongs to the test part
            return "test"

    df["split"] = df["Pat_ID"].apply(assign_split_name)
    return df

In [None]:
from sklearn.model_selection import train_test_split

def create_seq_labels(seq_list, target_name="ICP_Vital"):
    """Build a two-character stratification label for every sequence.

    The first character is "1" if the sequence is shorter than the median
    sequence length, the second is "1" if the mean of its non-missing
    ``target_name`` values is below the median of these means; otherwise the
    respective character is "0".
    """
    lengths = [len(seq) for seq in seq_list]
    target_means = [seq[target_name].dropna().mean() for seq in seq_list]
    median_len = np.median(lengths)
    median_target = np.median(target_means)
    return [
        str(int(length < median_len)) + str(int(target < median_target))
        for length, target in zip(lengths, target_means)
    ]

def make_split(seq_list, test_size=0.2, seed=1):
    """Split ``seq_list`` into two parts, stratified by the labels of ``create_seq_labels``.

    Returns:
        ``(first_data, second_data, first_idcs, second_idcs)`` where the index
        arrays hold the positions of the selected sequences within ``seq_list``
        and the second part makes up ``test_size`` of the data.
    """
    positions = np.arange(len(seq_list))
    strata = create_seq_labels(seq_list)
    first_data, second_data, first_idcs, second_idcs = train_test_split(
        seq_list,
        positions,
        test_size=test_size,
        random_state=seed,
        shuffle=True,
        stratify=strata,
    )
    return first_data, second_data, first_idcs, second_idcs


In [None]:
import os

#from icp_pred.data_utils import make_split
db_cols = [col for col in df_corr.columns if "DB_" in col]
for db_name in db_cols:
    print(db_name)
    db_df = df_corr[df_corr[db_name] == 1].drop(columns=db_cols)
    # drop columns that are completely NaN
    db_df = db_df.dropna(axis=1, how="all")
    # drop columns that are completely zero; copy so that the "split" column below
    # is added to an independent frame and not to a slice
    mean_zeros = (db_df == 0).mean()
    db_df = db_df[list(mean_zeros[mean_zeros < 1.0].index)].copy()
    print(db_df.shape)
    print("ICP measurements in DB: ", (~db_df["ICP_Vital"].isna()).sum())
    # make splits if 60 minutes else take split from 60 minutes
    if minutes == 60:
        db_df = add_split_idcs(db_df, test_size=0.2)
    else:
        ref_df = pd.read_parquet(f"data/{db_name}_60_final_df.parquet", columns=["Pat_ID", "split"])
        # split of each patient = split of its first row in the 60-minute reference
        mapping = ref_df.drop_duplicates("Pat_ID").set_index("Pat_ID")["split"]
        db_df["split"] = db_df["Pat_ID"].map(mapping)
        unknown = db_df.loc[db_df["split"].isna(), "Pat_ID"].unique()
        if len(unknown) > 0:
            # same exception type as a failed dictionary lookup, but naming the culprits
            raise KeyError(f"Patients without split in the 60 minute reference: {list(unknown)}")
    # save it; save_path names the file that is actually written
    save_path = f"data/{db_name}_{minutes}_final_df.parquet"
    os.makedirs("data/", exist_ok=True)
    db_df.to_parquet(save_path)

In [None]:
save_path

In [None]:
(db_df["split"] == "train").sum()

In [None]:
list(db_df.columns)

In [None]:
# check icp distributions
db_df[db_df["split"] == "train"]["ICP_Vital"].hist(bins=100, density=True, alpha=0.5)
db_df[db_df["split"] == "val"]["ICP_Vital"].hist(bins=100, density=True, alpha=0.5)
db_df[db_df["split"] == "test"]["ICP_Vital"].hist(bins=100, density=True, alpha=0.5)

In [None]:
vals = db_df["ICP_Vital"]

vals[vals < 0].hist(bins=100)

In [None]:
vals.quantile(0.001)

In [None]:
len(vals)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# check distributions
# uke_df is selected here so that this cell also works on a fresh kernel; the cell
# that originally defines it comes later in the notebook.
uke_df = df_large[df_large["DB_UKE"] == 1]
p = uke_df.hist(bins=100, figsize=(15, 15))
plt.tight_layout()

In [None]:
from sklearn.preprocessing import PowerTransformer

transform = PowerTransformer(method='yeo-johnson', standardize=True)

In [None]:
uke_df = df_large[df_large["DB_UKE"] == 1]

In [None]:
uke_arr = uke_df.to_numpy()

In [None]:
transform.fit(uke_arr)

In [None]:
np.round(transform.lambdas_, 1)

In [None]:
transformed_arr = transform.transform(uke_arr)

In [None]:
lambdas_ = transform.lambdas_
mask = np.abs(lambdas_) > 5
print(lambdas_[mask])

In [None]:
transformed_df = pd.DataFrame(transformed_arr, columns=df_large.columns)#, index=df_large.index)

In [None]:
feat = "sO2_BGA"

In [None]:
idx = list(transformed_df.columns).index(feat)
lambdas_[idx]

In [None]:
uke_df[feat].describe()

In [None]:
transformed_df[feat].describe()

In [None]:
uke_df[feat].unique()

In [None]:
transformed_df[feat].unique()

In [None]:
uke_df[feat].hist(bins=100)

In [None]:
transformed_df[feat].hist(bins=100)

In [None]:
import numba
import numpy as np

@numba.jit()
def ema_fill(pat: np.ndarray, ema_val: float, mean: np.ndarray):
    """Replace every time step of ``pat`` by an exponential moving average.

    Args:
        pat: Array that is iterated row by row; each row is one time step.
            # assumes a 2-D float array (time steps x features) - TODO confirm
        ema_val: Weight of the previous average; ``1 - ema_val`` is the weight
            of the current step.
        mean: Per-feature fallback used to initialise the average where the
            first row of ``pat`` is NaN.

    Returns:
        Array shaped like ``pat`` holding the running average after each step.
    """
    # init ema with the first row; NaN entries start from the given fallback
    ema = np.ones_like(pat[0]) * pat[0]
    ema[np.isnan(ema)] = mean[np.isnan(ema)]
    # run ema
    ema_steps = np.ones_like(pat)
    for i, pat_step in enumerate(pat):
        # Missing values enter the update as 0 (they are not skipped).
        # NOTE(review): pat_step is taken from `pat`, so this assignment presumably
        # also overwrites the NaNs in the caller's array - confirm whether intended.
        pat_step[np.isnan(pat_step)] = 0
        ema = ema_val * ema + (1 - ema_val) * pat_step
        ema_steps[i] = ema.copy()
    return ema_steps

In [None]:
mean = uke_df.mean().to_numpy()

In [None]:
uke_filled = uke_df.groupby("Pat_ID").apply(lambda pat: pd.DataFrame(ema_fill(pat.sort_values("rel_time").to_numpy(), 0.9, mean), columns=pat.columns))

In [None]:
def four_parts(df):
    """Return the column means of the four consecutive quarters of ``df``.

    The quarter boundaries are the row positions ``len // 4``, ``len // 2`` and
    ``len * 3 // 4``. The result has one row per quarter and the columns of ``df``.
    """
    n_rows = len(df)
    edges = [0, n_rows // 4, n_rows // 2, n_rows * 3 // 4, n_rows]
    quarter_means = [df.iloc[lo:hi].mean() for lo, hi in zip(edges[:-1], edges[1:])]
    return pd.DataFrame(quarter_means, columns=df.columns)

In [None]:
umap_arr_red = umap_norm.reset_index(drop=True).groupby("Pat_ID").apply(four_parts).reset_index(drop=True).dropna()

In [None]:
umap_arr_dropped = umap_arr_red.drop(columns=["DB_MIMIC", "DB_UKE", "DB_eICU", "Pat_ID", "ICP_Vital"]).to_numpy()

In [None]:
from sklearn.preprocessing import PowerTransformer

def apply_yeo(df, thresh=50, lambs=None):
    """Apply a Yeo-Johnson power transform to the feature columns of ``df``.

    The columns "DB_MIMIC", "DB_UKE", "DB_eICU", "Pat_ID", "ICP_Vital" and
    "rel_time" are left untouched and appended after the transformed features.

    Args:
        df: Frame containing the feature columns and the six columns named above.
        thresh: Only used when the lambdas are fitted here: features whose fitted
            lambda exceeds ``thresh`` in absolute value get lambda 1 instead.
        lambs: Lambdas to apply. If None, they are fitted on ``df``.

    Returns:
        ``(transformed_df, lambs)``. ``transformed_df`` keeps the row index of ``df``.
    """
    transform = PowerTransformer(method='yeo-johnson', standardize=False)
    drop_cols = ["DB_MIMIC", "DB_UKE", "DB_eICU", "Pat_ID", "ICP_Vital", "rel_time"]
    dropped = df.drop(columns=drop_cols)
    arr = dropped.to_numpy()
    if lambs is None:
        # fit the lambdas on this frame
        transform.fit(arr)
        lambs = transform.lambdas_
        mask = np.abs(lambs) > thresh
        print(df["DB_MIMIC"].sum(), df["DB_UKE"].sum(), df["DB_eICU"].sum())
        print(mask.sum())
        print(np.round(lambs, 1))
        print()
        # replace extreme lambdas by 1
        lambs[mask] = 1
    # NOTE(review): when `lambs` is supplied the transformer is never fitted; only
    # lambdas_ is set before transform() - confirm the installed scikit-learn accepts this.
    transform.lambdas_ = lambs
    trans_arr = transform.transform(arr)
    # Merge back. The transformed values must carry the index of `df`; with a fresh
    # RangeIndex the column-wise concat would align rows by label and pair
    # transformed rows with the wrong (or no) untouched columns.
    transformed = pd.DataFrame(trans_arr, columns=dropped.columns, index=df.index)
    df = pd.concat([transformed, df[drop_cols]], axis=1)
    return df, lambs

In [None]:
def create_umap_df(df, yeo=False, lambs=None, mean=None, std=None, thresh=10):
    """Prepare one frame for the embedding: transform, fill, normalise, reduce.

    Steps: optional ``apply_yeo``; per-patient ``ema_fill`` (decay 0.9, initialised
    with the column medians); normalisation with ``mean`` / ``std``; reduction of
    every patient to four rows via ``four_parts``; rows containing NaN are dropped.

    Args:
        df: Frame with at least "Pat_ID" and "rel_time".
        yeo: Whether to apply ``apply_yeo`` first.
        lambs, thresh: Forwarded to ``apply_yeo``.
        mean, std: Normalisation statistics. If ``mean`` is None both are
            computed from the filled frame (a passed ``std`` is then ignored).

    Returns:
        The reduced frame with a fresh integer index.
    """
    # apply yeo (the lambdas returned by apply_yeo are discarded)
    if yeo:
        df, _ = apply_yeo(df, lambs=lambs, thresh=thresh)
    # calc median
    median = df.median().to_numpy()
    # fill using ema
    # NOTE(review): all columns, including "Pat_ID", are passed through ema_fill.
    df_filled = df.groupby("Pat_ID").apply(lambda pat: pd.DataFrame(ema_fill(pat.sort_values("rel_time").to_numpy(), 0.9, median), columns=pat.columns)).reset_index(drop=True)
    # calc mean
    if mean is None:
        mean = df_filled.mean()
        std = df_filled.std()
        # constant columns stay unchanged: subtract 0, divide by 1
        mean[std == 0] = 0
        std[std == 0] = 1
    # normalise
    # NOTE(review): "Pat_ID" is normalised as well and then used as the grouping key below.
    df_norm = (df_filled - mean) / std
    # average over four regions per Pat
    df_red = df_norm.groupby("Pat_ID").apply(four_parts).reset_index(drop=True).dropna()
    return df_red

In [None]:
yeo = False
use_train_stats = False
thresh = 10

lambs = None
mean = None
std = None
    
if use_train_stats:
    train_df = df_large[df_large["DB_UKE"] == 1]
    if yeo:
        # calc lambdas for train dataset
        #train_df_filled = train_df.groupby("Pat_ID").apply(lambda pat: pd.DataFrame(ema_fill(pat.astype(float).sort_values("rel_time").to_numpy(), 0.9, train_df.median()), columns=pat.columns)).reset_index(drop=True)
        train_df, lambs = apply_yeo(train_df, thresh=thresh)
         # calc median
        median = train_df.median().to_numpy()
        # fill using ema
        train_df = train_df.groupby("Pat_ID").apply(lambda pat: pd.DataFrame(ema_fill(pat.sort_values("rel_time").to_numpy(), 0.9, median), columns=pat.columns)).reset_index(drop=True)
    
    # calc mean
    mean = train_df.mean().to_numpy()
    std = train_df.std()
    mean[std == 0] = 0
    std[std == 0] = 1


grouper = df_large["DB_UKE"] * 1 + df_large["DB_MIMIC"] * 2 + df_large["DB_eICU"] * 3
df_red = df_large.groupby(grouper).apply(lambda db: create_umap_df(db, yeo=yeo, lambs=lambs, mean=mean, std=std, thresh=thresh))
#df_red = transformed_df.groupby(grouper).apply(create_umap_df)

In [None]:
if std is not None:
    std.sort_values()

In [None]:
# Translate the one-hot database columns into one readable label per row. Mapping
# into a new Series avoids writing strings into a numeric Series through a mask.
# Rows whose code is none of 1/2/3 get NaN.
db_codes = df_red["DB_UKE"] * 1 + df_red["DB_MIMIC"] * 2 + df_red["DB_eICU"] * 3
db_values = db_codes.map({1: "UKE", 2: "MIMIC", 3: "eICU"})

In [None]:
# drop some cols and to numpy
drop_cols = ["DB_MIMIC", "DB_UKE", "DB_eICU", "Pat_ID", "ICP_Vital"]
#drop_cols.extend([col for col in df_red.columns if "_Med" in col])
#drop_cols.extend([col for col in df_red.columns if "Diagnose" in col])
#drop_cols.extend([col for col in df_red.columns if "BGA" in col])
#drop_cols.extend([col for col in df_red.columns if "Labor" in col])
#drop_cols.extend([col for col in df_red.columns if "Vital" in col])

#print(drop_cols)
dropped = df_red.drop(columns=drop_cols)
#print(dropped.columns)
arr = dropped.to_numpy()

In [None]:
df_used.groupby(grouper).apply(lambda db: db["Cl_BGA"].mean())

In [None]:
df_used.groupby(grouper).apply(lambda db: db["Cl_BGA"].std())

In [None]:
dropped.groupby(db_values).apply(lambda db: db["Cl_BGA"].std())

In [None]:
dropped.groupby(db_values).apply(lambda db: db["Cl_BGA"].mean())

In [None]:
dropped["Cl_BGA"].std()

In [None]:
from umap import UMAP
import umap.plot

# 2-D UMAP embedding of the feature matrix.
# The layout is stochastic; a fixed random_state makes it reproducible across kernel
# restarts. NOTE: umap-learn documents that a fixed seed limits parallelism, so fitting
# may be slower.
umapper = UMAP(n_components=2, n_neighbors=15, random_state=42)
# Fit on `arr` and keep the embedded coordinates.
umapped_arr = umapper.fit_transform(arr)

In [None]:
from sklearn.decomposition import PCA
# PCA with default settings (no n_components given).
mapper = PCA()

# Fit on the feature matrix `arr` and project it onto the principal components.
pca_arr = mapper.fit_transform(arr)
# Display the shape of the projected array.
pca_arr.shape

In [None]:
# Scatter plot of the first two principal components, one colour per source database.
# The red/blue/green mapping used to be built and then immediately overwritten by
# `colors = df_red["DB_UKE"]`, which left the mapping as dead code and reduced the plot
# to "UKE vs. everything else". To get that binary view back, pass c=df_red["DB_UKE"].
db_palette = {"UKE": "red", "MIMIC": "blue", "eICU": "green"}
# labels without a palette entry are drawn in gray instead of breaking the plot
colors = db_values.map(lambda name: db_palette.get(name, "gray"))

plt.figure(figsize=(15, 7))
plt.scatter(pca_arr[:, 0], pca_arr[:, 1], c=colors.tolist(), s=1.5)
plt.xlabel("PC 1")
plt.ylabel("PC 2")

In [None]:
# Line plot of the explained variance of each fitted PCA component, in component order.
plt.plot(mapper.explained_variance_)

In [None]:
# Index of the principal component to inspect (also read by the following cell).
comp_idx = 0
# Bar chart of that component's weight for every input feature, in column order of `arr`.
plt.bar(range(len(mapper.components_[comp_idx])), mapper.components_[comp_idx])
# Sum of the component's weights.
print(mapper.components_[comp_idx].sum())

In [None]:
# Feature with the strongest weight in the selected principal component.
# The sign of a PCA loading is arbitrary, so features are ranked by absolute weight;
# a plain argmax would overlook a dominant feature that happens to load negatively.
max_comp_idx = np.argmax(np.abs(mapper.components_[comp_idx]))
print(max_comp_idx)
# Column name that belongs to this position in the feature matrix.
print(dropped.columns[max_comp_idx])

In [None]:
# Standard deviation of Temp_Vital per database label, using the built-in column
# aggregation instead of a per-group Python lambda.
dropped.groupby(db_values)["Temp_Vital"].std()

In [None]:
#help(UMAP)

In [None]:
#umap.plot.points(umapper, theme="fire", values=df_red["Pupille re_Vital"])

In [None]:
# Plot the fitted UMAP embedding, coloured by the Phosphat_Labor column of df_red.
umap.plot.points(umapper, theme="fire", values=df_red["Phosphat_Labor"])

In [None]:
# Run label: "yeo all".
# NOTE(review): presumably the kept output was produced with the Yeo transform enabled
# and no extra column group dropped — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Plots the fitted UMAP embedding with one colour per database label.
# NOTE(review): this call is repeated verbatim in several cells and this copy carries no
# run label, so the configuration behind its output cannot be told from the code.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "yeo all" (a second cell with this label).
# NOTE(review): presumably a repeated run with the Yeo transform enabled and no extra
# column group dropped — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "all".
# NOTE(review): presumably all column groups kept and no Yeo transform — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "all yeo".
# NOTE(review): presumably all column groups kept, with the Yeo transform enabled — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "only static".
# NOTE(review): presumably produced after extending drop_cols so that only static
# (non time-series) columns remained in `arr` — confirm which groups were dropped.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "only vital".
# NOTE(review): presumably produced after extending drop_cols so that only the "Vital"
# columns remained in `arr` — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "no med + no diag".
# NOTE(review): presumably produced with the "_Med" and "Diagnose" columns added to
# drop_cols — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Run label: "no med".
# NOTE(review): presumably produced with the "_Med" columns added to drop_cols — confirm.
# Plots the fitted UMAP embedding with one colour per database label.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Plots the fitted UMAP embedding with one colour per database label.
# NOTE(review): verbatim repeat of the preceding plotting cells without a run label;
# the configuration behind its output cannot be told from the code.
umap.plot.points(umapper, theme="fire", labels=db_values)

In [None]:
# Zero-based position of the Na_BGA column within transformed_df.
transformed_df.columns.tolist().index("Na_BGA")

In [None]:
# Histograms (100 bins each) of the columns of transformed_df selected by `mask`.
# NOTE(review): `mask` is not defined in this cell; it must come from an earlier cell — confirm.
# The result is bound to `p`, so the cell ends without a trailing expression.
p = transformed_df.loc[:, mask].hist(figsize=(13, 13), bins=100)

In [None]:
# Select the columns of transformed_df whose standard deviation is exactly 0.
transformed_df.loc[:, transformed_df.std() == 0]

In [None]:
# Summary statistics of the sO2_BGA column of transformed_df.
transformed_df["sO2_BGA"].describe()

In [None]:
# 99.99 % quantile of sO2_BGA in transformed_df, ignoring missing values.
np.nanquantile(transformed_df["sO2_BGA"], 0.9999)

In [None]:
# 99 % quantile of sO2_BGA in df_large, ignoring missing values.
np.nanquantile(df_large["sO2_BGA"], 0.99)

In [None]:
# Summary statistics of the sO2_BGA column of df_large.
df_large["sO2_BGA"].describe()

In [None]:
# Column maxima of transformed_df, restricted to the columns whose maximum exceeds 5.
transformed_df.max().loc[lambda col_max: col_max > 5]

In [None]:
# Display the columns of transformed_df selected by `mask`.
# NOTE(review): `mask` is not defined in this cell; it must come from an earlier cell — confirm.
transformed_df.loc[:, mask]

In [None]:
# Histogram (100 bins) of the Na_BGA column of transformed_df ...
transformed_df["Na_BGA"].hist(bins=100)
# ... followed by its summary statistics, which are the cell's displayed value.
transformed_df["Na_BGA"].describe()

In [None]:
# Histograms (100 bins each) of the columns of transformed_df on one 15x15 figure.
# The result is bound to `p`, so it is not echoed as cell output.
p = transformed_df.hist(bins=100, figsize=(15, 15))
# Re-space the subplots after drawing.
plt.tight_layout()

In [None]:
# Number of rows in df_large that carry an ICP_Vital value.
df_large["ICP_Vital"].notna().sum()

In [None]:
# Number of rows in df_large that carry an ICP_Vital value.
df_large["ICP_Vital"].notna().sum()

In [None]:
# Total number of rows in df_large.
df_large.shape[0]

In [None]:
# Number of MIMIC rows (DB_MIMIC == 1) in df_large that carry an ICP_Vital value.
df_large.loc[df_large["DB_MIMIC"] == 1, "ICP_Vital"].notna().sum()

In [None]:
# Number of MIMIC rows (DB_MIMIC == 1) in df_large.
len(df_large.loc[df_large["DB_MIMIC"] == 1, "ICP_Vital"])

In [None]:
# 5 min: 3093093 non-NaN ICP_Vital values in total, 154014 of them in MIMIC
# total length: xxx, MIMIC: 2118712

In [None]:
# 60 min: 585353 non-NaN ICP_Vital values in total, 128303 of them in MIMIC
# total steps: 735951, MIMIC: 177102

In [None]:
# Fraction of rows in test_df whose Diagnose_Tumor value is missing.
# NOTE(review): in this part of the notebook test_df is only assigned in a later cell
# (pd.read_csv("data/df_final_5.csv")); unless an earlier cell defines it, this fails on
# a fresh Restart & Run All — confirm the intended cell order.
test_df["Diagnose_Tumor"].isna().mean()

In [None]:
# Fraction of rows in test_df whose Alter value is missing.
# NOTE(review): in this part of the notebook test_df is only assigned in a later cell
# (pd.read_csv("data/df_final_5.csv")); unless an earlier cell defines it, this fails on
# a fresh Restart & Run All — confirm the intended cell order.
test_df["Alter"].isna().mean()

In [None]:
# Load data/df_final_5.csv into test_df.
# NOTE(review): the two cells directly above already read test_df, so this load presumably
# belongs before them — confirm.
test_df = pd.read_csv("data/df_final_5.csv")

In [None]:
# Number of MIMIC rows (DB_MIMIC == 1) in test_df.
len(test_df.loc[test_df["DB_MIMIC"] == 1, "ICP_Vital"])

In [None]:
# Number of MIMIC rows (DB_MIMIC == 1) in test_df that carry an ICP_Vital value.
test_df.loc[test_df["DB_MIMIC"] == 1, "ICP_Vital"].notna().sum()

In [None]:
# Number of MIMIC rows (DB_MIMIC == 1) in df_large that carry an ICP_Vital value.
df_large.loc[df_large["DB_MIMIC"] == 1, "ICP_Vital"].notna().sum()

In [None]:
# Histogram (100 bins) of ICP_Vital over the MIMIC rows (DB_MIMIC == 1) of df_large.
df_large[df_large["DB_MIMIC"] == 1]["ICP_Vital"].hist(bins=100)

In [None]:
# One array of distinct patient IDs per database, ordered by the first appearance of each
# database in df_long.
pat_ids = [
    df_long.loc[df_long["DB"] == db_name, "Pat_ID"].unique()
    for db_name in df_long["DB"].unique()
]

In [None]:
# Register every patient ID of the first database; the value 1 is only a presence marker.
id_dict = dict.fromkeys(pat_ids[0], 1)
    

In [None]:
# Report patient IDs of the second database that were already registered in id_dict.
for id_ in pat_ids[1]:
    if id_ in id_dict:
        print(id_, "is duplicate!")
    # Register every ID, not only the duplicates. The assignment used to sit inside the
    # `if`, so new IDs of this database were never recorded and a clash between this
    # database and a later one could not be detected.
    id_dict[id_] = 1

In [None]:
# Report patient IDs of the third database that were already registered in id_dict.
for id_ in pat_ids[2]:
    if id_ in id_dict:
        print(id_, "is duplicate!")
    # Register every ID, not only the duplicates (inside the `if` the assignment was a
    # no-op), so id_dict ends up holding the IDs of this database as well.
    id_dict[id_] = 1

In [None]:
# Count the +/-inf entries in df_large.
# NOTE(review): np.isinf needs a numeric array; presumably every column of df_large is
# numeric at this point — confirm.
np.isinf(df_large.to_numpy()).sum()

In [None]:
# Inspect one database in detail: keep the rows flagged DB_UKE == 1 ...
clinic = df_large.loc[df_large["DB_UKE"] == 1]
# ... and collect the distinct patient IDs found in them.
pat_ids = clinic["Pat_ID"].unique()

In [None]:
# Number of entries currently held in pat_ids.
len(pat_ids)

In [None]:
# Fraction of rows in `clinic` whose ICP_Vital value is missing.
clinic["ICP_Vital"].isna().mean()

In [None]:
# Position of the patient to inspect; a negative index counts from the end of pat_ids.
pat_idx = -500

pat_id = pat_ids[pat_idx]
# All rows of `clinic` that belong to this patient.
pat = clinic.loc[clinic["Pat_ID"] == pat_id]

In [None]:
# ICP_Vital over rel_time for the selected patient, first as individual points ...
pat.plot.scatter(x="rel_time", y="ICP_Vital")
# ... then as a connected line on a second figure.
pat.plot(x="rel_time", y="ICP_Vital")

In [None]:
#pat["HF_Vital"].plot()

In [None]:
# Rows of the long-format table that come from the UKE database.
uke = df_long.loc[df_long["DB"] == "UKE"]

In [None]:
# UKE rows whose normalised measurement name is FiO2.
uke.loc[uke["Maßnahme_norm"] == "FiO2"]

In [None]:
# Fraction of missing values per column, computed separately for every database.
# The lambda must accept the group frame handed over by `apply`; the previous
# zero-argument `lambda:` raised a TypeError as soon as it was called.
nan_frac_per_db = df_filled.groupby("DB").apply(lambda db: db.isna().mean())

In [None]:
# Display the per-database NaN fractions stored in nan_frac_per_db.
nan_frac_per_db

In [None]:
# Per column: print the fraction of missing values in each database and flag the column
# when at least one database is missing it (almost) entirely.
for col in df_filled.columns:
    # Aggregate only this column's NaN mask by database instead of running a
    # groupby/apply over the whole frame for every single column.
    nan_means = df_filled[col].isna().groupby(df_filled["DB"]).mean()
    print(col)
    print(nan_means)
    print()
    # more than 99 % missing in some database -> candidate for removal
    if max(nan_means) > 0.99:
        print("Drop: ", col)

In [None]:
# Mean of FiO2_BGA per database, using the built-in column aggregation instead of a
# per-group Python lambda.
df_filled.groupby("DB")["FiO2_BGA"].mean()

In [None]:
# Mean of FiO2_Vital per database, using the built-in column aggregation instead of a
# per-group Python lambda.
df_filled.groupby("DB")["FiO2_Vital"].mean()

In [None]:
# Standard deviation of FiO2_Vital per database, using the built-in column aggregation
# instead of a per-group Python lambda.
df_filled.groupby("DB")["FiO2_Vital"].std()

In [None]:
# Display the FiO2_BGA column of df_large.
df_large["FiO2_BGA"]

In [None]:
# FiO2 does not exist in UKE