In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
def load_csv(path, base_dir='diabetes'):
    """Read a CSV file that lives under ``base_dir`` into a DataFrame.

    Parameters
    ----------
    path : str
        File name (or relative path) of the CSV inside ``base_dir``.
    base_dir : str, optional
        Directory that holds the data files. Defaults to ``'diabetes'``,
        so existing single-argument calls keep their behaviour.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    full = os.path.join(base_dir, path)
    return pd.read_csv(full)

In [None]:
# Load IDs_mapping.csv and display it; the result is not stored in a variable.
load_csv('IDs_mapping.csv')

In [None]:
# Lift the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

# Raw data set as read from disk; later cells work on a copy of it.
original = load_csv('diabetic_data_balanced.csv')
original.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
original.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
original.describe(include='all')

In [None]:
# True if at least one row has a value other than 'No' in this column.
(original['metformin-pioglitazone'] != 'No').any()

## Data manipulation

### Missing values & redundant columns
* Drop `weight`
* Drop `payer_code`
* Create an `unknown` group for missing `medical_specialty` and `race` values
* Get rid of singular columns
* Get rid of columns that have nothing to do with prediction

In [None]:
# Work on a copy so the raw `original` frame is left unmodified.
df = original.copy()

def get_na_info(df):
    """Count missing values per column, keeping only columns that have any.

    Returns a Series indexed by column name whose values are the number of
    NA entries in that column; columns without missing values are omitted.
    """
    na_counts = df.isna().sum()
    return na_counts.loc[na_counts > 0]

# Placeholder category used for missing categorical values. It is defined
# here because no earlier cell defines it, so a fresh kernel would otherwise
# fail with a NameError.
# NOTE(review): the literal 'unknown' follows the notebook's notes ("Create an
# `unknown` group") - confirm this is the intended token.
unknown_token = 'unknown'

print("NA info in the begginning:")
print(get_na_info(df))

# Drop the columns this notebook does not keep for modelling.
df.drop(columns=['weight', 'payer_code', 'diag_1', 'diag_2', 'diag_3'], inplace=True)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated and does not
# update `df` when pandas copy-on-write is active.
df['medical_specialty'] = df['medical_specialty'].fillna(unknown_token)
df['race'] = df['race'].fillna(unknown_token)

print("NA info after cleanup")
print(get_na_info(df))

In [None]:
# Summary statistics of the working frame after the NA cleanup.
df.describe(include='all')

In [None]:
def get_unique_pairs(df):
    """List ``(column_name, number_of_distinct_values)`` for every column.

    The pairs follow the column order of ``df``. Distinct values are counted
    with ``Series.unique()``.
    """
    return [(name, len(df[name].unique())) for name in df.columns]

def get_singular_cols(df):
    """Return the names of columns that hold exactly one distinct value.

    Relies on ``get_unique_pairs`` for the per-column distinct counts and
    keeps the column order of ``df``.
    """
    return [name for name, n_unique in get_unique_pairs(df) if n_unique == 1]

# Columns with a single distinct value carry no information for a model.
sing = get_singular_cols(df)
print("Singular columns before", sing)

# Remove them in place, then re-check that none are left.
df.drop(columns=sing, inplace=True)
print("Singular columns after", get_singular_cols(df))

df.describe(include='all')

In [None]:
print('Unique pairs before:')
print(get_unique_pairs(df))

# Remove the encounter and patient identifier columns from the working frame.
df.drop(columns='encounter_id', inplace=True)
#TODO: Could drop subsequent patient visits
df.drop(columns='patient_nbr', inplace=True)

print('Unique pairs after:')
print(get_unique_pairs(df))

In [None]:
# Preview the working frame after the column drops.
df.head()

#### Encode categorical values

* Encode `age` as the midpoint between the two boundaries of its bracket
* Encode `[admission_type_id, discharge_disposition_id, admission_source_id]` and all non-numerical columns to binary format

In [None]:
def age_bracket_midpoint(bracket):
    """Return the midpoint of an age bracket label.

    The label is parsed as two numbers separated by ``-`` and optionally
    wrapped in brackets/parentheses; the result is their mean as a float.
    Non-string values (already encoded numbers, NaN) are returned unchanged
    so the cell can be re-run without error.

    NOTE(review): assumes labels look like '[10-20)' - confirm against the
    data; any other shape raises ValueError instead of mapping silently.
    """
    if not isinstance(bracket, str):
        return bracket
    low, high = bracket.strip('[]()').split('-')
    return (float(low) + float(high)) / 2


# Derive each midpoint from its own label. Pairing `unique()` (order of first
# appearance) with an ascending linspace only works when the rows happen to be
# sorted by age and every bracket is present.
cat_ages = df['age'].unique()
num_ages = np.array([age_bracket_midpoint(cat) for cat in cat_ages])

for cat, num in zip(cat_ages, num_ages):
    print(cat,'with', num)

# Assign back rather than using replace(inplace=True) on a column selection,
# which does not update `df` when pandas copy-on-write is active.
df['age'] = df['age'].map(age_bracket_midpoint)

df.describe(include='all')

In [None]:
# Detach the target column: `pop` returns it and removes it from `df` in place.
label = df.pop("readmitted")
df.head()

In [None]:
# Integer-coded ID columns that are to be treated as categories.
id_cols = [
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
]

# Every column stored with the generic `object` dtype.
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols

In [None]:
from sklearn.preprocessing import LabelBinarizer

# All columns to binarize: the ID columns followed by the object-dtype columns.
to_bin = np.concatenate([id_cols, categorical_cols])
print(to_bin)
# Number of columns selected for binarization.
to_bin.shape

In [None]:
# Preview the frame as it stands before binarization.
df.head()

In [None]:
from sklearn.preprocessing import LabelBinarizer

def bin_column(df, cols):
    """Replace ``cols`` of ``df`` with indicator (dummy) columns.

    Uses ``pd.get_dummies`` with ``drop_first=True``, so the first level of
    each encoded column is omitted. Columns not listed in ``cols`` are kept
    as they are. A new DataFrame is returned.
    """
    encoded = pd.get_dummies(data=df, columns=cols, drop_first=True)
    return encoded

# Keep the encoded frame: bin_column returns a new DataFrame and leaves `df`
# untouched, so a bare call would throw the encoding away.
df_encoded = bin_column(df, to_bin)
# Preview only the first rows rather than displaying the whole frame.
df_encoded.head()

## Split into train/test & Scale
* `from sklearn.model_selection import StratifiedShuffleSplit`
* `from sklearn.preprocessing import StandardScaler`

## Model fitting
* 3 models

## Evaluation
* RMSE & accuracy
* Confusion matrix
* F1 (or Fβ) scores & compare