# Data Preparation
Author: Lin Lee Cheong

Notebook for preparing data for pre-processing and modeling:
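- Copy the raw input CSV from S3 into the `raw_data/` folder
- Split the data into 5 stratified folds, written to `fold_0/` through `fold_4/`, each with `train/` and `test/` subfolders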

## 0. Install packages - First time only

In [None]:
# One-time setup: install the `black` formatter and the jupyter-black notebook extension.
# `%pip` (instead of `! pip`) installs into the environment of the running kernel,
# not into whichever `pip` happens to be first on the shell PATH.
%pip install black
! jupyter nbextension install https://github.com/drillan/jupyter-black/archive/master.zip --user
! jupyter nbextension enable jupyter-black-master/jupyter-black
print('Completed')

## 1. Download raw data

In [1]:
# Copy the readmission input/target CSV from S3 into the local raw_data/ folder.
!aws s3 cp s3://cmsai-mrk-amzn/CSVModelInputs/readmission_input_targets_365_v2.csv raw_data/

download: s3://cmsai-mrk-amzn/CSVModelInputs/readmission_input_targets_365_v2.csv to raw_data/readmission_input_targets_365_v2.csv


## 2. Split files

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold

In [3]:
# Load the downloaded CSV into a DataFrame.
# low_memory=False asks pandas to parse the file in a single pass rather than
# in chunks (see the pandas read_csv docs), which avoids mixed-dtype columns
# at the cost of higher peak memory.
raw_df = pd.read_csv(
    "./raw_data/readmission_input_targets_365_v2.csv", low_memory=False
)
# Preview the first rows of the loaded frame.
raw_df.head()

Unnamed: 0,patient_id,discharge_dt,discharge_id,365,364,363,362,361,360,359,...,8,7,6,5,4,3,2,1,0,unplanned_readmission
0,100002085,20110922,100002085_20110922,,,,,,,"d_7295, d_78650, d_78652, d_78659, d_78659, h_...",...,"d_486, d_5119, d_5128, d_5183, d_80709, h_7101...","d_496, d_80709, h_99232, h_99233","d_5119, d_80709, d_8600, h_00528, h_31645, h_3...","d_496, d_51889, h_71010","d_496, d_51189, d_5119, d_7931, d_80709, h_710...","d_5119, d_51919, d_7931, d_80709, h_00520, h_7...","d_5119, d_5128, d_5180, d_7931, d_80709, h_710...","d_5119, d_5183, d_80709, d_8600, d_V5399, h_32...","admission, d_496, d_72887, d_78605, d_78650, d...",False
1,100002829,20111013,100002829_20111013,,"d_28521, d_58881, h_82310, h_84100",,,,,,...,"h_90999, h_J1270",,"h_90999, h_J1270",,,"h_90999, h_J1270, h_J1756","admission, d_40391, d_5856, d_5856, d_59970, d...","d_5856, d_59970, d_92303, d_9233, h_00400, h_1...","d_4019, d_5856, d_59970, discharge, h_90732, h...",False
2,100003379,20091207,100003379_20091207,,,,,,,,...,,"d_586, h_99231","d_99883, h_99231","d_586, d_99883, h_11042, h_99231, p_8622",,"d_99883, h_99232",,,"admission, d_586, d_71945, d_V4989, discharge,...",True
3,100004211,20110102,100004211_20110102,,,,"d_42731, d_42822, d_78650, h_93010, h_99214, h...","d_53081, d_78902, h_99214",,,...,"d_4019, d_42731, h_99233",,"d_1950, d_5119, d_5738, h_71010, h_76705, h_99232","d_1539, d_V667, h_99233","d_1975, h_99233","d_1975, d_51881, h_99233","d_1975, d_42731, h_99233","d_1975, h_99233","d_1975, d_42731, d_51881, death, discharge, h_...",False
4,100008869,20101116,100008869_20101116,,"d_29633, h_90806","d_53550, h_99213",,,,,...,"d_29620, h_99231","d_29620, h_99231",,,"admission, d_25000, d_29620, d_29623, d_4019, ...","d_41401, d_78650, h_93306, h_99222, h_99232","d_78650, h_99231",,"d_78650, discharge, h_99238",False


In [4]:
# (number of rows, number of columns) of the loaded frame.
raw_df.shape

(1619158, 370)

In [5]:
def check_make_dirs(fpath):
    """Create every directory in ``fpath`` that does not already exist.

    Missing parent directories are created as well. Directories that already
    exist are left untouched.

    Args:
        fpath: A single path (``str``, ``bytes`` or ``os.PathLike``) or an
            iterable of such paths (list, tuple, ...).

    Raises:
        FileExistsError: If a path exists but is not a directory.
    """
    # A single path is wrapped so the loop below handles both call styles.
    if isinstance(fpath, (str, bytes, os.PathLike)):
        fpath = [fpath]
    for path in fpath:
        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.isdir() test followed by os.makedirs().
        os.makedirs(path, exist_ok=True)

In [6]:
# Stratified 5-fold split on the readmission label; the fixed random_state
# makes the shuffled fold assignment reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
target = raw_df["unplanned_readmission"]
for fold, (train_idx, test_idx) in enumerate(skf.split(raw_df, target)):
    # skf.split yields positional indices, so select with .iloc (consistent
    # with the row selection below) and use the vectorized mean to get the
    # positive-label fraction instead of Python's builtin sum().
    train_ratio = target.iloc[train_idx].mean()
    test_ratio = target.iloc[test_idx].mean()
    print(
        f"Fold number: {fold}: \n\
        train len = {len(train_idx)},\n\
        ratio = {train_ratio},\n\
        test len = {len(test_idx)}, \n\
        test ratio = {test_ratio}"
    )

    # make directory paths: ./fold_<k>/train and ./fold_<k>/test
    fold_dir = os.path.join("./", "fold_" + str(fold))
    train_fold_dir = os.path.join(fold_dir, "train")
    test_fold_dir = os.path.join(fold_dir, "test")
    check_make_dirs([fold_dir, train_fold_dir, test_fold_dir])

    # Write each partition without the DataFrame index column.
    raw_df.iloc[train_idx].to_csv(
        os.path.join(train_fold_dir, "raw_train_data.csv"), index=False
    )
    raw_df.iloc[test_idx].to_csv(
        os.path.join(test_fold_dir, "raw_test_data.csv"), index=False
    )

Fold number: 0: 
        train len = 1295326,
        ratio = 0.14456437993186713,
        test len = 323832, 
        test ratio = 0.1445657007337037
Fold number: 1: 
        train len = 1295326,
        ratio = 0.14456437993186713,
        test len = 323832, 
        test ratio = 0.1445657007337037
Fold number: 2: 
        train len = 1295326,
        ratio = 0.14456437993186713,
        test len = 323832, 
        test ratio = 0.1445657007337037
Fold number: 3: 
        train len = 1295327,
        ratio = 0.1445650403335202,
        test len = 323831, 
        test ratio = 0.144563059126681
Fold number: 4: 
        train len = 1295327,
        ratio = 0.1445650403335202,
        test len = 323831, 
        test ratio = 0.144563059126681
