In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory holding the CSV files, resolved against the working directory
# at the moment this cell runs.
DATA_PATH = Path.cwd().joinpath('Data')

In [3]:
# Seed passed as `random_state` to the train/test splitter so the split is
# reproducible between runs.
RAND_STATE = 0

In [4]:
# Load the raw table and discard the 'AlkPhos_UL' column when it is present
# (errors='ignore' makes the drop a no-op if the column is missing).
full_df = (
    pd.read_csv(DATA_PATH/'Final_data.csv', low_memory=False)
    .drop(columns='AlkPhos_UL', errors='ignore')
)

In [5]:
# Bucket AGE into AGE_GROUP_AMOUNT equal-width bins (labelled 0..N-1) so the
# train/test split can be stratified on age.
AGE_GROUP_AMOUNT = 8
age_group_labels = range(AGE_GROUP_AMOUNT)
full_df['AGE_GROUP'] = pd.cut(
    full_df['AGE'],
    bins=AGE_GROUP_AMOUNT,
    labels=age_group_labels,
)

# One shuffled split that holds out 10% of the rows as the test set while
# keeping the age-group proportions the same on both sides.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,
    random_state=RAND_STATE,
)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(full_df, full_df['AGE_GROUP']))
strat_train_set = full_df.iloc[train_index]
strat_test_set = full_df.iloc[test_index]

In [6]:
# Columns to standardise: everything except the first two and the last two
# columns of the training frame.
# NOTE(review): this slice is positional -- presumably the frame is laid out as
# SEQN, GENDER, <numeric features>, AGE, AGE_GROUP; confirm against the CSV.
scaled_columns = strat_train_set.columns[2:-2].tolist()

# Output column order follows the order of the non-dropped steps:
# encoded GENDER, the scaled columns, then AGE unchanged.
transform_steps = [
    ('encoder', OrdinalEncoder(), ['GENDER']),
    ('drop', 'drop', ['SEQN', 'AGE_GROUP']),
    ('scaler', StandardScaler(), scaled_columns),
    ('passthrough', 'passthrough', ["AGE"]),
]
full_transform = ColumnTransformer(transform_steps)

# Fit the encoder/scaler on the training rows and keep the transformed array.
trans_train = full_transform.fit_transform(strat_train_set)

In [7]:
# Labels for the transformed output: every original column except the first
# and the last one (the ones removed by the 'drop' step of `full_transform`).
feature_columns = strat_train_set.columns[1:-1]

# Fit the encoder and scaler on the TRAINING rows only. Re-fitting here (rather
# than relying on an earlier cell) guarantees the transformer state comes from
# the training set regardless of the order in which cells were executed.
train = pd.DataFrame(full_transform.fit_transform(strat_train_set),
                     columns=feature_columns)

# Apply the train-fitted transformer to the test rows. Using `transform`
# instead of `fit_transform` keeps the test set scaled with the training
# mean/std, so test-set statistics never leak into the preprocessing.
test = pd.DataFrame(full_transform.transform(strat_test_set),
                    columns=feature_columns)

In [8]:
# Write the processed splits next to the input file. Going through DATA_PATH
# (instead of a hard-coded 'Data/...' string) keeps the read and write
# locations in sync even if the working directory changes after DATA_PATH
# was set.
train.to_csv(DATA_PATH/'train_data.csv', index=False)
test.to_csv(DATA_PATH/'test_data.csv', index=False)

In [9]:
# Stack the processed train and test rows into one frame with a fresh
# 0..n-1 index. Note that this rebinds `full_df`, which until now referred to
# the raw data loaded from the CSV.
full_df = pd.concat([train, test], ignore_index=True)

In [10]:
# Write the combined frame into the same directory the input was read from,
# using DATA_PATH rather than a hard-coded 'Data/...' string.
full_df.to_csv(DATA_PATH/'full_data.csv', index=False)