In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory holding the project's CSV files, resolved against the directory
# the notebook is launched from.
DATA_PATH = Path.cwd().joinpath('Data')
# Seed passed as ``random_state`` to the train/test splitter.
RAND_STATE = 0

In [3]:
def make_test_train_full(from_path=DATA_PATH/'Final_data.csv', to_path='Data/'):
    """Create train, test and full CSV files from the preprocessed raw data.

    Reads ``from_path``, drops the ``AlkPhos_UL`` column when present, splits
    the rows 90/10 into train/test sets stratified by binned age, encodes
    ``GENDER``, standardises the remaining feature columns and writes
    ``train_data.csv``, ``test_data.csv`` and ``full_data.csv``.

    The encoder and scaler are fitted on the training rows only and then
    applied unchanged to the test rows, so both files share one scale and no
    test-set statistics leak into the preprocessing.

    Parameters
    ----------
    from_path : path-like
        CSV file to read. Must contain ``SEQN``, ``GENDER`` and ``AGE``
        columns.
    to_path : str
        String prefix the three output file names are appended to; a
        directory therefore has to end with a path separator.

    Returns
    -------
    None
    """
    full_df = pd.read_csv(from_path, low_memory=False)
    full_df = full_df.drop(columns='AlkPhos_UL', errors='ignore')

    # Bin AGE into equal-width groups, used only to stratify the split.
    AGE_GROUP_AMOUNT = 8
    full_df['AGE_GROUP'] = pd.cut(full_df['AGE'], bins=AGE_GROUP_AMOUNT,
                                  labels=range(AGE_GROUP_AMOUNT))

    # Single stratified split with proportional age groups.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.1,
                                   random_state=RAND_STATE)
    train_index, test_index = next(split.split(full_df, full_df['AGE_GROUP']))
    strat_train_set = full_df.iloc[train_index]
    strat_test_set = full_df.iloc[test_index]

    # Everything between the first two and the last two columns is scaled.
    # NOTE(review): assumes the file layout is SEQN, GENDER, <features...>,
    # AGE (AGE_GROUP is appended above) - confirm against Final_data.csv.
    scaled_columns = list(strat_train_set.columns)[2:-2]

    full_transform = ColumnTransformer([
        ('encoder', OrdinalEncoder(), ['GENDER']),
        ('drop', 'drop', ['SEQN', 'AGE_GROUP']),
        ('scaler', StandardScaler(), scaled_columns),
        ('passthrough', 'passthrough', ["AGE"]),
    ])

    # ColumnTransformer emits its outputs in the order the transformers are
    # listed; dropped columns contribute nothing.
    output_columns = ['GENDER'] + scaled_columns + ['AGE']

    # Fit on the training rows only, then reuse the fitted transformers for
    # the test rows.
    train = pd.DataFrame(full_transform.fit_transform(strat_train_set),
                         columns=output_columns)
    test = pd.DataFrame(full_transform.transform(strat_test_set),
                        columns=output_columns)
    train.to_csv(to_path+'train_data.csv', index=False)
    test.to_csv(to_path+'test_data.csv', index=False)

    # The "full" file is the transformed train rows followed by the test rows.
    combined = pd.concat([train, test]).reset_index(drop=True)
    combined.to_csv(to_path+'full_data.csv', index=False)

In [4]:
def make_biomarkers_units(to_path="Data/non_NHANES/markers_units.csv"):
    """Write a two-column CSV (``Biomarker,Unit``) to ``to_path``.

    The rows are parsed from the embedded table below: the text before ``[``
    is the biomarker name and the text after the ``] `` is written as its
    unit string. The fifth entry (index 4) is replaced with
    ``'6.4 - 8.3 g/dl'`` before writing. The file has no trailing newline.
    """
    reference_table = '''Albumin**                   [                ] 35 - 52 g/l
    Glucose**                        [                ] 3.9 - 5.8 mmole/l
    Urea**(BUN)                      [                ] 2.5 - 6.4 mmole/l
    Cholesterol**                    [                ] 3.37 - 5.96 mmole/l
    Protein total**                  [                ] 64 - 83 g/l
    Sodium**                         [                ] 136 - 146 mmole/l
    Creatinine**                     [                ] 53 - 97 mmole/l
    Hemoglobin**                     [                ] 11.7 - 15.5 g/dl
    Bilirubin total                  [                ] 1.7 - 21 mcmole/l
    Triglycerides                    [                ] 0.68 - 6 mmole/l
    HDL Cholesterol                  [                ] < 3.3 mmole/l
    LDL cholesterol (by Friedewald)  [                ] 1.81- 4.04 mmole/l
    Calcium                          [                ] 2.15 - 2.65 mmole/l
    Potassium                        [                ] 3.4 - 5.1 mmole/l
    Hematocrit                       [                ] 37 - 50 %
    MCHC                             [                ] 31.5 - 35.7 g/dL
    MCV                              [                ] 82 - 95 fl
    Platelets                        [                ] 150 - 450 10^3 /mcl
    Erythrocytes (RBC)               [                ] 3.5 - 5.5 10^6 /mcl'''

    rows = []
    for raw_line in reference_table.splitlines():
        name_part, range_part = raw_line.split('[')
        # After stripping, range_part starts with "] "; drop those two chars.
        rows.append([name_part.strip(), range_part.strip()[2:]])

    # Override the fifth row's unit string.
    rows[4][1] = '6.4 - 8.3 g/dl'

    csv_lines = ['Biomarker,Unit']
    csv_lines.extend(name + ',' + unit for name, unit in rows)

    with open(to_path, "w") as out_file:
        out_file.write('\n'.join(csv_lines))

In [5]:
# Load the sample rows as a list of per-row value lists.
# The CSV is read once instead of once per row.
# NOTE: this file is written by make_samples(), which is only called in a
# later cell, so it must already exist on disk when this cell runs.
_samples_frame = pd.read_csv('Data/non_NHANES/samples_from_aging.csv')
samples = [list(_samples_frame.iloc[i]) for i in range(len(_samples_frame))]

In [6]:
def make_samples(from_path='Data/test_data.csv',
                 to_path='Data/non_NHANES/samples_from_aging.csv'):
    """Write the hard-coded sample rows to a CSV with the test-set columns.

    Only the header of ``from_path`` is used: its column names label the
    sample values, so the number of rows written never depends on how many
    rows the template file happens to contain.

    Parameters
    ----------
    from_path : path-like
        CSV whose header supplies the output column names (21 columns are
        needed, one per value in each sample row).
    to_path : path-like
        Destination CSV; written without the index.

    Raises
    ------
    ValueError
        If the template's column count differs from the sample row length.
    """
    # Each list holds 21 values in the template's column order.
    # NOTE(review): the last value appears to be the age in the name - confirm.
    MALE69 = [1.0, 37.0, 5.16, 3.89, 4.73, 5.9, 140.0, 88.4, 14.4, 18.81, 1.11,
              1.05, 3.13, 2.05, 4.4, 44.2, 32.6, 91.0, 188.0, 4.86, 69.0]
    MALE40 = [1.0, 49.21, 5.28, 7.3, 7.04, 7.0, 143.0, 120, 16.5, 17.2, 1.35,  # 100.5 Creatine -> 120
              1.6, 4.83, 2.48, 4.7, 46.7, 35.4, 86.5, 177.0, 5.41, 40.0]
    # Defined for reference; not included in ``people`` below.
    MALE29 = [1.0, 47.64, 5.06, 6.0, 5.35, 8.2, 139.0, 72.7, 13.6, 10.9, 0.74,
              1.38, 3.63, 2.62, 4.7, 40.8, 33.3, 73.2, 206.0, 5.57, 29.0]

    people = [MALE69, MALE40]

    # nrows=0 parses just the header, which is all that is needed here.
    columns = pd.read_csv(from_path, nrows=0).columns
    samples_df = pd.DataFrame(people, columns=columns)

    samples_df.to_csv(to_path, index=False)

In [7]:
# Run the generators in order: make_samples() reads Data/test_data.csv,
# which make_test_train_full() writes when called with its defaults.
for build_step in (make_test_train_full, make_biomarkers_units, make_samples):
    build_step()