In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm

# user defined methods
import sys
sys.path.append('../utils')

from modeling import split_data, train_model

# Seed the global `random` and NumPy generators with a fixed value.
# The estimators below additionally receive their own explicit random_state.
random.seed(0)
np.random.seed(0)

In [2]:
# suppress warnings
import warnings

# NOTE(review): with no category/module argument this hides every warning for the
# rest of the session, including convergence and deprecation warnings from the models.
warnings.filterwarnings("ignore")

In [3]:
# Create the directory the pickled models are written to; exist_ok=True makes re-running this cell safe.
os.makedirs('../../models/cheap_features/', exist_ok=True)

# Models: Finding cheap features to predict Central Sleep Apnea

In [4]:
# Directory whose entries are each loaded with pd.read_csv by the model sweep.
folder_loc = '../../data/processed/cheap_features/'
# os.listdir returns entries in arbitrary order; sort them so the sweep (and any
# tie-breaking between equally good models) is reproducible across machines.
datasets = sorted(os.listdir(folder_loc))

In [5]:
# Name of the column to predict; every other column of a dataset is used as a feature.
target = 'ahi_c0h4'

In [6]:
import pickle

# Sweep every dataset with six default-configured regressors and keep the single
# (dataset, model) pair with the lowest MAE reported by train_model.
# NOTE(review): every non-target column is used as a feature, which includes
# identifier-like columns such as 'nsrrid' — confirm that is intended.
best_model = None
model_name = None
best_mae = float('inf')  # any finite MAE beats this (replaces the magic 100000 sentinel)
best_dataset = None

# Short name -> factory returning a fresh, unfitted estimator.
# Insertion order matters: with the strict `<` below, ties go to the earlier entry.
candidate_models = {
    'xgb': lambda: XGBRegressor(random_state=1),
    'rf': lambda: RandomForestRegressor(random_state=1),
    'lr': lambda: LinearRegression(),
    'lasso': lambda: Lasso(random_state=1),
    'ridge': lambda: Ridge(random_state=1),
    'dt': lambda: DecisionTreeRegressor(random_state=1),
}

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    # list.remove raises ValueError if the target column is missing from this file.
    features = df.columns.tolist()
    features.remove(target)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train each candidate and keep it only if it beats the best MAE seen so far.
    for candidate_name, make_model in candidate_models.items():
        candidate_mae, candidate_model = train_model(
            make_model(), X_train, y_train, X_test, y_test, X_val, y_val
        )
        if candidate_mae < best_mae:
            best_mae = candidate_mae
            best_model = candidate_model
            model_name = candidate_name
            best_dataset = dataset

# Save model
if best_model is None:
    # Previously this surfaced as an opaque TypeError from `None + '.pkl'`.
    raise RuntimeError('No model was selected: `datasets` is empty or no valid MAE was produced')

# `with` guarantees the file is flushed and closed even if pickling fails.
with open('../../models/cheap_features/' + model_name + '.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [1:22:43<00:00, 39.08s/it]


In [7]:
# Report the short name and MAE of the winning model from the sweep.
print(model_name, best_mae)

ridge 4.635646174021972


In [8]:
# Column names the winning model was fitted with.
best_model.feature_names_in_

array(['hip', 'neck20', 'coffee15', 'tea15', 'soda15', 'evsmok15',
       'smknow15', 'asa15', 'smokstat_s1', 'bmi_s1', 'weight', 'waist',
       'height', 'weight20', 'nsrrid'], dtype=object)

In [9]:
# Load the SHHS variable dictionary (version 0.20.0 per the file name) to look up metadata for the selected features.
features_dict = pd.read_csv('../../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [10]:
# Restrict the data dictionary to the variables the best model was trained on
# (rows whose `id` appears in best_model.feature_names_in_).
features_dict = features_dict.loc[lambda frame: frame['id'].isin(best_model.feature_names_in_)]


In [11]:
# Export the display name, description and folder of each selected variable.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
features_dict[['display_name', 'description', 'folder']].to_csv('./results.csv')

In [134]:
# Anthropometric features and Lifestyle and Behavioural features are the most important features

# Hyperparameter tuning of the best model

In [12]:
# load data and model
import pickle

# Reload the pickled Ridge model; `with` closes the file handle, which the bare
# open() call previously leaked.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
with open('../../models/cheap_features/ridge.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)
# NOTE(review): presumably 'Ant_Lif.csv' is the dataset the sweep selected (best_dataset) — confirm.
df = pd.read_csv('../../data/processed/cheap_features/Ant_Lif.csv')

In [13]:
# Every column except the target is treated as a feature.
features = df.columns.tolist()
features.remove(target)
# NOTE(review): split_data comes from ../utils/modeling; its split ratios and shuffling are not visible here.
# Mind the unpacking order: the test pair comes before the validation pair.
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [14]:
# Randomised hyperparameter search for Ridge: 20 sampled configurations,
# 5-fold cross-validation, scored by negated mean absolute error.
from scipy.stats import uniform, truncnorm, randint
from sklearn.model_selection import RandomizedSearchCV

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

model = Ridge(random_state=1)

# alpha is sampled from scipy's uniform(0, 10); solver is drawn from the listed options.
param_grid = dict(
    alpha=uniform(0, 10),
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    random_state=1,
)
search.fit(X_train, y_train)

# Best parameters, best (negated) CV score and the refitted estimator, one per line.
print(search.best_params_, search.best_score_, search.best_estimator_, sep='\n')
best_param = search.best_params_



{'alpha': 4.434528937795567, 'solver': 'sparse_cg'}
-4.723208539258343
Ridge(alpha=4.434528937795567, random_state=1, solver='sparse_cg')


In [15]:
# Refit Ridge with the tuned hyperparameters on the training data, then print
# the MAE on (X_val, y_val) followed by the MAE on (X_test, y_test).
model = Ridge(**best_param, random_state=1).fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae, mae_test, sep='\n')

4.625547550719264
4.569842143977


In [16]:
# 5-fold cross-validated score of the tuned Ridge on the training data.
# Scores are negated MAE values, so values closer to zero are better.
from sklearn.model_selection import cross_val_score

model = Ridge(
    random_state=1,
    alpha=best_param['alpha'],
    solver=best_param['solver'],
)
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
# Per-fold scores first, then their mean.
print(scores, scores.mean(), sep='\n')

[-4.75856807 -4.8232436  -4.75379321 -4.76452204 -4.51591528]
-4.7232084399072125


# Feature selection models

In [17]:
# List the feature-selection outputs. This rebinds `datasets`, replacing the cheap-feature file list.
# NOTE(review): os.listdir order is not guaranteed; wrap in sorted() if a deterministic sweep order matters.
datasets = os.listdir('../../data/interim/feature_selection')

In [18]:
# Keep only the files whose name contains 'ahi_c0h4a', then show which ones remain.
datasets = list(filter(lambda file_name: 'ahi_c0h4a' in file_name, datasets))

print(datasets)

['decision_tree_ahi_c0h4a.csv', 'mi_ahi_c0h4a.csv', 'random_forest_ahi_c0h4a.csv', 'forward_selection_ahi_c0h4a_AIC.csv']


In [19]:
# Rebind folder_loc to the feature-selection directory; the trailing slash is required
# because file names are appended with plain string concatenation.
folder_loc = '../../data/interim/feature_selection/'

In [35]:
# Target column for the feature-selection datasets. Their file names carry the
# suffix 'ahi_c0h4a', and the reloaded best dataset exposes a column 'ahi_c0h4a'
# (and no 'ahi_c0h4'), so `features.remove(target)` would raise ValueError with
# the old value 'ahi_c0h4' on a fresh Restart & Run All.
# NOTE(review): confirm against the CSV headers in data/interim/feature_selection.
target = 'ahi_c0h4a'

In [36]:
import pickle

# Sweep every feature-selection dataset with six default-configured regressors and
# keep the single (dataset, model) pair with the lowest MAE reported by train_model.
# NOTE(review): every non-target column is used as a feature, which includes
# identifier-like columns such as 'nsrrid' / 'pptid' — confirm that is intended.
best_model = None
model_name = None
best_mae = float('inf')  # any finite MAE beats this (replaces the magic 100000 sentinel)
best_dataset = None

# Short name -> factory returning a fresh, unfitted estimator.
# Insertion order matters: with the strict `<` below, ties go to the earlier entry.
candidate_models = {
    'xgb': lambda: XGBRegressor(random_state=1),
    'rf': lambda: RandomForestRegressor(random_state=1),
    'lr': lambda: LinearRegression(),
    'lasso': lambda: Lasso(random_state=1),
    'ridge': lambda: Ridge(random_state=1),
    'dt': lambda: DecisionTreeRegressor(random_state=1),
}

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    # list.remove raises ValueError if the target column is missing from this file.
    features = df.columns.tolist()
    features.remove(target)
    print(dataset, features)
    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train each candidate and keep it only if it beats the best MAE seen so far.
    for candidate_name, make_model in candidate_models.items():
        candidate_mae, candidate_model = train_model(
            make_model(), X_train, y_train, X_test, y_test, X_val, y_val
        )
        if candidate_mae < best_mae:
            best_mae = candidate_mae
            best_model = candidate_model
            model_name = candidate_name
            best_dataset = dataset
    # Running best MAE after this dataset (unchanged if nothing improved on it).
    print(best_mae)

# Save model
if best_model is None:
    # Previously this surfaced as an opaque TypeError from `None + '_feature_selection.pkl'`.
    raise RuntimeError('No model was selected: `datasets` is empty or no valid MAE was produced')

# `with` guarantees the file is flushed and closed even if pickling fails.
with open('../../models/cheap_features/' + model_name + '_feature_selection.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

  0%|          | 0/4 [00:00<?, ?it/s]

decision_tree_ahi_c0h4a.csv ['nsrrid', 'pptid', 'ecgdate', 'lvh3_1', 'lvh3_3', 'st4_1_3', 'st5_1_3', 'lvhst', 'lbbb', 'rbbb', 'ilbbb', 'irbbb', 'lah', 'iventblk', 'antsepmi', 'infmi', 'nonsp_st', 'nonsp_tw', 'ventrate', 'qrs', 'apbs', 'vpbs', 'truposmi', 'gender', 'race', 'mstat', 'srhype', 'parrptdiab', 'cgpkyr', 'alcoh', 'systbp', 'diasbp', 'hip', 'chol', 'hdl', 'trig', 'fev1', 'fvc', 'aai', 'ankbp', 'armbp', 'imdbpae', 'ursbpae', 'urdbpae', 'skrctnae', 'syst120', 'dias120', 'syst220', 'dias220', 'syst320', 'dias320', 'neck20', 'angina15', 'mi15', 'stroke15', 'hf15', 'cabg15', 'ca15', 'othrcs15', 'sa15', 'emphys15', 'crbron15', 'copd15', 'asthma15', 'asth1215', 'cough315', 'phlegm15', 'runny15', 'sinus15', 'coffee15', 'tea15', 'soda15', 'evsmok15', 'ns1yr15', 'yrsns15', 'smknow15', 'cigday15', 'avesmk15', 'wine15', 'beer15', 'shots15', 'asa15', 'asalw15', 'slpill15', 'nitro15', 'napshr15', 'napsmn15', 'stress15', 'estrgn1', 'progst1', 'htnmed1', 'anar1a1', 'lipid1', 'ohga1', 'insuln1

 25%|██▌       | 1/4 [01:31<04:34, 91.50s/it]

5.819445065906095
mi_ahi_c0h4a.csv ['av1deg', 'lbbb', 'rbbb', 'antsepmi', 'rtrial', 'rvh', 'qrs', 'afib', 'paced', 'apbs', 'vpbs', 'truposmi', 'gender', 'race', 'mstat', 'srhype', 'parrptdiab', 'cgpkyr', 'alcoh', 'systbp', 'hip', 'hdl', 'trig', 'fev1', 'fvc', 'armbp', 'ecg', 'imlohrae', 'urdbpae', 'skrctnae', 'tripae', 'othprbae', 'syst120', 'dias120', 'syst220', 'syst320', 'neck20', 'angina15', 'mi15', 'stroke15', 'hf15', 'cabg15', 'othrcs15', 'pacem15', 'sa15', 'emphys15', 'copd15', 'asthma15', 'asth1215', 'cough315', 'phlegm15', 'sinus15', 'coffee15', 'tea15', 'ns1yr15', 'yrsns15', 'smknow15', 'cigday15', 'avesmk15', 'wine15', 'beer15', 'asa15', 'asalw15', 'slpill15', 'nitro15', 'napsmn15', 'estrgn1', 'progst1', 'htnmed1', 'lipid1', 'ohga1', 'insuln1', 'sympth1', 'asa1', 'nsaid1', 'benzod1', 'premar1', 'pdei1', 'ntca1', 'warf1', 'loop1', 'hctz1', 'ccbsr1', 'alpha1', 'alphad1', 'anar1c1', 'anar31', 'pvdl1', 'niac1', 'thry1', 'betad1', 'ccb1', 'aced1', 'diuret1', 'ntg1', 'modact25', '

 50%|█████     | 2/4 [02:30<02:25, 72.67s/it]

5.819445065906095
random_forest_ahi_c0h4a.csv ['nsrrid', 'pptid', 'ecgdate', 'lvh3_1', 'lvh3_3', 'st4_1_3', 'st5_1_3', 'lvhst', 'lbbb', 'rbbb', 'ilbbb', 'irbbb', 'lah', 'iventblk', 'antsepmi', 'infmi', 'nonsp_st', 'nonsp_tw', 'ventrate', 'qrs', 'apbs', 'vpbs', 'truposmi', 'gender', 'race', 'mstat', 'srhype', 'parrptdiab', 'cgpkyr', 'alcoh', 'systbp', 'diasbp', 'hip', 'chol', 'hdl', 'trig', 'fev1', 'fvc', 'aai', 'ankbp', 'armbp', 'imdbpae', 'ursbpae', 'urdbpae', 'skrctnae', 'syst120', 'dias120', 'syst220', 'dias220', 'syst320', 'dias320', 'neck20', 'angina15', 'mi15', 'stroke15', 'hf15', 'cabg15', 'ca15', 'othrcs15', 'sa15', 'emphys15', 'crbron15', 'copd15', 'asthma15', 'asth1215', 'cough315', 'phlegm15', 'runny15', 'sinus15', 'coffee15', 'tea15', 'soda15', 'evsmok15', 'ns1yr15', 'yrsns15', 'smknow15', 'cigday15', 'avesmk15', 'wine15', 'beer15', 'shots15', 'asa15', 'asalw15', 'slpill15', 'nitro15', 'napshr15', 'napsmn15', 'stress15', 'estrgn1', 'progst1', 'htnmed1', 'anar1a1', 'lipid1',

 75%|███████▌  | 3/4 [04:01<01:20, 80.61s/it]

5.819445065906095
forward_selection_ahi_c0h4a_AIC.csv ['neck20', 'bmi_s1', 'age_s1', 'gender', 'diasbp', 'funres02', 'shhs1_tcvd', 'ess_s1', 'nonsp_st', 'hosnr02', 'benzod1', 'ccb1', 'nsrrid', 'shhs1_qc', 'twuweh02', 'race', 'estrgn1', 'waist', 'soda15', 'tea15', 'urdbpae', 'diuret1', 'pvdl1', 'height', 'weight', 'systbp', 'hvsnrd02', 'ntca1']


100%|██████████| 4/4 [04:48<00:00, 72.01s/it]

5.7830904622074595





In [39]:
# File name of the dataset that produced the lowest MAE in the feature-selection sweep.
best_dataset


'forward_selection_ahi_c0h4a_AIC.csv'

In [40]:
# Reload the winning feature-selection dataset; this rebinds `df`.
df = pd.read_csv(f'../../data/interim/feature_selection/{best_dataset}')

In [47]:
# Use 'ahi_c0h4a' as the target column for the cells that follow.
# NOTE(review): another cell in this notebook sets target to 'ahi_c0h4'; confirm both
# spellings refer to the intended columns.
target="ahi_c0h4a"

In [46]:
# Inspect the 'ahi_c0h4a' column of the reloaded dataset.
df["ahi_c0h4a"]

0        4.314248
1       23.406593
2        4.853556
3        2.591362
4        5.513514
          ...    
5799    21.669196
5800    17.901391
5801     5.625000
5802     6.971570
5803    29.411765
Name: ahi_c0h4a, Length: 5804, dtype: float64

In [48]:
# Every column except the current target is treated as a feature.
features = df.columns.tolist()
features.remove(target)

# Split data
# Mind the unpacking order: the test pair comes before the validation pair.
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

print(features)

['neck20', 'bmi_s1', 'age_s1', 'gender', 'diasbp', 'funres02', 'shhs1_tcvd', 'ess_s1', 'nonsp_st', 'hosnr02', 'benzod1', 'ccb1', 'nsrrid', 'shhs1_qc', 'twuweh02', 'race', 'estrgn1', 'waist', 'soda15', 'tea15', 'urdbpae', 'diuret1', 'pvdl1', 'height', 'weight', 'systbp', 'hvsnrd02', 'ntca1']


In [49]:
# Report the short name and MAE of the winning model from the feature-selection sweep.
print(model_name, best_mae)

ridge 5.7830904622074595


In [50]:
# Forward selection is the best of the feature selection methods (MAE ≈ 5.78), but its results are still not as good as our simple cheap-feature selection (MAE ≈ 4.64).