In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
import pickle

# user defined methods
import sys
sys.path.append('../utils')

from modeling import split_data, train_model, find_best_data
#import modeling

random.seed(0)
np.random.seed(0)

In [2]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
os.makedirs('../../models/cheap_features/', exist_ok=True)

# Models: Finding cheap features to predict Central Sleep Apnea

In [4]:
folder_loc = '../../data/processed/cheap_features/'
datasets = os.listdir('../../data/processed/cheap_features/')
target = 'ahi_c0h4'

In [5]:
# Save model
# print(folder_loc, target)
# print(find_best_data(folder_loc, target))
# print("HUH")
best_mae, best_model, model_name, best_dataset, results = find_best_data(folder_loc, datasets, target)

pickle.dump(best_model, open('../../models/cheap_features/' + model_name + '.pkl', 'wb'))

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [41:09<00:00, 19.45s/it]


In [6]:
results = pd.read_csv('../../models/cheap_features/all_results.csv')

In [7]:
# process the dataset column with removing the [ and ] characters
results['dataset'] = [x.replace('[', '').replace(']', '') for x in results['dataset']]
results

Unnamed: 0,dataset,mae_xgb,mae_rf,mae_lr,mae_lasso,mae_ridge,mae_dt,best
0,"'Anthropometry, Demographics, Lifestyle and Be...",5.327808,4.844472,4.685196,4.710413,4.684644,7.618460,4.684644
1,"'Anthropometry, Clinical Data, Medical History'",5.102366,4.686072,4.875302,4.764244,4.870743,6.369666,4.686072
2,"'Anthropometry, Clinical Data, Demographics, G...",4.953066,4.864205,134610.319534,4.689135,4.917305,7.043752,4.689135
3,"'Anthropometry, Clinical Data, Demographics, G...",5.038402,4.855924,34271.562139,4.689135,4.926698,6.985511,4.689135
4,"'Anthropometry, Clinical Data, Demographics, G...",5.037567,4.777905,254372.799160,4.690161,4.933587,6.553584,4.690161
...,...,...,...,...,...,...,...,...
122,"'Lifestyle and Behavioral Health, Sleep Treatm...",5.653014,5.775633,5.317446,5.386041,5.307686,6.970905,5.307686
123,'Lifestyle and Behavioral Health',5.722719,5.767644,5.329440,5.386041,5.321404,7.068057,5.321404
124,'Medical History',5.593378,5.477151,5.355756,5.382871,5.351469,6.965778,5.351469
125,"'Medical History, Sleep Treatment'",5.729356,5.469705,5.373650,5.382871,5.361595,6.994889,5.361595


In [8]:
results.to_csv('../../models/cheap_features/' + 'all_results.csv', index=False)

In [9]:
print(model_name, best_mae)

ridge 4.684644069658721


In [10]:
best_model.feature_names_in_

array(['gender', 'race', 'mstat', 'hip', 'neck20', 'coffee15', 'tea15',
       'soda15', 'evsmok15', 'smknow15', 'asa15', 'age_s1', 'smokstat_s1',
       'ethnicity', 'bmi_s1', 'educat', 'weight', 'waist', 'height',
       'weight20', 'age_category_s1', 'nsrrid'], dtype=object)

In [11]:
features_dict = pd.read_csv('../../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [12]:
# keep only the rows where id == best_model.feature_names_in_
features_dict = features_dict[features_dict['id'].isin(best_model.feature_names_in_)]


In [13]:
features_dict[['display_name', 'description', 'folder']].to_csv('./results.csv')

In [14]:
# Antropometric features and Lifestyle and Behavioural features are the most important features

# Hyperparameter tuning of the best model

In [15]:
# load data and model
best_model = pickle.load(open('../../models/cheap_features/ridge.pkl', 'rb'))
df = pd.read_csv('../../data/processed/cheap_features/Ant_Lif.csv')

In [16]:
features = df.columns.tolist()
features.remove(target)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [17]:
# do RandomizedSearchCV to find best hyperparameters
model = Ridge(random_state=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
best_param = search.best_params_



{'alpha': 0.923385947687978, 'solver': 'sparse_cg'}
-4.769160181167741
Ridge(alpha=0.923385947687978, random_state=1, solver='sparse_cg')




In [18]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae)
print(mae_test)

4.710048526179917
4.666369687260867


In [19]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#

[-4.86010864 -4.89426738 -4.76167694 -4.74352474 -4.58622283]
-4.769160104843044


# Feature selection models

In [20]:
datasets = os.listdir('../../data/interim/feature_selection')
datasets = [dataset for dataset in datasets if 'ahi_c0h4a' in dataset]

folder_loc = '../../data/interim/feature_selection/'
target = 'ahi_c0h4a'

In [21]:
# Save model
best_mae, best_model, model_name, best_dataset = find_best_data(folder_loc, datasets, target)
pickle.dump(best_model, open('../../models/cheap_features/' + model_name + '_feature_selection.pkl', 'wb'))

100%|██████████| 6/6 [19:54<00:00, 199.07s/it]


ValueError: too many values to unpack (expected 4)

In [None]:
best_dataset


'forward_selection_AIC_ahi_c0h4a.csv'

In [None]:
df = pd.read_csv(f'../../data/interim/feature_selection/{best_dataset}')

In [None]:
target="ahi_c0h4a"

In [None]:
df["ahi_c0h4a"]

0        4.314248
1       23.406593
2        4.853556
3        2.591362
4        5.513514
          ...    
5799    21.669196
5800    17.901391
5801     5.625000
5802     6.971570
5803    29.411765
Name: ahi_c0h4a, Length: 5804, dtype: float64

In [None]:
features = df.columns.tolist()
features.remove(target)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

print(features)

['neck20', 'bmi_s1', 'age_s1', 'gender', 'diasbp', 'funres02', 'shhs1_tcvd', 'ess_s1', 'nonsp_st', 'hosnr02', 'benzod1', 'ccb1', 'nsrrid', 'shhs1_qc', 'twuweh02', 'race', 'estrgn1', 'waist', 'soda15', 'tea15', 'urdbpae', 'diuret1', 'pvdl1', 'height', 'weight', 'systbp']


In [None]:
print(model_name, best_mae)

ridge 5.747435889334489


In [None]:
# The best feature selection method is forward selection but the results are not as good as our simple feature selection