In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
import pickle

# user defined methods
import sys
sys.path.append('../utils')

from modeling import split_data, train_model, find_best_data
#import modeling

# Seed both Python's and NumPy's global RNGs so reruns draw the same numbers.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Make sure the output folder for pickled models exists (no error if it already does).
os.makedirs(name='../../models/cheap_features/', exist_ok=True)

# Models: Finding cheap features to predict Central Sleep Apnea

In [4]:
# Folder with the candidate datasets for the cheap-features search.
folder_loc = '../../data/processed/cheap_features/'
# os.listdir returns entries in arbitrary order; sort them so the search
# visits the datasets in the same order on every run and every machine.
datasets = sorted(os.listdir(folder_loc))
# Name of the target column to predict.
target = 'ahi_c0h4'

In [5]:
# Run the search over every candidate dataset (slow: the recorded run took ~23 minutes).
# The unpacked names suggest the helper returns the best MAE, the fitted model,
# the model's name and the dataset it was trained on - verify against utils/modeling.
best_mae, best_model, model_name, best_dataset = find_best_data(folder_loc, datasets, target)

# Save the winning model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('../../models/cheap_features/' + model_name + '.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [23:35<00:00, 11.15s/it]


In [6]:
# Report which model won the search and its MAE.
print(f"{model_name!s} {best_mae!s}")

ridge 4.635646174021972


In [7]:
# Display the feature names stored on the best model.
best_model.feature_names_in_

array(['hip', 'neck20', 'coffee15', 'tea15', 'soda15', 'evsmok15',
       'smknow15', 'asa15', 'smokstat_s1', 'bmi_s1', 'weight', 'waist',
       'height', 'weight20', 'nsrrid'], dtype=object)

In [8]:
# Load the SHHS variable data dictionary (one row per variable).
features_dict = pd.read_csv(
    '../../data/interim/shhs-data-dictionary-0.20.0-variables.csv'
)

In [9]:
# Restrict the data dictionary to the variables the best model actually uses.
features_dict = features_dict.loc[
    lambda frame: frame['id'].isin(best_model.feature_names_in_)
]


In [10]:
# Write the human-readable description of the selected features next to the notebook.
features_dict.loc[:, ['display_name', 'description', 'folder']].to_csv('./results.csv')

In [11]:
# Anthropometric features, together with Lifestyle and Behavioural features, are the most important features

# Hyperparameter tuning of the best model

In [12]:
# Load the saved Ridge model and the dataset used for tuning.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# artifacts written by this project.
with open('../../models/cheap_features/ridge.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)
df = pd.read_csv('../../data/processed/cheap_features/Ant_Lif.csv')

In [13]:
# Every column except the target is used as a predictor.
# NOTE(review): the fitted model's feature list shown earlier includes 'nsrrid',
# which looks like a subject identifier - confirm it should be a predictor.
features = list(df.columns)
features.remove(target)
# split_data returns the partitions in this (train, test, ..., val) order.
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [14]:
# Randomized hyperparameter search for Ridge, scored by (negated) mean absolute error.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

model = Ridge(random_state=1)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

# Distributions / choices sampled by the search.
param_grid = dict(
    alpha=uniform(0, 10),  # continuous draw between 0 and 10
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    random_state=1,
)
search.fit(X_train, y_train)

# Best parameters, their mean CV score, and the refitted estimator - one per line.
print(search.best_params_, search.best_score_, search.best_estimator_, sep='\n')
best_param = search.best_params_



{'alpha': 4.434528937795567, 'solver': 'sparse_cg'}
-4.723208539258343
Ridge(alpha=4.434528937795567, random_state=1, solver='sparse_cg')




In [15]:
# Refit Ridge with the tuned hyperparameters, then report MAE on the
# validation split (first line) and on the test split (second line).
model = Ridge(random_state=1, **best_param).fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae, mae_test, sep='\n')

4.625547550719264
4.569842143977


In [16]:
# 5-fold cross-validated (negated) MAE of the tuned Ridge on the training split.
from sklearn.model_selection import cross_val_score

# best_param holds exactly the 'alpha' and 'solver' entries chosen by the search.
model = Ridge(**best_param, random_state=1)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print(scores)
print(scores.mean())

[-4.75856807 -4.8232436  -4.75379321 -4.76452204 -4.51591528]
-4.7232084399072125


# Feature selection models

In [17]:
# Folder with the outputs of the feature-selection step, and the target to predict.
folder_loc = '../../data/interim/feature_selection/'
target = 'ahi_c0h4a'

# Keep only the files produced for this target. os.listdir returns entries in
# arbitrary order, so sort them to make the search order repeatable.
datasets = sorted(name for name in os.listdir(folder_loc) if target in name)

In [18]:
# Run the same search over the feature-selection datasets.
best_mae, best_model, model_name, best_dataset = find_best_data(folder_loc, datasets, target)

# Save the winning model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('../../models/cheap_features/' + model_name + '_feature_selection.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

100%|██████████| 6/6 [01:57<00:00, 19.58s/it]


In [19]:
# File name of the dataset that produced the best model in the search above.
best_dataset


'forward_selection_AIC_ahi_c0h4a.csv'

In [20]:
# Load the winning dataset from the folder that was searched, instead of
# repeating the path literal, so the two cannot drift apart.
df = pd.read_csv(os.path.join(folder_loc, best_dataset))

In [21]:
# Target column for the feature-selection experiments.
target = "ahi_c0h4a"

In [22]:
# Inspect the target column of the selected dataset.
df.loc[:, "ahi_c0h4a"]

0        4.314248
1       23.406593
2        4.853556
3        2.591362
4        5.513514
          ...    
5799    21.669196
5800    17.901391
5801     5.625000
5802     6.971570
5803    29.411765
Name: ahi_c0h4a, Length: 5804, dtype: float64

In [23]:
# Every column except the target is used as a predictor.
# NOTE(review): the printed list includes 'nsrrid', which looks like a subject
# identifier - confirm it should be a predictor.
features = list(df.columns)
features.remove(target)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

print(features)

['neck20', 'bmi_s1', 'age_s1', 'gender', 'diasbp', 'funres02', 'shhs1_tcvd', 'ess_s1', 'nonsp_st', 'hosnr02', 'benzod1', 'ccb1', 'nsrrid', 'shhs1_qc', 'twuweh02', 'race', 'estrgn1', 'waist', 'soda15', 'tea15', 'urdbpae', 'diuret1', 'pvdl1', 'height', 'weight', 'systbp']


In [24]:
# Report the best model and its MAE for the feature-selection datasets.
print(f"{model_name!s} {best_mae!s}")

ridge 5.747435889334489


In [25]:
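# The best feature-selection method is forward selection (AIC), but its MAE (5.75) is worse than
# that of our simple cheap-feature selection (4.64). Caveat: the two runs predict different targets
# ('ahi_c0h4a' here vs. 'ahi_c0h4' above), so the two MAEs are not directly comparable.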