In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm


# user defined methods
import sys
sys.path.append('../utils')

from modeling import split_data, train_model

random.seed(0)
np.random.seed(0)

In [4]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [10]:
os.makedirs('../../models/cheap_features/', exist_ok=True)

# Models: Finding cheap features to predict Central Sleep Apnea

In [7]:
folder_loc = '../../data/processed/cheap_features/'
datasets = os.listdir('../../data/processed/cheap_features/')

In [8]:
target = 'ahi_c0h4'

In [11]:
best_model = None
model_name = None
best_mae = 100000
best_dataset = None

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    features = df.columns.tolist()
    features.remove(target)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train xgboost

    mae_xgb, model_xgb = train_model(XGBRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train random forest

    mae_rf, model_rf = train_model(RandomForestRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train linear regression

    mae_lr, model_lr = train_model(LinearRegression(), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train lasso

    mae_lasso, model_lasso = train_model(Lasso(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train ridge

    mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train decision tree

    mae_dt, model_dt = train_model(DecisionTreeRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Save best model

    if mae_xgb < best_mae:
        best_mae = mae_xgb
        best_model = model_xgb
        model_name = 'xgb'
        best_dataset = dataset
    if mae_rf < best_mae:
        best_mae = mae_rf
        best_model = model_rf
        model_name = 'rf'
        best_dataset = dataset
    if mae_lr < best_mae:
        best_mae = mae_lr
        best_model = model_lr
        model_name = 'lr'
        best_dataset = dataset
    if mae_lasso < best_mae:
        best_mae = mae_lasso
        best_model = model_lasso
        model_name = 'lasso'
        best_dataset = dataset
    if mae_ridge < best_mae:
        best_mae = mae_ridge
        best_model = model_ridge
        model_name = 'ridge'
        best_dataset = dataset
    if mae_dt < best_mae:
        best_mae = mae_dt
        best_model = model_dt
        model_name = 'dt'
        best_dataset = dataset

# Save model
import pickle
pickle.dump(best_model, open('../../models/cheap_features/' + model_name + '.pkl', 'wb'))

100%|██████████| 127/127 [20:17<00:00,  9.59s/it]


In [12]:
print(model_name, best_mae)

ridge 4.635646174021972


In [13]:
best_model.feature_names_in_

array(['hip', 'neck20', 'coffee15', 'tea15', 'soda15', 'evsmok15',
       'smknow15', 'asa15', 'smokstat_s1', 'bmi_s1', 'weight', 'waist',
       'height', 'weight20', 'nsrrid'], dtype=object)

In [14]:
features_dict = pd.read_csv('../../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [15]:
# keep only the rows where id == best_model.feature_names_in_
features_dict = features_dict[features_dict['id'].isin(best_model.feature_names_in_)]


In [16]:
features_dict[['display_name', 'description', 'folder']].to_csv('./results.csv')

In [134]:
# Antropometric features and Lifestyle and Behavioural features are the most important features

# Hyperparameter tuning of the best model

In [18]:
# load data and model
import pickle

best_model = pickle.load(open('../../models/cheap_features/ridge.pkl', 'rb'))
df = pd.read_csv('../../data/processed/cheap_features/Ant_Lif.csv')

In [19]:
features = df.columns.tolist()
features.remove(target)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [20]:
# do RandomizedSearchCV to find best hyperparameters
model = Ridge(random_state=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
best_param = search.best_params_



{'alpha': 4.434528937795567, 'solver': 'sparse_cg'}
-4.723208539258343
Ridge(alpha=4.434528937795567, random_state=1, solver='sparse_cg')




In [21]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae)
print(mae_test)

4.625547550719264
4.569842143977


In [22]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#

[-4.75856807 -4.8232436  -4.75379321 -4.76452204 -4.51591528]
-4.7232084399072125


# Feature selection models

In [23]:
datasets = os.listdir('../../data/interim/feature_selection')

In [24]:
datasets = [dataset for dataset in datasets if 'ahi_c0h4a' in dataset]

In [25]:
folder_loc = '../../data/interim/feature_selection/'

In [26]:
target = 'ahi_c0h4a'

In [28]:
best_model = None
model_name = None
best_mae = 100000
best_dataset = None

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    features = df.columns.tolist()
    features.remove(target)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train xgboost

    mae_xgb, model_xgb = train_model(XGBRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train random forest

    mae_rf, model_rf = train_model(RandomForestRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train linear regression

    mae_lr, model_lr = train_model(LinearRegression(), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train lasso

    mae_lasso, model_lasso = train_model(Lasso(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train ridge

    mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train decision tree

    mae_dt, model_dt = train_model(DecisionTreeRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Save best model

    if mae_xgb < best_mae:
        best_mae = mae_xgb
        best_model = model_xgb
        model_name = 'xgb'
        best_dataset = dataset
    if mae_rf < best_mae:
        best_mae = mae_rf
        best_model = model_rf
        model_name = 'rf'
        best_dataset = dataset
    if mae_lr < best_mae:
        best_mae = mae_lr
        best_model = model_lr
        model_name = 'lr'
        best_dataset = dataset
    if mae_lasso < best_mae:
        best_mae = mae_lasso
        best_model = model_lasso
        model_name = 'lasso'
        best_dataset = dataset
    if mae_ridge < best_mae:
        best_mae = mae_ridge
        best_model = model_ridge
        model_name = 'ridge'
        best_dataset = dataset
    if mae_dt < best_mae:
        best_mae = mae_dt
        best_model = model_dt
        model_name = 'dt'
        best_dataset = dataset

# Save model
import pickle
pickle.dump(best_model, open('../../models/cheap_features/' + model_name + '_feature_selection.pkl', 'wb'))

100%|██████████| 6/6 [01:34<00:00, 15.78s/it]


In [29]:
best_dataset

'forward_selection_ahi_c0h4a_AIC.csv'

In [30]:
df = pd.read_csv(f'../../data/interim/feature_selection/{best_dataset}')

In [31]:
features = df.columns.tolist()
features.remove(target)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [32]:
print(model_name, best_mae)

ridge 5.815896127964164


In [35]:
# The best feature selection method is forward selection but the results are not as good as our simple feature selection

In [37]:
# load data and model
import pickle

best_model = pickle.load(open('../../models/cheap_features/ridge_feature_selection.pkl', 'rb'))
df = pd.read_csv('../../data/processed/cheap_features/Ant_Lif.csv')

In [42]:
features = df.columns.tolist()
features.remove(target)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

KeyError: 'ahi_c0h4a'

In [40]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae)
print(mae_test)

5.8025788737668
5.853749233904157


In [41]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#

[-6.02496851 -6.04835237 -5.84749582 -5.9696119  -5.75898685]
-5.92988309020622
