In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
from utils import *

random.seed(0)
np.random.seed(0)

In [2]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
os.makedirs('../models/cheap_features/', exist_ok=True)

# Models: Finding cheap features to predict Central Sleep Apnea

In [5]:
folder_loc = '../data/processed/cheap_features/'
datasets = os.listdir('../data/processed/cheap_features/')

In [6]:
target = 'ahi_c0h4'

In [None]:
best_model = None
model_name = None
best_mae = 100000
best_dataset = None
results = []
for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    features = df.columns.tolist()
    features.remove('nsrrid')
    # features.remove('weight')
    features.remove(target)

    print('Dataset: ', dataset)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train xgboost

    mae_xgb, model_xgb = train_model(XGBRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train random forest

    mae_rf, model_rf = train_model(RandomForestRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train linear regression

    mae_lr, model_lr = train_model(LinearRegression(), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train lasso

    mae_lasso, model_lasso = train_model(Lasso(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train ridge

    mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train decision tree

    mae_dt, model_dt = train_model(DecisionTreeRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Save best model

    if mae_xgb < best_mae:
        best_mae = mae_xgb
        best_model = model_xgb
        model_name = 'xgb'
        best_dataset = dataset
    if mae_rf < best_mae:
        best_mae = mae_rf
        best_model = model_rf
        model_name = 'rf'
        best_dataset = dataset
    if mae_lr < best_mae:
        best_mae = mae_lr
        best_model = model_lr
        model_name = 'lr'
        best_dataset = dataset
    if mae_lasso < best_mae:
        best_mae = mae_lasso
        best_model = model_lasso
        model_name = 'lasso'
        best_dataset = dataset
    if mae_ridge < best_mae:
        best_mae = mae_ridge
        best_model = model_ridge
        model_name = 'ridge'
        best_dataset = dataset
    if mae_dt < best_mae:
        best_mae = mae_dt
        best_model = model_dt
        model_name = 'dt'
        best_dataset = dataset

    results.append([dataset, mae_xgb, mae_rf, mae_lr, mae_lasso, mae_ridge, mae_dt])

    print('Best model: ', model_name)
    print('Best dataset: ', best_dataset)
    print('Best MAE: ', best_mae)

# Save model
results_df = pd.DataFrame(results, columns=['dataset', 'mae_xgb', 'mae_rf', 'mae_lr', 'mae_lasso', 'mae_ridge', 'mae_dt'])
import pickle
pickle.dump(best_model, open('../models/cheap_features/' + model_name + '.pkl', 'wb'))

In [None]:
results_df.to_csv('../models/cheap_features/results.csv', index=False)

In [10]:
import pandas as pd

results_df = pd.read_csv('../models/cheap_features/results.csv')

In [11]:
# sort the results by best value in the row
# make a column with the best value fron the row
results_df['best'] = results_df.iloc[:, 1:].min(axis=1)
results_df = results_df.sort_values(by=['best'])

In [12]:
# reindex
results_df = results_df.reset_index(drop=True)

In [13]:
results_df.head()

Unnamed: 0,dataset,mae_xgb,mae_rf,mae_lr,mae_lasso,mae_ridge,mae_dt,best
0,Ant_Cli_Med_Tre.csv,5.174459,4.663704,4.885938,4.763845,4.876071,6.218267,4.663704
1,Ant_Dem_Lif.csv,5.133372,4.913998,4.685755,4.714636,4.685201,7.192445,4.685201
2,Ant_Cli_Dem_Gen_Med_Tre.csv,4.981486,4.854411,1977.889,4.689888,4.926936,6.849647,4.689888
3,Ant_Cli_Dem_Gen_Med.csv,5.008574,4.849822,2449.117,4.689888,4.917609,6.80326,4.689888
4,Ant_Cli_Dem_Gen_Lif.csv,5.001279,4.85073,1136898.0,4.690832,4.789473,6.67349,4.690832


In [19]:
# Define the abbreviation mapping
abbreviations = {'Anthropometry': 'Ant',
                 'Clinical Data': 'Cli',
                 'Demographics': 'Dem',
                 'General Health': 'Gen',
                 'Lifestyle and Behavioral Health': 'Lif',
                 'Medical History': 'Med',
                 'Sleep Treatment': 'Tre'}

# Inverse the abbreviation mapping
abbreviations = {v: k for k, v in abbreviations.items()}

# Substitute the dataset abbreviations in the dataset column of results_df with the abbreviation mapping
results_df['dataset'] = [[', '.join([abbreviations.get(part, part) for part in x.split('.')[0].split('_')]) for x in row] for row in results_df['dataset'].str.split(', ')]

# Print the updated results_df
# print(results_df['dataset'])


In [22]:
results_df.to_csv('../models/cheap_features/results.csv', index=False)

In [29]:
df = pd.read_csv(folder_loc + dataset)

features = df.columns.tolist()
features.remove('nsrrid')
# features.remove('weight')
features.remove(target)

print('Dataset: ', dataset)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

Dataset:  Ant_Dem_Lif.csv


In [30]:
import pickle
pickle.dump(model_ridge, open('../models/cheap_features/' + 'ridge_2' + '.pkl', 'wb'))

In [31]:
mae_ridge

4.685200668664608

In [None]:
best_model.feature_names_in_

In [None]:
features_dict = pd.read_csv('../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [None]:
# keep only the rows where id == best_model.feature_names_in_
features_dict = features_dict[features_dict['id'].isin(best_model.feature_names_in_)]


In [None]:
features_dict[['display_name', 'description', 'folder']].to_csv('./results.csv')

In [None]:
# Anthropometric features and Lifestyle and Behavioural features are the most important features

# Hyperparemeter tuning of the best model

In [23]:
# load data and model
import pickle

best_model = pickle.load(open('../models/cheap_features/ridge_2.pkl', 'rb'))
df = pd.read_csv('../data/processed/cheap_features/Ant_Dem_Lif.csv')

In [24]:
features = df.columns.tolist()
features.remove(target)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [25]:
# do RandomizedSearchCV to find best hyperparameters
model = Ridge(random_state=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
best_param = search.best_params_

{'alpha': 9.325573593386588, 'solver': 'svd'}
-4.7601723443325366
Ridge(alpha=9.325573593386588, random_state=1, solver='svd')


In [26]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae)
print(mae_test)

4.681305543662659
4.5422440981777115


In [27]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#

[-4.75662132 -4.93809086 -4.7319471  -4.78327317 -4.59092926]
-4.7601723443325366


In [28]:
# save model
import pickle
pickle.dump(model, open('../models/cheap_features/ridge_2.pkl', 'wb'))

# Feature selection models

In [None]:
datasets = os.listdir('../data/interim/feature_selection')

In [None]:
datasets = [dataset for dataset in datasets if 'ahi_c0h4a' in dataset]

In [None]:
folder_loc = '../data/interim/feature_selection/'

In [None]:
target = 'ahi_c0h4'

In [None]:
best_model = None
model_name = None
best_mae = 100000
best_dataset = None

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    features = df.columns.tolist()
    features.remove(target)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train xgboost

    mae_xgb, model_xgb = train_model(XGBRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train random forest

    mae_rf, model_rf = train_model(RandomForestRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train linear regression

    mae_lr, model_lr = train_model(LinearRegression(), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train lasso

    mae_lasso, model_lasso = train_model(Lasso(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train ridge

    mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train decision tree

    mae_dt, model_dt = train_model(DecisionTreeRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Save best model

    if mae_xgb < best_mae:
        best_mae = mae_xgb
        best_model = model_xgb
        model_name = 'xgb'
        best_dataset = dataset
    if mae_rf < best_mae:
        best_mae = mae_rf
        best_model = model_rf
        model_name = 'rf'
        best_dataset = dataset
    if mae_lr < best_mae:
        best_mae = mae_lr
        best_model = model_lr
        model_name = 'lr'
        best_dataset = dataset
    if mae_lasso < best_mae:
        best_mae = mae_lasso
        best_model = model_lasso
        model_name = 'lasso'
        best_dataset = dataset
    if mae_ridge < best_mae:
        best_mae = mae_ridge
        best_model = model_ridge
        model_name = 'ridge'
        best_dataset = dataset
    if mae_dt < best_mae:
        best_mae = mae_dt
        best_model = model_dt
        model_name = 'dt'
        best_dataset = dataset

# Save model
import pickle
pickle.dump(best_model, open('../models/cheap_features/' + model_name + '_feature_selection.pkl', 'wb'))

In [None]:
best_dataset

In [None]:
df = pd.read_csv(f'../data/interim/feature_selection/{best_dataset}')

In [None]:
features = df.columns.tolist()
features.remove(target)

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [None]:
print(model_name, best_mae)

In [None]:
# The best feature selection method is Decision Tree but the results are not as good as our simple feature selection

# Hyperparemeter tuning of the best model

In [None]:
# load data and model
import pickle

best_model = pickle.load(open('../models/cheap_features/lasso_feature_selection.pkl', 'rb'))
df = pd.read_csv('../data/processed/cheap_features/Ant_Lif.csv')

In [None]:
features = df.columns.tolist()
features.remove(target)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

In [None]:
# do RandomizedSearchCV to find best hyperparameters
model = Ridge(random_state=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
best_param = search.best_params_

In [None]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(mae)
print(mae_test)

In [None]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#