In [139]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm

random.seed(0)
np.random.seed(0)

In [140]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [141]:
os.makedirs('../models/cheap_features/', exist_ok=True)

In [142]:
def split_data(data, features, target):
    """Split on test/train/val.

    Args:
        df (pd.DataFrame): input dataset
        features (list): list of features
        target (str): target column name
    """
    # Split on test/train 80/20
    # split startified on target
    X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.3, random_state=1)
    # make validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
    return X_train, X_test, y_train, y_test, X_val, y_val

# Models: Finding cheap features to predict Central Sleep Apnea

In [143]:
folder_loc = '../data/processed/cheap_features/'
datasets = os.listdir('../data/processed/cheap_features/')

In [144]:
target = 'ahi_c0h4'

In [149]:
def train_model(model, X_train, y_train, X_test, y_test, X_val, y_val):
    """Train model and return predictions.

    Args:
        model (sklearn model): model to train
        X_train (pd.DataFrame): training data
        y_train (pd.DataFrame): training labels
        X_test (pd.DataFrame): test data
        y_test (pd.DataFrame): test labels
        X_val (pd.DataFrame): validation data
        y_val (pd.DataFrame): validation labels
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Evaluaate
    mae = mean_absolute_error(y_val, y_pred)
    return mae, model

In [150]:
best_model = None
model_name = None
best_mae = 100000
best_dataset = None

for dataset in tqdm(datasets):
    # Load data
    df = pd.read_csv(folder_loc + dataset)

    features = df.columns.tolist()
    features.remove(target)

    # Split data
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # Train xgboost

    mae_xgb, model_xgb = train_model(XGBRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train random forest

    mae_rf, model_rf = train_model(RandomForestRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train linear regression

    mae_lr, model_lr = train_model(LinearRegression(), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train lasso

    mae_lasso, model_lasso = train_model(Lasso(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train ridge

    mae_ridge, model_ridge = train_model(Ridge(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Train decision tree

    mae_dt, model_dt = train_model(DecisionTreeRegressor(random_state=1), X_train, y_train, X_test, y_test, X_val, y_val)

    # Save best model

    if mae_xgb < best_mae:
        best_mae = mae_xgb
        best_model = model_xgb
        model_name = 'xgb'
        best_dataset = dataset
    if mae_rf < best_mae:
        best_mae = mae_rf
        best_model = model_rf
        model_name = 'rf'
        best_dataset = dataset
    if mae_lr < best_mae:
        best_mae = mae_lr
        best_model = model_lr
        model_name = 'lr'
        best_dataset = dataset
    if mae_lasso < best_mae:
        best_mae = mae_lasso
        best_model = model_lasso
        model_name = 'lasso'
        best_dataset = dataset
    if mae_ridge < best_mae:
        best_mae = mae_ridge
        best_model = model_ridge
        model_name = 'ridge'
        best_dataset = dataset
    if mae_dt < best_mae:
        best_mae = mae_dt
        best_model = model_dt
        model_name = 'dt'
        best_dataset = dataset

# Save model
import pickle
pickle.dump(best_model, open('../models/cheap_features/' + model_name + '.pkl', 'wb'))

100%|██████████| 127/127 [57:44<00:00, 27.28s/it] 


In [131]:
print(model_name, best_mae)

ridge 4.564480843250868


In [133]:
best_model.feature_names_in_

array(['hip', 'neck20', 'coffee15', 'tea15', 'soda15', 'evsmok15',
       'smknow15', 'asa15', 'surgtr02', 'o2thpy02', 'smokstat_s1',
       'bmi_s1', 'weight', 'waist', 'height', 'weight20', 'nsrrid'],
      dtype=object)

In [None]:
best_model.predict(X_val)


In [134]:
# Antropometric features and Lifestyle and Behavioural features are the most important features

# Hyperparemeter tuning of the best model

In [137]:
# do RandomizedSearchCV to find best hyperparameters
model = Ridge(random_state=1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

# Split data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
best_param = search.best_params_

InvalidParameterError: The 'scoring' parameter of RandomizedSearchCV must be a str among {'explained_variance', 'fowlkes_mallows_score', 'precision_macro', 'jaccard', 'f1', 'v_measure_score', 'neg_median_absolute_error', 'adjusted_mutual_info_score', 'roc_auc_ovo_weighted', 'matthews_corrcoef', 'mutual_info_score', 'average_precision', 'r2', 'recall_samples', 'neg_negative_likelihood_ratio', 'neg_brier_score', 'recall_weighted', 'top_k_accuracy', 'neg_mean_squared_error', 'normalized_mutual_info_score', 'neg_mean_absolute_percentage_error', 'positive_likelihood_ratio', 'neg_mean_absolute_error', 'jaccard_macro', 'f1_macro', 'completeness_score', 'rand_score', 'homogeneity_score', 'f1_micro', 'roc_auc_ovr_weighted', 'jaccard_weighted', 'f1_samples', 'precision_micro', 'jaccard_samples', 'f1_weighted', 'roc_auc_ovr', 'neg_root_mean_squared_error', 'jaccard_micro', 'adjusted_rand_score', 'neg_mean_poisson_deviance', 'precision_samples', 'accuracy', 'precision_weighted', 'recall_macro', 'recall', 'balanced_accuracy', 'max_error', 'precision', 'neg_mean_gamma_deviance', 'neg_mean_squared_log_error', 'roc_auc_ovo', 'recall_micro', 'neg_log_loss', 'roc_auc'}, a callable, an instance of 'list', an instance of 'tuple', an instance of 'dict' or None. Got 'mean_absolute_error' instead.

In [136]:
# test on validation set
model = Ridge(**best_param, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(mae)

5.193325587528965


In [None]:
# cross validation training
from sklearn.model_selection import cross_val_score
model = Ridge(alpha=best_param['alpha'], solver=best_param['solver'], random_state=1)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores)
print(scores.mean())

#