In 4.1 we found that Ridge Regression is the best model for predicting Central Sleep Apnea. In this notebook we use Ridge Regression as the baseline model and compare its results when it is trained on the cheap features only with its results when it is trained on the feature sets produced by feature selection.

In [6]:
import os
import random

import numpy as np
import pandas as pd
from scipy.stats import randint, truncnorm, uniform
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Seed both the stdlib and the NumPy global random generators with the same
# value so that repeated runs of the notebook start from the same state.
SEED = 0
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [7]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [8]:
# Read the cheap-features table from disk and display it.
cheap_csv_path = '../data/processed/cheap_features/Ant_Cli_Dem_Gen_Lif_Med_Tre.csv'
df_cheap = pd.read_csv(cheap_csv_path)
df_cheap

Unnamed: 0,ecgdate,lvh3_1,lvh3_3,st4_1_3,st5_1_3,lvhst,mob1,part2deg,mob2,av3deg,...,bmi_s1,educat,date25,weight,waist,height,weight20,age_category_s1,nsrrid,ahi_c0h4
0,-1138.6,0.4,0.4,0.2,2.000000e-01,0.2,0.0,0.0,0.0,2.710505e-20,...,21.777553,3.0,0.0,65.0,86.0,178.0,69.0,7.0,200001,1.278296
1,-325.6,0.0,0.2,0.0,4.000000e-01,0.0,0.0,0.0,0.0,2.710505e-20,...,32.950680,2.0,-37.0,93.0,107.0,168.0,93.0,9.0,200002,14.505495
2,-1317.4,0.2,0.0,0.0,2.775558e-17,0.0,0.0,0.0,0.0,2.710505e-20,...,24.114150,3.0,0.0,51.0,82.0,145.0,50.7,9.0,200003,4.184100
3,-869.8,0.0,0.0,0.0,2.775558e-17,0.0,0.0,0.0,0.0,2.710505e-20,...,20.185185,3.0,0.0,64.0,85.0,180.0,65.4,6.0,200004,0.199336
4,-561.2,0.0,0.0,0.0,2.000000e-01,0.0,0.0,0.0,0.0,2.710505e-20,...,23.309053,2.0,-33.0,56.0,76.0,155.0,56.0,8.0,200005,2.756757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,-854.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,...,25.693134,2.0,1.0,70.5,99.0,166.0,70.8,8.0,205800,10.743551
5800,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,...,26.730372,3.0,1.0,83.6,99.0,176.0,82.8,6.0,205801,13.198483
5801,-768.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,...,24.228571,4.0,1.0,75.0,91.0,175.0,74.2,7.0,205802,2.019231
5802,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,...,25.374483,2.0,0.0,76.8,93.0,176.0,78.6,6.0,205803,1.186650


In [11]:
def split_data(data, features, target, test_size=0.3, val_size=0.2, random_state=1):
    """Split a dataset into train, validation and test subsets.

    The data is split in two steps: first into train/test, then the training
    part is split again into train/validation. With the default sizes this
    yields roughly 56% train, 14% validation and 30% test. No stratification
    is applied.

    Args:
        data (pd.DataFrame): input dataset
        features (list): names of the feature columns
        target (str): target column name
        test_size (float): fraction of ``data`` held out as the test set
        val_size (float): fraction of the remaining training data held out
            as the validation set
        random_state (int): seed passed to both splits

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, X_val, y_val)``. Note that
        the validation pair comes last.
    """
    # First cut: hold out the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[target], test_size=test_size, random_state=random_state
    )
    # Second cut: carve the validation set out of the remaining training data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, X_val, y_val

In [17]:
# Target column for the cheap-features experiment; every other column is
# used as a feature.
# NOTE(review): this keeps 'Unnamed: 0' and 'nsrrid' as features; they look
# like row/subject identifiers rather than predictors - TODO confirm.
target = 'ahi_c0h4'
features = df_cheap.columns.tolist()
features.remove(target)
# 70/30 train/test split, then 20% of the training part is held out for
# validation (not stratified). Uses the shared helper instead of repeating
# the two train_test_split calls by hand.
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df_cheap, features, target)

In [18]:
# Randomized hyper-parameter search for Ridge on the cheap features.
model = Ridge(random_state=1)

# Split data (train / test / validation)
X_train, X_test, y_train, y_test, X_val, y_val = split_data(df_cheap, features, target)

# 'alpha' is sampled from a continuous uniform distribution on [0, 10];
# 'solver' is sampled from the listed choices.
# NOTE(review): the features are passed to Ridge unscaled while 'sag'/'saga'
# are among the candidate solvers and warnings are silenced earlier in the
# notebook - convergence may be affected; TODO confirm.
param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

# 20 sampled candidates, 5-fold CV on the training split, scored by negative MAE.
search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
# Keep the winning parameters for the validation step.
best_param = search.best_params_



{'alpha': 8.78142503429413, 'solver': 'saga'}
-4.801120247188065
Ridge(alpha=8.78142503429413, random_state=1, solver='saga')


In [25]:
# Refit Ridge on the training split with the tuned hyper-parameters and
# report the mean absolute error on the validation split.
model = Ridge(random_state=1, **best_param).fit(X_train, y_train)
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print('MAE Score for Cheap Data: ' + str(mae))

MAE Score for Cheap Data: 4.697375338383396


In [35]:
# Tune and validate a Ridge model on each feature-selection output.
# NOTE(review): the target here is 'ahi_c0h4a' while the cheap-features run
# uses 'ahi_c0h4'; confirm the two MAE figures are meant to be compared.
directory_path = '../data/interim/feature_selection'
target = 'ahi_c0h4a'
# os.listdir returns entries in arbitrary order; sort them so the printed
# results appear in the same order on every run.
file_names = sorted(os.listdir(directory_path))

# Search space is identical for every file, so build it once.
param_grid = {
    'alpha': uniform(0, 10),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

# Tune models
all_mae = []
for file_name in file_names:
    # Only files whose name contains the target belong to this experiment.
    if target not in file_name:
        continue
    df = pd.read_csv(os.path.join(directory_path, file_name))
    features = df.columns.tolist()
    features.remove(target)

    # Split data (train / test / validation)
    X_train, X_test, y_train, y_test, X_val, y_val = split_data(df, features, target)

    # do RandomizedSearchCV to find best hyperparameters
    model = Ridge(random_state=1)
    search = RandomizedSearchCV(model, param_grid, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5, random_state=1)
    search.fit(X_train, y_train)
    best_param = search.best_params_

    # test on validation set
    model = Ridge(**best_param, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    all_mae.append(mae)
    print('MAE Score for ' + file_name + ': ' + str(mae))

# Guard against an empty result list so a missing/empty directory does not
# end in a ZeroDivisionError.
if all_mae:
    print('average MAE: ' + str(sum(all_mae) / len(all_mae)))
else:
    print('No feature selection files found for target ' + target)



MAE Score for decision_tree_ahi_c0h4a.csv: 5.8355797322642795




MAE Score for mi_ahi_c0h4a.csv: 5.821571522995246




MAE Score for random_forest_ahi_c0h4a.csv: 5.8355797322642795




MAE Score for forward_selection_ahi_c0h4a.csv: 5.812142902423194




MAE Score for backward_selection_ahi_c0h4a.csv: 5.833185805480809
average MAE: 5.827611939085561
