In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
import pickle
import dataframe_image as dfi

# user defined methods
import sys

sys.path.append("../utils")

from modeling import split_data, train_model, find_best_data_each_model
from visualization import model_mae_dataset_table

# Seed Python's `random` module and NumPy's global generator with one shared value.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Create the cheap-features model directory (and any missing parents);
# exist_ok=True means an already-existing directory is not an error.
os.makedirs(name='../../models/cheap_features/', exist_ok=True)

In [4]:
# Inputs for the cheap-feature search: the folder to read from, the entries
# found in it, and the target name handed to find_best_data_each_model.
folder_loc = '../../data/processed/cheap_features/'
# os.listdir returns entries in arbitrary order; sort them so the datasets are
# processed in the same order on every machine and every re-run. The listing
# reuses folder_loc so the path is written in exactly one place.
datasets = sorted(os.listdir(folder_loc))
# NOTE(review): the feature-selection cell below uses 'ahi_c0h4a' as its
# target -- confirm that the two experiments are meant to use different targets.
target = 'ahi_c0h4'

In [5]:
# Run the project helper over every cheap-feature dataset. The recorded
# progress bar shows 127 iterations taking roughly 19.5 minutes.
# NOTE(review): presumably returns the best-scoring dataset per model type --
# confirm against utils/modeling.py.
best_models_cheap_features = find_best_data_each_model(
    folder_loc,
    datasets,
    target,
)

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [19:35<00:00,  9.26s/it]


In [6]:
# Inputs for the feature-selection search: folder, target name, and the
# subset of files in the folder that belong to that target.
folder_loc = '../../data/interim/feature_selection/'
target = 'ahi_c0h4a'
# os.listdir returns entries in arbitrary order; sort them so the datasets are
# processed in the same order on every machine and every re-run.
datasets = sorted(os.listdir(folder_loc))
# Keep only the files whose name ends with '<target>.csv'. The suffix is
# derived from `target` so the filter and the target cannot drift apart.
target_suffix = target + '.csv'
datasets_with_target = [name for name in datasets if name.endswith(target_suffix)]

In [7]:
# Same helper as for the cheap features, restricted to the feature-selection
# files that matched the target filter (6 iterations in the recorded output).
best_models_feature_selection = find_best_data_each_model(
    folder_loc,
    datasets_with_target,
    target,
)

100%|██████████| 6/6 [01:35<00:00, 15.91s/it]


In [11]:
# Turn the cheap-feature results into the Model / MAE / Dataset table shown
# below; the bare name on the last line makes the notebook render it.
cheap_features_table = model_mae_dataset_table(
    best_models_cheap_features,
    "cheap_feature",
)
cheap_features_table

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,4.779168,"Anthropometry, Clinical Data, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"
1,Random Forest,4.730034,"Anthropometry, Lifestyle and Behavioral Health"
2,Linear Regression,4.642372,"Anthropometry, Lifestyle and Behavioral Health"
3,Lasso Regression,4.677656,"Anthropometry, Clinical Data, Demographics, General Health, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"
4,Ridge Regression,4.635646,"Anthropometry, Lifestyle and Behavioral Health"
5,Decision Tree,6.182289,"Anthropometry, Clinical Data, Lifestyle and Behavioral Health, Sleep Treatment"


In [12]:
# Turn the feature-selection results into the Model / MAE / Dataset table
# shown below; the bare name on the last line makes the notebook render it.
feature_selection_table = model_mae_dataset_table(
    best_models_feature_selection,
    "feature_selection",
)
feature_selection_table

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,6.219634,Mutual Information
1,Random Forest,5.960062,Backward Selection
2,Linear Regression,5.749217,Forward Selection
3,Lasso Regression,5.814538,Forward Selection
4,Ridge Regression,5.747436,Forward Selection
5,Decision Tree,8.131265,Decision Tree


In [13]:
# Export both summary tables as 300-dpi PNG images.
# The output paths are built explicitly instead of calling os.chdir: changing
# the working directory with a relative path makes the cell non-idempotent
# (a second run resolves "../../vizualizations/" from the already-changed
# directory) and breaks every other relative path used in this notebook.
viz_dir = "../../vizualizations/"
# Same pattern as the models directory: create the folder if it is missing.
os.makedirs(viz_dir, exist_ok=True)

dfi.export(
    cheap_features_table.hide(axis="index"),  # drop the index column from the image
    os.path.join(viz_dir, "cheap feature performance.png"),
    dpi=300,
)
dfi.export(
    feature_selection_table.hide(axis="index"),  # drop the index column from the image
    os.path.join(viz_dir, "feature selection performance.png"),
    dpi=300,
)