In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
import pickle
import dataframe_image as dfi

# user defined methods
import sys

sys.path.append("../utils")

from modeling import find_best_data_each_model, find_all_mae_each_model
from visualization import model_mae_dataset_table, model_mae_dataset_table_all

# Seed the standard-library and NumPy global random generators with the same
# fixed value so that code drawing from them is repeatable across runs.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(0)

In [2]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Create the cheap-features model directory (and any missing parents);
# exist_ok=True makes this a no-op when the directory is already there.
models_dir = '../../models/cheap_features/'
os.makedirs(models_dir, exist_ok=True)

In [4]:
# Inputs for the cheap-features experiments: the folder holding the processed
# datasets, the entries found in it, and the name of the target column.
folder_loc = '../../data/processed/cheap_features/'
# os.listdir() returns entries in arbitrary, filesystem-dependent order; sort
# them so the dataset order (and therefore the row order of the result tables)
# is the same on every machine and every run. The listing reuses folder_loc so
# the path is defined in exactly one place.
datasets = sorted(os.listdir(folder_loc))
# NOTE(review): the feature-selection cell further down uses 'ahi_c0h4a'
# as its target - confirm that the differing target here is intentional.
target = 'ahi_c0h4'

In [5]:
# Run the best-dataset search over the cheap-feature datasets.
# The recorded progress bar shows 127 iterations (~12 minutes in total),
# so this is one of the slow cells of the notebook.
best_models_cheap_features = find_best_data_each_model(
    folder_loc, datasets, target
)

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [11:50<00:00,  5.60s/it]


In [6]:
# Collect the per-model MAE for every cheap-feature dataset.
# The recorded progress bar shows 127 iterations (~12 minutes in total).
# NOTE(review): this presumably repeats the work of the best-dataset search
# above - check whether one result could be derived from the other.
all_models_cheap_features = find_all_mae_each_model(
    folder_loc, datasets, target
)

100%|██████████| 127/127 [11:57<00:00,  5.65s/it]


In [7]:
# Inputs for the feature-selection experiments.
# NOTE: this cell rebinds folder_loc, datasets and target, which the
# cheap-features cells above also use. Re-run the cheap-features configuration
# cell before re-running any cheap-features modelling cell.
folder_loc = '../../data/interim/feature_selection/'
target = 'ahi_c0h4a'
# os.listdir() returns entries in arbitrary order; sort for reproducibility.
datasets = sorted(os.listdir(folder_loc))
# Keep only the files whose name ends with '<target>.csv'. The suffix is built
# from `target` so the filter and the target column cannot drift apart.
target_suffix = target + '.csv'
datasets_with_target = [name for name in datasets if name.endswith(target_suffix)]

In [8]:
# Run the best-dataset search over the feature-selection datasets that match
# the target (6 files according to the recorded progress bar, ~1 minute).
best_models_feature_selection = find_best_data_each_model(
    folder_loc, datasets_with_target, target
)

100%|██████████| 6/6 [00:59<00:00,  9.89s/it]


In [9]:
# Collect the per-model MAE for every feature-selection dataset that matches
# the target (6 files according to the recorded progress bar, ~1 minute).
all_models_feature_selection = find_all_mae_each_model(
    folder_loc,
    datasets_with_target,
    target,
)

100%|██████████| 6/6 [00:59<00:00,  9.92s/it]


In [10]:
# Build the summary table (Model / MAE / Dataset columns in the displayed
# output) from the best-dataset results of the cheap-features run.
cheap_features_table = model_mae_dataset_table(
    best_models_cheap_features,
    "cheap_feature",
)
# Bare last expression so the notebook renders the table.
cheap_features_table

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,4.810896,"Anthropometry, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"
1,Random Forest,4.686072,"Anthropometry, Clinical Data, Medical History"
2,Linear Regression,4.685196,"Anthropometry, Demographics, Lifestyle and Behavioral Health"
3,Lasso Regression,4.689135,"Anthropometry, Clinical Data, Demographics, General Health, Medical History, Sleep Treatment"
4,Ridge Regression,4.684644,"Anthropometry, Demographics, Lifestyle and Behavioral Health"
5,Decision Tree,6.025305,"Anthropometry, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"


In [11]:
# Build the table listing the MAE of every model on every cheap-feature
# dataset (Model / MAE / Dataset columns in the displayed output).
cheap_features_table_all = model_mae_dataset_table_all(
    all_models_cheap_features,
    "cheap_feature",
)
# Bare last expression so the notebook renders the table.
cheap_features_table_all

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,5.572799,"Clinical Data, Demographics, Lifestyle and Behavioral Health"
1,XGBoost,5.485372,"Clinical Data, Demographics, Medical History, Sleep Treatment"
2,XGBoost,5.378831,"Clinical Data, Demographics, General Health, Lifestyle and Behavioral Health, Medical History"
3,XGBoost,4.958477,"Anthropometry, Clinical Data"
4,XGBoost,5.748304,"Demographics, General Health, Medical History, Sleep Treatment"
5,XGBoost,5.627272,"Demographics, General Health, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"
6,XGBoost,5.638442,"Lifestyle and Behavioral Health, Medical History, Sleep Treatment"
7,XGBoost,5.575402,"Clinical Data, General Health, Sleep Treatment"
8,XGBoost,5.98835,General Health
9,XGBoost,5.144006,"Anthropometry, Clinical Data, General Health, Lifestyle and Behavioral Health, Medical History, Sleep Treatment"


In [12]:
# Build the summary table (Model / MAE / Dataset columns in the displayed
# output) from the best-dataset results of the feature-selection run.
feature_selection_table = model_mae_dataset_table(
    best_models_feature_selection,
    "feature_selection",
)
# Bare last expression so the notebook renders the table.
feature_selection_table

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,6.219634,Mutual Information
1,Random Forest,5.960062,Backward Selection
2,Linear Regression,5.749217,Forward Selection
3,Lasso Regression,5.814538,Forward Selection
4,Ridge Regression,5.747436,Forward Selection
5,Decision Tree,8.131265,Decision Tree


In [13]:
# Build the table listing the MAE of every model on every feature-selection
# dataset (Model / MAE / Dataset columns in the displayed output).
# NOTE(review): in the recorded output "Forward Selection" appears twice per
# model with different MAE values - possibly two files map to the same label;
# verify the dataset-name mapping in the table helper.
feature_selection_table_all = model_mae_dataset_table_all(
    all_models_feature_selection,
    "feature_selection",
)
# Bare last expression so the notebook renders the table.
feature_selection_table_all

Unnamed: 0,Model,MAE,Dataset
0,XGBoost,6.472459,Forward Selection
1,XGBoost,6.227025,Decision Tree
2,XGBoost,6.489891,Backward Selection
3,XGBoost,6.321379,Random Forest
4,XGBoost,6.219634,Mutual Information
5,XGBoost,6.504213,Forward Selection
6,Random Forest,6.001757,Forward Selection
7,Random Forest,5.997591,Decision Tree
8,Random Forest,5.960062,Backward Selection
9,Random Forest,5.989576,Random Forest


In [16]:
# Export the tables as images.
#
# The destination is passed to dfi.export() as an explicit path instead of
# calling os.chdir(): changing the working directory with a relative path makes
# the cell fail (or write somewhere else) when it is run a second time, and it
# silently changes what every other relative path in the notebook points to.
output_dir = "../../vizualizations/"
# Create the folder if needed so the export does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Output file name -> table to render, in the original export order.
tables_to_export = {
    "cheap feature performance.png": cheap_features_table,
    "feature selection performance.png": feature_selection_table,
    "cheap feature performance_all.png": cheap_features_table_all,
    "feature selection performance_all.png": feature_selection_table_all,
}

for file_name, table in tables_to_export.items():
    dfi.export(
        # .hide(axis="index") is called so the index is left out of the image
        # (presumably the tables are pandas Styler objects - confirm).
        table.hide(axis="index"),
        os.path.join(output_dir, file_name),
        dpi=300,
        max_rows=-1,  # Export all rows
    )