In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import os
import random
import numpy as np
from tqdm import tqdm
import pickle
import dataframe_image as dfi

# user defined methods
import sys

sys.path.append("../utils")

from modeling import split_data, train_model, find_best_data_each_model
from visualization import feature_selection_cheap_feature_comparison

# Seed Python's and NumPy's global random number generators with the same
# fixed value so that code drawing from them is repeatable across runs.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# supress warnings
import warnings

warnings.filterwarnings("ignore")

In [4]:
# Create the cheap_features directory under ../../models (including any
# missing parents); exist_ok=True makes re-running this cell a no-op.
models_dir = '../../models/cheap_features/'
os.makedirs(models_dir, exist_ok=True)

In [5]:
# Inputs for the model search over the cheap-feature datasets.
folder_loc = '../../data/processed/cheap_features/'
# List the directory through `folder_loc` rather than repeating the path
# literal, and sort the result: os.listdir returns entries in arbitrary order,
# which would make the order in which datasets are evaluated vary by machine.
datasets = sorted(os.listdir(folder_loc))
# NOTE(review): this target is 'ahi_c0h4', while the feature-selection run in
# this notebook uses 'ahi_c0h4a' -- confirm the two are meant to differ, since
# their MAEs are compared side by side later on.
target = 'ahi_c0h4'

In [6]:
# Run the project helper over every cheap-feature dataset for the chosen
# target. This is slow (the recorded run took ~19 minutes for 127 datasets).
# It reads folder_loc / datasets / target, which a later cell reassigns, so
# re-run the cheap-feature configuration cell before re-running this one.
best_models_cheap_features = find_best_data_each_model(
    folder_loc,
    datasets,
    target,
)

100%|██████████| 127/127 [19:26<00:00,  9.18s/it]


In [8]:
# Inputs for the model search over the feature-selection datasets.
# NOTE: this reassigns folder_loc, datasets and target from the cheap-feature
# configuration above; cells that read them depend on execution order.
folder_loc = '../../data/interim/feature_selection/'
target = 'ahi_c0h4a'
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the order in which datasets are evaluated vary by machine.
datasets = sorted(os.listdir(folder_loc))
# Keep only the files built for this target, i.e. whose names end with
# '<target>.csv'. The suffix is derived from `target` so the two cannot drift.
datasets_with_target = [
    dataset for dataset in datasets if dataset.endswith(f'{target}.csv')
]

In [9]:
# Run the same project helper, this time over the feature-selection datasets
# that were filtered down to the current target.
best_models_feature_selection = find_best_data_each_model(
    folder_loc,
    datasets_with_target,
    target,
)

100%|██████████| 6/6 [01:35<00:00, 15.89s/it]


In [15]:
# Build the side-by-side comparison of the two model searches (cheap features
# first, feature selection second) and show it as the cell's output.
cheap_vs_selected = (best_models_cheap_features, best_models_feature_selection)
table_visualization = feature_selection_cheap_feature_comparison(*cheap_vs_selected)
table_visualization

Unnamed: 0,Model,MAE for Cheap Features,Dataset for Cheap Features,MAE for Feature Selection,Dataset for Feature Selection
0,XGBoost,4.779168,Ant_Cli_Lif_Med_Tre.csv,6.219634,mutual_information_ahi_c0h4a.csv
1,random forest,4.730034,Ant_Lif.csv,5.960062,backward_selection_AIC_ahi_c0h4a.csv
2,linear regression,4.642372,Ant_Lif.csv,5.749217,forward_selection_AIC_ahi_c0h4a.csv
3,lasso regression,4.677656,Ant_Cli_Dem_Gen_Lif_Med_Tre.csv,5.814538,forward_selection_BIC_ahi_c0h4a.csv
4,ridge regression,4.635646,Ant_Lif.csv,5.747436,forward_selection_AIC_ahi_c0h4a.csv
5,decision tree,6.182289,Ant_Cli_Lif_Tre.csv,8.131265,decision_tree_ahi_c0h4a.csv


In [16]:
# Export the comparison table as a PNG image.
#
# Two fixes relative to the previous version of this cell:
#   * it referenced `styled_table`, which is not defined anywhere in the
#     visible notebook (NameError on a fresh kernel); the table built above is
#     `table_visualization`.
#   * it called os.chdir(), which changes the working directory for every
#     later relative path and fails when the cell is run twice. The file is
#     now written through an explicit path instead.
# NOTE(review): the directory name is spelled 'vizualizations' here; it is kept
# as-is -- confirm it matches the folder on disk.
output_dir = '../../vizualizations/'
output_path = os.path.join(
    output_dir, 'cheap features and feature selection comparison.png'
)

# Styler.hide() is needed to drop the index column. Accept either a plain
# DataFrame (wrap it via .style) or an object that is already a Styler.
comparison_styler = (
    table_visualization.style
    if isinstance(table_visualization, pd.DataFrame)
    else table_visualization
)
dfi.export(comparison_styler.hide(axis='index'), output_path, dpi=300)