# Import modules

In [1]:

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


import random
import numpy as np

random.seed(0)
np.random.seed(0)

# user defined methods
import sys
sys.path.append('../utils')

from feature_selection import find_important_features, test_successful_import

# test that module was imported
test_successful_import()

feature_selection imported


upload 'shhs1-dataset-0.20.0.csv' and 'shhs-data-dictionary-0.20.0-variables.csv'

In [2]:
#read Sleep Heart Health Study Dataset

shhs1 = pd.read_csv('../../data/raw/shhs1-dataset-0.20.0.csv', encoding='cp1252', engine='python')
#read Sleep Heart Health Study Dictionary
var_dict = pd.read_csv('../../data/interim/shhs-data-dictionary-0.20.0-variables.csv', encoding='cp1252', engine='python')

In [3]:
#find the shape of shhs1 - 5804 rows, 1271 columns
shhs1.shape

(5804, 1271)

In [4]:
#find the shape of var_dict - 1896 rows, 11 columns
var_dict.shape

(1896, 11)

# Data Processing

In [5]:
# remove sleep monitoring columns from independent variables (unattinable for predicting)
sleep_monitoring_col = var_dict[var_dict['folder'].str.contains(r'sleep monitoring', case=False, na=False)]['id']

In [6]:
sleep_monitoring_col

633        ahiov50
634     headbox_id
635        hrov150
636        hrund30
637     monitor_id
           ...    
1721      timest1p
1722       timest2
1723      timest2p
1724      timest34
1725          waso
Name: id, Length: 1093, dtype: object

In [7]:
# drop target variables, pptidr (has numerical and categorical values in one column), and sleep monitoring columns in independent variable
x = shhs1.drop(columns=['ahi_c0h4a', 'pptidr'])
for col in sleep_monitoring_col:
    if col in x.columns:
        x = x.drop(columns=col)

y = shhs1['ahi_c0h4a']

In [8]:
x

Unnamed: 0,nsrrid,pptid,ecgdate,lvh3_1,lvh3_3,st4_1_3,st5_1_3,lvhst,mob1,part2deg,...,educat,date02,date25,weight,waist,height,weight20,lang15,age_category_s1,visitnumber
0,200001,1,,,,,,,,,...,3.0,-48.0,0.0,65.0,86.0,178.0,69.0,1.0,7,1
1,200002,2,,,,,,,,,...,2.0,-111.0,-37.0,93.0,107.0,168.0,93.0,,9,1
2,200003,3,,,,,,,,,...,3.0,-7.0,0.0,51.0,82.0,145.0,50.7,1.0,9,1
3,200004,4,,,,,,,,,...,3.0,-24.0,0.0,64.0,85.0,180.0,65.4,1.0,6,1
4,200005,5,,,,,,,,,...,2.0,-89.0,-33.0,56.0,76.0,155.0,56.0,,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,205800,5835,-854.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,-168.0,1.0,70.5,99.0,166.0,70.8,1.0,8,1
5800,205801,5836,-755.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,-95.0,1.0,83.6,99.0,176.0,82.8,1.0,6,1
5801,205802,5837,-768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,-83.0,1.0,75.0,91.0,175.0,74.2,1.0,7,1
5802,205803,5838,-755.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,-89.0,0.0,76.8,93.0,176.0,78.6,1.0,6,1


In [9]:
# normalize x
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

imputer = KNNImputer(n_neighbors=5)  # You can change the number of neighbors if needed
x_imputed_scaled = imputer.fit_transform(x_scaled)

# reverse scaling
x_imputed = scaler.inverse_transform(x_imputed_scaled)

# Convert the result back to DataFrame
x_imputed = pd.DataFrame(x_imputed, columns=x.columns)

problem with knn: how many neighbors are the best?
NaNs categorical variables will be imputed, but the result might not be intepretable

In [10]:
x_imputed

Unnamed: 0,nsrrid,pptid,ecgdate,lvh3_1,lvh3_3,st4_1_3,st5_1_3,lvhst,mob1,part2deg,...,educat,date02,date25,weight,waist,height,weight20,lang15,age_category_s1,visitnumber
0,200001.0,1.0,-748.2,0.4,0.6,0.2,2.000000e-01,0.2,0.0,0.0,...,3.0,-48.0,0.0,65.0,86.0,178.0,69.0,1.0,7.0,1.0
1,200002.0,2.0,-75.8,0.2,0.2,0.0,2.000000e-01,0.0,0.0,0.0,...,2.0,-111.0,-37.0,93.0,107.0,168.0,93.0,1.0,9.0,1.0
2,200003.0,3.0,-1071.8,0.2,0.0,0.0,4.000000e-01,0.0,0.0,0.0,...,3.0,-7.0,0.0,51.0,82.0,145.0,50.7,1.0,9.0,1.0
3,200004.0,4.0,-728.6,0.0,0.0,0.0,2.000000e-01,0.0,0.0,0.0,...,3.0,-24.0,0.0,64.0,85.0,180.0,65.4,1.0,6.0,1.0
4,200005.0,5.0,-241.8,0.2,0.0,0.0,2.775558e-17,0.0,0.0,0.0,...,2.0,-89.0,-33.0,56.0,76.0,155.0,56.0,1.0,8.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,205800.0,5835.0,-854.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,2.0,-168.0,1.0,70.5,99.0,166.0,70.8,1.0,8.0,1.0
5800,205801.0,5836.0,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,3.0,-95.0,1.0,83.6,99.0,176.0,82.8,1.0,6.0,1.0
5801,205802.0,5837.0,-768.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,4.0,-83.0,1.0,75.0,91.0,175.0,74.2,1.0,7.0,1.0
5802,205803.0,5838.0,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,2.0,-89.0,0.0,76.8,93.0,176.0,78.6,1.0,6.0,1.0


# Feature selection

## Target variable: ahi_c0h4a

### Decision Tree


In [11]:
#create decision tree
tree = DecisionTreeRegressor(random_state = 131)
tree.fit(x_imputed, y)

In [12]:
find_important_features(x_imputed, shhs1, tree, "decision_tree", "ahi_c0h4a")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_dataset[target_var] = shhs1[target_var]


Number of features: 254


In [13]:
#identify the most important features through decision trees
tree_importances = tree.feature_importances_
decision_tree_features = np.array(x_imputed.columns)[tree_importances > 0]
decision_tree_dataset = x_imputed[decision_tree_features]
decision_tree_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  decision_tree_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']


In [14]:
#end up with 254 features
len(decision_tree_features)

254

In [15]:
#write to csv
decision_tree_dataset.to_csv('../../data/interim/feature_selection/decision_tree_ahi_c0h4a.csv', index=False)

### Random Forest

In [16]:
#create a random forest with 100 "trees"
forest = RandomForestRegressor(n_estimators=100, random_state = 52)  # You can change the number of trees if needed
forest.fit(x_imputed, y)

In [17]:
find_important_features(x_imputed, shhs1, forest, "random_forest", "ahi_c0h4a")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_dataset[target_var] = shhs1[target_var]


Number of features: 279


In [18]:
#identify the most important features through random forest
forest_importances = forest.feature_importances_
random_forest_features = np.array(x_imputed.columns)[forest_importances > 0]
random_forest_dataset = x_imputed[decision_tree_features]
random_forest_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  random_forest_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']


In [21]:
#end up with 279 features
len(random_forest_features)

279

In [22]:
#write to csv
random_forest_dataset.to_csv('../../data/interim/feature_selection/random_forest_ahi_c0h4a.csv', index=False)

### Mutual Information

In [23]:
#create mutual information model
mi = mutual_info_regression(x_imputed, y, random_state = 568)

In [24]:
# Get feature names where mutual information is greater than 0
mi_features = np.array(x_imputed.columns)[mi > 0]

# Create a new dataset with only the features having non-zero mutual information
mi_dataset = x_imputed[mi_features]

# Add the 'ahi_c0h4a' column to the new dataset
mi_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mi_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']


In [25]:
#end up with 198 features
len(mi_features)

193

In [26]:
#write to csv
mi_dataset.to_csv('../../data/interim/feature_selection/mi_ahi_c0h4a.csv', index=False)

### Forward selection

In [27]:
#USING AIC
forward_selection_features = []

# Start with an arbitrarily large initial AIC
previous_AIC = float('inf')
AIC_change_threshold = 3  # Adjust this threshold value based on the scale of your AIC

#run forward selection
while len(forward_selection_features) < len(x_imputed.columns):
    remaining_features = list(set(x_imputed.columns) - set(forward_selection_features))
    best_feature_AIC = float('inf')
    best_feature = None

    for feature in remaining_features:
        candidate_features = forward_selection_features.copy()
        candidate_features.append(feature)

        x_forward = x_imputed[candidate_features]
        y_forward = y.copy()

        x_forward = sm.add_constant(x_forward)
        model = sm.OLS(y_forward, x_forward).fit()
        AIC = model.aic

        # Update the best feature if the current one is better (lower AIC)
        if AIC < best_feature_AIC:
            best_feature_AIC = AIC
            best_feature = feature

    # Break the loop if change in AIC is smaller than the threshold or if no significant decrease in AIC
    if abs(best_feature_AIC - previous_AIC) < AIC_change_threshold or best_feature is None:
        print("No significant improvement in AIC or no further significant feature found. Stopping forward selection.")
        break

    forward_selection_features.append(best_feature)
    print(f"Added feature: {best_feature}, AIC with Features: {best_feature_AIC}")

    # Update the previous AIC for the next iteration
    previous_AIC = best_feature_AIC


Added feature: neck20, AIC with Features: 41518.08129720545
Added feature: bmi_s1, AIC with Features: 41364.24158861602
Added feature: age_s1, AIC with Features: 41255.61741102298
Added feature: gender, AIC with Features: 41217.242718044195
Added feature: diasbp, AIC with Features: 41197.10638671003
Added feature: funres02, AIC with Features: 41178.68179985349
Added feature: shhs1_tcvd, AIC with Features: 41168.85552545749
Added feature: ess_s1, AIC with Features: 41159.22469341669
Added feature: nonsp_st, AIC with Features: 41150.241932690806
Added feature: hosnr02, AIC with Features: 41142.70588275835
Added feature: benzod1, AIC with Features: 41134.709576518544
Added feature: ccb1, AIC with Features: 41127.89408107898
Added feature: nsrrid, AIC with Features: 41120.29037287675
Added feature: shhs1_qc, AIC with Features: 41113.16689854111
Added feature: twuweh02, AIC with Features: 41107.11970492668
Added feature: race, AIC with Features: 41100.13620305085
Added feature: estrgn1, AIC

In [22]:
# Create a new dataset with only the features having non-zero mutual information
forward_selection_dataset = x_imputed[forward_selection_features]

# Add the 'ahi_c0h4a' column to the new dataset
forward_selection_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

#write to csv
forward_selection_dataset.to_csv('../../data/interim/feature_selection/forward_selection_ahi_c0h4a_AIC.csv', index=False)


In [27]:
print(len(forward_selection_features))

28


In [40]:
#USING BIC
forward_selection_features = []

# Start with an arbitrarily large initial BIC
previous_BIC = float('inf')
BIC_change_threshold = 2  # Adjust this threshold value based on the scale of your BIC

#run forward selection
while len(forward_selection_features) < len(x_imputed.columns):
    remaining_features = list(set(x_imputed.columns) - set(forward_selection_features))
    best_feature_BIC = float('inf')
    best_feature = None

    for feature in remaining_features:
        candidate_features = forward_selection_features.copy()
        candidate_features.append(feature)

        x_forward = x_imputed[candidate_features]
        y_forward = y.copy()

        x_forward = sm.add_constant(x_forward)
        model = sm.OLS(y_forward, x_forward).fit()
        BIC = model.bic

        # Update the best feature if the current one is better (lower BIC)
        if BIC < best_feature_BIC:
            best_feature_BIC = BIC
            best_feature = feature

    # Break the loop if change in BIC is smaller than the threshold or if no significant decrease in BIC
    if abs(best_feature_BIC - previous_BIC) < BIC_change_threshold or best_feature is None:
        print("No significant improvement in BIC or no further significant feature found. Stopping forward selection.")
        break

    forward_selection_features.append(best_feature)
    print(f"Added feature: {best_feature}, Mean Squared Error with Features: {best_feature_BIC}")

    # Update the previous MSE for the next iteration
    previous_BIC = best_feature_BIC


Added feature: neck20, Mean Squared Error with Features: 41531.41390243346
Added feature: bmi_s1, Mean Squared Error with Features: 41384.240496458035
Added feature: age_s1, Mean Squared Error with Features: 41282.282621479
Added feature: gender, Mean Squared Error with Features: 41250.57423111422
Added feature: diasbp, Mean Squared Error with Features: 41237.10420239405
Added feature: funres02, Mean Squared Error with Features: 41225.34591815152
Added feature: shhs1_tcvd, Mean Squared Error with Features: 41222.18594636952
Added feature: ess_s1, Mean Squared Error with Features: 41219.221416942724
Added feature: nonsp_st, Mean Squared Error with Features: 41216.90495883085
No significant improvement in BIC or no further significant feature found. Stopping forward selection.


In [28]:
# Create a new dataset with only the features having non-zero mutual information
forward_selection_dataset = x_imputed[forward_selection_features]

# Add the 'ahi_c0h4a' column to the new dataset
forward_selection_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

#write to csv
forward_selection_dataset.to_csv('../../data/interim/feature_selection/forward_selection_ahi_c0h4a_BIC.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forward_selection_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']


### Backward selection

In [29]:
# USING AIC

# Starting with all the features
backward_selection_features = list(x_imputed.columns)

# Start with an arbitrarily large initial AIC
previous_AIC = float('inf')
AIC_change_threshold = 1.8 # Adjust this threshold value based on the scale of your AIC

#run backward selection
while len(backward_selection_features) > 0:
    removed_feature_AIC = float('inf')
    feature_to_remove = None

    for feature in backward_selection_features:
        candidate_features = backward_selection_features.copy()
        candidate_features.remove(feature)
        
        x_backward = x_imputed[candidate_features]
        y_backward = y.copy()
        
        x_backward = sm.add_constant(x_backward)
        model = sm.OLS(y_backward, x_backward).fit()
        AIC = model.aic
        # Update the best feature to remove if the current one is better (lower MSE)
        if AIC < removed_feature_AIC:
            removed_feature_AIC = AIC
            feature_to_remove = feature

    # Break the loop if change in MSE is smaller than the threshold or if removing a feature increases the MSE
    if abs(removed_feature_AIC - previous_AIC) < AIC_change_threshold or removed_feature_AIC > previous_AIC or best_feature is None:
        print("No significant improvement in AIC or removing any more features deteriorates the model. Stopping backward selection.")
        break

    backward_selection_features.remove(feature_to_remove)
    print(f"Removed feature: {feature_to_remove}, AIC without Feature: {removed_feature_AIC}")

    # Update the previous MSE for the next iteration
    previous_AIC = removed_feature_AIC


Removed feature: infmi, AIC without Feature: 41279.19479672043
Removed feature: down25, AIC without Feature: 41277.195110526256
Removed feature: bmi_s1, AIC without Feature: 41275.19569786143
Removed feature: lvhst, AIC without Feature: 41273.1966214187
Removed feature: rawre_s1, AIC without Feature: 41271.197773243824
Removed feature: phacls25, AIC without Feature: 41269.198916199595
Removed feature: drive02, AIC without Feature: 41267.20086974731
Removed feature: beta1, AIC without Feature: 41265.20338491277
Removed feature: av3deg, AIC without Feature: 41263.20595960944
Removed feature: mi15, AIC without Feature: 41261.208966046244
Removed feature: shhs1_cf, AIC without Feature: 41259.2142429573
Removed feature: istrd1, AIC without Feature: 41257.22183061651
Removed feature: lah, AIC without Feature: 41255.23068344611
Removed feature: nrvous25, AIC without Feature: 41253.23928783782
Removed feature: ecgdate, AIC without Feature: 41251.24999745495
Removed feature: hlthlm25, AIC witho

In [30]:
# Create a new dataset with only the features having non-zero mutual information
backward_selection_dataset = x_imputed[backward_selection_features]

# Add the 'ahi_c0h4a' column to the new dataset
backward_selection_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  backward_selection_dataset['ahi_c0h4a'] = shhs1['ahi_c0h4a']


In [31]:
#write to csv
backward_selection_dataset.to_csv('../../data/interim/feature_selection/backward_selection_ahi_c0h4a_AIC.csv', index=False)

## Target variable: hf15

In [32]:
y = x_imputed['hf15']
x_imputed = x_imputed.drop(columns=['hf15'])

## MRMR


In [34]:
#MRMR
import pandas as pd
from sklearn.datasets import make_classification
from mrmr import mrmr_classif
selected_features_10 = mrmr_classif(X=x_imputed, y=y, K=10)
selected_features_20 = mrmr_classif(X=x_imputed, y=y, K=20)

100%|██████████| 10/10 [00:03<00:00,  2.62it/s]
100%|██████████| 20/20 [00:07<00:00,  2.64it/s]


In [35]:
# Create a new dataset with only the features having non-zero mutual information
selected_features_10_dataset = x_imputed[selected_features_10]
selected_features_20_dataset = x_imputed[selected_features_20]

# Add the 'ahi_c0h4a' column to the new dataset
selected_features_10_dataset['hf15'] = shhs1['hf15']
selected_features_20_dataset['hf15'] = shhs1['hf15']

selected_features_10_dataset['nsrrid'] = shhs1['nsrrid']
selected_features_20_dataset['nsrrid'] = shhs1['nsrrid']

selected_features_10_dataset.to_csv('../../data/interim/feature_selection/selected_features_10_hf15.csv', index=False)
selected_features_20_dataset.to_csv('../../data/interim/feature_selection/selected_features_20_hf15.csv', index=False)

## Random Forest

In [36]:
forest = RandomForestRegressor(n_estimators=100, random_state = 55)  # You can change the number of trees if needed
forest.fit(x_imputed, y)

In [37]:
find_important_features(x_imputed, shhs1, forest, "random_forest", "hf15")

Number of features: 259


In [38]:
forest_importances = forest.feature_importances_
random_forest_features = np.array(x_imputed.columns)[forest_importances > 0]
random_forest_dataset = x_imputed[random_forest_features]
random_forest_dataset['hf15'] = shhs1['hf15']

In [39]:
len(random_forest_features)

259

In [40]:
random_forest_dataset.to_csv('../../data/interim/feature_selection/random_forest_hf15.csv', index=False)

### Decision Tree

In [41]:
tree = DecisionTreeRegressor(random_state = 194)
tree.fit(x_imputed, y)

In [42]:
find_important_features(x_imputed, shhs1, tree, "decision_tree", "hf15")

Number of features: 99


In [43]:
tree_importances = tree.feature_importances_
decision_tree_features = np.array(x_imputed.columns)[tree_importances > 0]
decision_tree_dataset = x_imputed[decision_tree_features]
decision_tree_dataset['hf15'] = shhs1['hf15']

In [44]:
len(decision_tree_features)

99

In [45]:
decision_tree_dataset.to_csv('../../data/interim/feature_selection/decision_tree_hf15.csv', index=False)

### Mutual Information

In [46]:
mi = mutual_info_regression(x_imputed, y, random_state = 99)

In [47]:
# Get feature names where mutual information is greater than 0
mi_features = np.array(x_imputed.columns)[mi > 0]

# Create a new dataset with only the features having non-zero mutual information
mi_dataset = x_imputed[mi_features]

# Add the 'ahi_c0h4a' column to the new dataset
mi_dataset['hf15'] = shhs1['hf15']

In [48]:
len(mi_features)

162

In [49]:
mi_dataset.to_csv('../../data/interim/feature_selection/mi_hf15.csv', index=False)

### Forward Selection

In [50]:
#USING AIC
forward_selection_features = []

# Start with an arbitrarily large initial AIC
previous_AIC = float('inf')
AIC_change_threshold = 3  # Adjust this threshold value based on the scale of your AIC

#run forward selection
while len(forward_selection_features) < len(x_imputed.columns):
    remaining_features = list(set(x_imputed.columns) - set(forward_selection_features))
    best_feature_AIC = float('inf')
    best_feature = None

    for feature in remaining_features:
        candidate_features = forward_selection_features.copy()
        candidate_features.append(feature)

        x_forward = x_imputed[candidate_features]
        y_forward = y.copy()

        x_forward = sm.add_constant(x_forward)
        model = sm.OLS(y_forward, x_forward).fit()
        AIC = model.aic

        # Update the best feature if the current one is better (lower AIC)
        if AIC < best_feature_AIC:
            best_feature_AIC = AIC
            best_feature = feature

    # Break the loop if change in AIC is smaller than the threshold or if no significant decrease in AIC
    if abs(best_feature_AIC - previous_AIC) < AIC_change_threshold or best_feature is None:
        print("No significant improvement in AIC or no further significant feature found. Stopping forward selection.")
        break

    forward_selection_features.append(best_feature)
    print(f"Added feature: {best_feature}, AIC with Features: {best_feature_AIC}")

    # Update the previous AIC for the next iteration
    previous_AIC = best_feature_AIC


Added feature: ca15, AIC with Features: 10402.540699719797
Added feature: prev_hx_mi, AIC with Features: 10234.591701308265
Added feature: ntg1, AIC with Features: 10140.19491657621
Added feature: paced, AIC with Features: 10088.318130917713
Added feature: anar1a1, AIC with Features: 10053.87704350314
Added feature: copd15, AIC with Features: 10022.421325565636
Added feature: afib, AIC with Features: 9998.443109182313
Added feature: anar31, AIC with Features: 9979.172221253239
Added feature: sa15, AIC with Features: 9960.771618478764
Added feature: nitro15, AIC with Features: 9943.80104174349
Added feature: infmi, AIC with Features: 9932.463725929947
Added feature: yrsns15, AIC with Features: 9919.341738048402
Added feature: age_s1, AIC with Features: 9906.659544118535
Added feature: prev_hx_stroke, AIC with Features: 9898.180161970471
Added feature: re_s1, AIC with Features: 9889.76772114306
Added feature: shhs1_cf, AIC with Features: 9883.257109852257
Added feature: sob02, AIC with F

In [51]:
len(forward_selection_features)

24

In [52]:
# Create a new dataset with only the features having non-zero mutual information
forward_selection_dataset = x_imputed[forward_selection_features]

# Add the 'ahi_c0h4a' column to the new dataset
forward_selection_dataset['hf15'] = shhs1['hf15']

forward_selection_dataset['nsrrid'] = shhs1['nsrrid']

In [53]:
forward_selection_dataset.to_csv('../../data/interim/feature_selection/forward_selection_hf15.csv', index=False)

### Backward selection

In [54]:
# Starting with all the features
backward_selection_features = list(x_imputed.columns)

# Start with an arbitrarily large initial MSE
previous_mse = float('inf')
mse_change_threshold = 1e-4  # Adjust this threshold value based on the scale of your MSE

while len(backward_selection_features) > 0:
    removed_feature_mse = float('inf')
    feature_to_remove = None

    for feature in backward_selection_features:
        candidate_features = backward_selection_features.copy()
        candidate_features.remove(feature)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(x_imputed[candidate_features], y, test_size=0.2, random_state=42)

        # Fit a model with the selected features
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate the model without the current feature
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Update the best feature to remove if the current one is better (lower MSE)
        if mse < removed_feature_mse:
            removed_feature_mse = mse
            feature_to_remove = feature

    # Break the loop if change in MSE is smaller than the threshold or if removing a feature increases the MSE
    if abs(removed_feature_mse - previous_mse) < mse_change_threshold or removed_feature_mse > previous_mse:
        print("No significant improvement in MSE or removing any more features deteriorates the model. Stopping backward selection.")
        break

    backward_selection_features.remove(feature_to_remove)
    print(f"Removed feature: {feature_to_remove}, Mean Squared Error without Feature: {removed_feature_mse}")

    # Update the previous MSE for the next iteration
    previous_mse = removed_feature_mse

Removed feature: mh_s1, Mean Squared Error without Feature: 0.5080315179335119
Removed feature: rawrp_s1, Mean Squared Error without Feature: 0.42720555619003797
Removed feature: anar31, Mean Squared Error without Feature: 0.42081042879677577
Removed feature: prev_hx_mi, Mean Squared Error without Feature: 0.417418842668346
Removed feature: angina15, Mean Squared Error without Feature: 0.4148126211891606
Removed feature: asthma15, Mean Squared Error without Feature: 0.4124365154472934
Removed feature: othrcs15, Mean Squared Error without Feature: 0.41005198676846505
Removed feature: pcs_s1, Mean Squared Error without Feature: 0.4077665720718637
Removed feature: lvhst, Mean Squared Error without Feature: 0.4057659438974488
Removed feature: rawbp_s1, Mean Squared Error without Feature: 0.4038182776261818
Removed feature: nges02, Mean Squared Error without Feature: 0.4019860756218264
Removed feature: apbs, Mean Squared Error without Feature: 0.40013911561110715
Removed feature: beta1, Mea

In [55]:
# Create a new dataset with only the features having non-zero mutual information
backward_selection_dataset = x_imputed[backward_selection_features]

# Add the 'ahi_c0h4a' column to the new dataset
backward_selection_dataset['hf15'] = shhs1['hf15']

In [56]:
len(backward_selection_features)

209

In [57]:
backward_selection_dataset.to_csv('../../data/interim/feature_selection/backward_selection_hf15.csv', index=False)