# Feature Selection

In this notebook, we run various feature selection methods on the initial Sleep Heart Health Study dataset, using imputed values for the non-sleep-test variables. The resulting dataset from each feature selection method is then saved to the interim data folder.

## Import modules

In [1]:

import pandas as pd
import numpy as np
import random
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# seed Python's and NumPy's global RNGs up front; intended to make stochastic steps repeatable
random.seed(0)
np.random.seed(0)

# user defined methods
# '../utils' has to be on sys.path before the import below
# (feature_selection is presumably a module in that folder - TODO confirm)
import sys
sys.path.append('../utils')

from feature_selection import write_features_to_csv, get_features_from_model

# output folder passed to write_features_to_csv in the feature selection loop
filepath = "../../data/interim/feature_selection/"

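Before running the next cell, make sure 'shhs1-dataset-0.20.0.csv' is available in `../../data/raw/` and 'shhs-data-dictionary-0.20.0-variables.csv' is available in `../../data/interim/`.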

In [2]:
# both files are read with the same encoding and parser engine
read_options = dict(encoding='cp1252', engine='python')

# Sleep Heart Health Study dataset
shhs1 = pd.read_csv('../../data/raw/shhs1-dataset-0.20.0.csv', **read_options)

# Sleep Heart Health Study variable dictionary (the 'folder' and 'id' columns are used below)
var_dict = pd.read_csv(
    '../../data/interim/shhs-data-dictionary-0.20.0-variables.csv',
    **read_options,
)

In [3]:
# sanity check: (rows, columns) of shhs1 - expected 5804 rows, 1271 columns
shhs1.shape

(5804, 1271)

In [4]:
# sanity check: (rows, columns) of var_dict - expected 1896 rows, 11 columns
var_dict.shape

(1896, 11)

## Data Processing

In [5]:
# Sleep monitoring variables are unattainable at prediction time, so they must not be
# used as independent variables. Collect their ids from the data dictionary: every row
# whose 'folder' mentions "sleep monitoring" (case-insensitive; a missing folder counts as no match).
is_sleep_monitoring = var_dict['folder'].str.contains(r'sleep monitoring', case=False, na=False)
sleep_monitoring_col = var_dict.loc[is_sleep_monitoring, 'id']

In [6]:
# preview the variable ids flagged as sleep monitoring columns
sleep_monitoring_col

633        ahiov50
634     headbox_id
635        hrov150
636        hrund30
637     monitor_id
           ...    
1721      timest1p
1722       timest2
1723      timest2p
1724      timest34
1725          waso
Name: id, Length: 1093, dtype: object

In [7]:
# Build the independent-variable frame x from the raw data:
#   - drop the target ahi_c0h4a and pptidr (has numerical and categorical values in one column)
#   - drop every sleep monitoring column that is present in shhs1
# hf15 is not dropped here; a later cell reads it back from the imputed frame.
# One drop call with errors='ignore' skips ids that are absent from the frame and
# avoids re-copying the whole DataFrame once per column, as a per-column loop does.
x = (
    shhs1
    .drop(columns=['ahi_c0h4a', 'pptidr'])
    .drop(columns=list(sleep_monitoring_col), errors='ignore')
)

In [8]:
# preview x after the column drops in the previous cell (NaNs are still present at this point)
x

Unnamed: 0,nsrrid,pptid,ecgdate,lvh3_1,lvh3_3,st4_1_3,st5_1_3,lvhst,mob1,part2deg,...,educat,date02,date25,weight,waist,height,weight20,lang15,age_category_s1,visitnumber
0,200001,1,,,,,,,,,...,3.0,-48.0,0.0,65.0,86.0,178.0,69.0,1.0,7,1
1,200002,2,,,,,,,,,...,2.0,-111.0,-37.0,93.0,107.0,168.0,93.0,,9,1
2,200003,3,,,,,,,,,...,3.0,-7.0,0.0,51.0,82.0,145.0,50.7,1.0,9,1
3,200004,4,,,,,,,,,...,3.0,-24.0,0.0,64.0,85.0,180.0,65.4,1.0,6,1
4,200005,5,,,,,,,,,...,2.0,-89.0,-33.0,56.0,76.0,155.0,56.0,,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,205800,5835,-854.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,-168.0,1.0,70.5,99.0,166.0,70.8,1.0,8,1
5800,205801,5836,-755.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,-95.0,1.0,83.6,99.0,176.0,82.8,1.0,6,1
5801,205802,5837,-768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,-83.0,1.0,75.0,91.0,175.0,74.2,1.0,7,1
5802,205803,5838,-755.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,-89.0,0.0,76.8,93.0,176.0,78.6,1.0,6,1


In [9]:
# Impute the missing values in x with KNN.
# The columns are standardized first so that each one contributes on a comparable
# scale to the neighbor distance; the scaling is undone after imputation.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5)  # You can change the number of neighbors if needed

x_scaled = scaler.fit_transform(x)
x_imputed_scaled = imputer.fit_transform(x_scaled)

# back to the original units, as a DataFrame with x's column names
x_imputed = pd.DataFrame(
    scaler.inverse_transform(x_imputed_scaled),
    columns=x.columns,
)

# keep the (imputed) hf15 target column for the feature selection step
hf15 = x_imputed['hf15']

Open problems with KNN imputation: how many neighbors work best?
NaNs in categorical variables are imputed as well, but the result might not be interpretable (e.g., a 0/1 variable can end up with a fractional value such as 0.4).

In [10]:
# preview the imputed frame (same columns as x; all values are floats after imputation)
x_imputed

Unnamed: 0,nsrrid,pptid,ecgdate,lvh3_1,lvh3_3,st4_1_3,st5_1_3,lvhst,mob1,part2deg,...,educat,date02,date25,weight,waist,height,weight20,lang15,age_category_s1,visitnumber
0,200001.0,1.0,-748.2,0.4,0.6,0.2,2.000000e-01,0.2,0.0,0.0,...,3.0,-48.0,0.0,65.0,86.0,178.0,69.0,1.0,7.0,1.0
1,200002.0,2.0,-75.8,0.2,0.2,0.0,2.000000e-01,0.0,0.0,0.0,...,2.0,-111.0,-37.0,93.0,107.0,168.0,93.0,1.0,9.0,1.0
2,200003.0,3.0,-1071.8,0.2,0.0,0.0,4.000000e-01,0.0,0.0,0.0,...,3.0,-7.0,0.0,51.0,82.0,145.0,50.7,1.0,9.0,1.0
3,200004.0,4.0,-728.6,0.0,0.0,0.0,2.000000e-01,0.0,0.0,0.0,...,3.0,-24.0,0.0,64.0,85.0,180.0,65.4,1.0,6.0,1.0
4,200005.0,5.0,-241.8,0.2,0.0,0.0,2.775558e-17,0.0,0.0,0.0,...,2.0,-89.0,-33.0,56.0,76.0,155.0,56.0,1.0,8.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,205800.0,5835.0,-854.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,2.0,-168.0,1.0,70.5,99.0,166.0,70.8,1.0,8.0,1.0
5800,205801.0,5836.0,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,3.0,-95.0,1.0,83.6,99.0,176.0,82.8,1.0,6.0,1.0
5801,205802.0,5837.0,-768.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,4.0,-83.0,1.0,75.0,91.0,175.0,74.2,1.0,7.0,1.0
5802,205803.0,5838.0,-755.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,2.0,-89.0,0.0,76.8,93.0,176.0,78.6,1.0,6.0,1.0


## Feature selection

In [11]:
# DO NOT CHANGE THIS DICT UNLESS YOU ARE ADDING ANOTHER EXISTING FEATURE SELECTION METHOD. change selected methods if trying to test a subset of methods
# Maps each target variable to the feature selection methods available for it.
all_methods = {
    "ahi_c0h4a": [
        "decision tree",
        "random forest",
        "mutual information",
        "forward selection AIC",
        "forward selection BIC",
        "backward selection AIC",
    ],
    "hf15": [
        "MRMR 10",
        "MRMR 20",
        "random forest",
        "decision tree",
        "mutual information",
        "forward selection AIC",
        "forward selection BIC",
        "backward selection AIC",
    ],
}

In [12]:

# Methods to actually run, per target. Defaults to everything in all_methods;
# replace a value with a shorter list to test only a subset of methods.
selected_methods = dict(
    ahi_c0h4a=all_methods["ahi_c0h4a"],
    hf15=all_methods["hf15"],
)

In [None]:
# !!!: This block of code on all the feature selection methods (specifically backward selection) will take around 2 hours.

# Iterate through the chosen methods for every selected target and write the
# important features of each method to a CSV file.
# Looping over selected_methods itself (rather than a fixed target list) lets a
# subset that leaves out a target run without a KeyError.
for target in selected_methods:
    targets = [target]

    if target == "hf15":
        # hf15 was imputed together with the predictors: use the imputed column as y
        # and keep it out of the feature matrix. x_imputed itself is never modified,
        # so re-running this cell sees the same columns in the same order.
        y = hf15
        x_features = x_imputed.drop(columns=['hf15'], errors='ignore')

        # if target is heart failure, add nsrrid for preprocessing
        targets.append("nsrrid")
    else:
        y = shhs1[target]
        x_features = x_imputed

    print("target variable: ", target)
    for method in tqdm(selected_methods[target]):
        print(f'Beginning {method} on target {target}')
        # runs feature selection using the method given and returns either the importances or features (depending on the model)
        model_importances, model_features = get_features_from_model(method, x_features, y)

        # creates a new DataFrame using the important features from the model and writes the new DF to a CSV file in the given file path
        # (the call is kept out of the f-string so the file write is not hidden inside a print;
        # its return value is reported as the number of selected features)
        n_selected = write_features_to_csv(
            x_features, shhs1, method, targets, filepath, model_importances, model_features
        )
        print(f'# features selected using {method} on {target}: {n_selected}')

    print("\n\n")
        