# Data Preprocessing

In this notebook we preprocess the datasets that will be used for training the models for finding a new threshold for CSA diagnosis. We will use 50 different threholds indicator variables allong with the feature selection variables. We will try to predict if a patient had heart failure or not. The dataset then will yield the best results will be our new proposed threshold

## Imports

In [4]:
import pandas as pd
import os
import random
import numpy as np

random.seed(0)
np.random.seed(0)

pd.options.mode.chained_assignment = None  # default='warn'

# user defined methods
import sys
sys.path.append('../utils')

from preprocessing import subsets, write_ahi_to_csv, generate_subset_dataset

In [5]:
feature_selection_folder = '../../data/interim/feature_selection/'

In [6]:
feature_selection_datasets = os.listdir(feature_selection_folder)

In [7]:
feature_selection_datasets

['decision_tree_ahi_c0h4a.csv',
 'random_forest_ahi_c0h4a.csv',
 'random_forest_hf15.csv',
 'decision_tree_hf15.csv',
 'MRMR_10_hf15.csv',
 'MRMR_20_hf15.csv',
 'backward_selection_AIC_ahi_c0h4a.csv',
 'backward_selection_AIC_hf15.csv',
 'forward_selection_AIC_ahi_c0h4a.csv',
 'forward_selection_AIC_hf15.csv',
 'forward_selection_BIC_ahi_c0h4a.csv',
 'forward_selection_BIC_hf15.csv',
 'mutual_information_ahi_c0h4a.csv',
 'mutual_information_hf15.csv']

In [5]:
shhs1 = pd.read_csv('../../data/raw/shhs1-dataset-0.20.0.csv', encoding='cp1252', engine='python')

In [6]:
shhs1 = shhs1[['nsrrid', 'ahi_c0h4', 'ahi_o0h4', 'ahi_a0h4']]

In [7]:
shhs1.shape

(5804, 4)

In [8]:
processed_filepath = '../../data/processed/threshold/'
os.makedirs(processed_filepath, exist_ok=True) 

## Preprocessing for Threshold Prediction

In [9]:
print(list(filter(lambda dataset: 'hf15' in dataset, feature_selection_datasets)))

['backward_selection_AIC_hf15.csv', 'forward_selection_AIC_hf15.csv', 'MRMR_20_hf15.csv', 'random_forest_hf15.csv', 'MRMR_10_hf15.csv', 'forward_selection_BIC_hf15.csv', 'mutual_information_hf15.csv', 'decision_tree_hf15.csv']


In [10]:
# Make different datasets with a dummy variable for each threshold
for dataset in list(filter(lambda dataset: 'hf15' in dataset, feature_selection_datasets)):
    hf = pd.read_csv(feature_selection_folder + dataset)
    
    hf_ahi = pd.merge(hf, shhs1, on='nsrrid', how='inner')
    hf_ahi = hf_ahi[hf_ahi['hf15'] != 8]
    
    # Calculates AHI given a range of thresholds and writes them to a CSV file
    for threshold in range(1, 10):
        for threshold_2 in range(1, 5):
            write_ahi_to_csv(hf_ahi.copy(), threshold, threshold_2, processed_filepath, dataset)

## Preprocessing for Cheap Feature Selection

In [8]:
os.makedirs('../../data/processed/cheap_features', exist_ok=True)

In [9]:
features_df = pd.read_csv('../../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [10]:
features_df.head(3)

Unnamed: 0,folder,id,display_name,description,type,units,domain,labels,calculation,commonly_used,forms
0,Administrative/Interim,calldt,Interim Follow-up: Days from index date to call,,numeric,days from index date,,interim_shhs,,,
1,Administrative/Interim,cmplbp,Interim Follow-up: Completed blood pressure (BP),,choices,,complete2,interim_shhs,,,
2,Administrative/Interim,cmplcvd,Interim Follow-up: Completed cardiovascular di...,,choices,,complete4,interim_shhs,,,


In [11]:
cheap_features = ['Anthropometry', 'Clinical Data', 'Demographics', 'General Health', 'Lifestyle and Behavioral Health', 'Medical History', 'Sleep Treatment']
# cheap_features = ['Anthropometry', 'Demographics', 'General Health', 'Lifestyle and Behavioral Health', 'Medical History', 'Sleep Treatment']

In [12]:
abbreviations = { 'Anthropometry': 'Ant',
                'Clinical Data': 'Cli',
                'Demographics': 'Dem',
                'General Health': 'Gen',
                'Lifestyle and Behavioral Health': 'Lif',
                'Medical History': 'Med',
                'Sleep Treatment': 'Tre'}

In [13]:
# make subsets of cheap_features
subsets = subsets(cheap_features)    
subsets = [list(subset) for subset in subsets]

In [14]:
shhs1 = pd.read_csv('../../data/raw/shhs1-dataset-0.20.0.csv', encoding='cp1252', engine='python')
cahi = shhs1[['nsrrid', 'ahi_c0h4']]

In [15]:
filepath = "../../data/processed/cheap_features/"

Creating all the possible subset of feature sets for the cheap feature selection.

In [16]:
# make a list of datasets with the given features
datasets = []
for subset in subsets:
    generate_subset_dataset(features_df, subset, shhs1, cahi, abbreviations, filepath)