In this notebook we preprocess the datasets that will be used for training the models for finding a new threshold for CSA diagnosis. We will use 50 different threholds indicator variables allong with the feature selection variables. We will try to predict if a patient had heart failure or not. The dataset the will yield the best results will be our new proposed threshold

# Imports

In [57]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import random
import numpy as np

random.seed(0)
np.random.seed(0)

In [5]:
feature_selection_folder = '../data/interim/feature_selection/'

In [6]:
feature_selection_datasets = os.listdir(feature_selection_folder)

In [7]:
feature_selection_datasets

['decision_tree_ahi_c0h3a.csv',
 'decision_tree_hf15.csv',
 'mi_hf15.csv',
 'random_forest_ahi_c0h3a.csv',
 'random_forest_hf15.csv',
 'forward_selection_ahi_c0h3a.csv']

In [8]:
ahi = pd.read_csv(feature_selection_folder + 'decision_tree_ahi_c0h3a.csv')
hf = pd.read_csv(feature_selection_folder + 'decision_tree_hf15.csv')

In [9]:
shhs1 = pd.read_csv('../data/raw/shhs1-dataset-0.20.0.csv')

  shhs1 = pd.read_csv('../data/raw/shhs1-dataset-0.20.0.csv')


In [10]:
shhs1 = shhs1[['nsrrid', 'ahi_c0h4', 'ahi_o0h4', 'ahi_a0h4']]

In [11]:
# join hf and shhs1 on nsrrid to get ahi_c0h4 and ahi_o0h4

hf_ahi = pd.merge(hf, shhs1, on='nsrrid', how='inner')

In [12]:
# drop the rows that have hf15 = 8
hf_ahi = hf_ahi[hf_ahi['hf15'] != 8]

In [13]:
hf_ahi

Unnamed: 0,nsrrid,ecgdate,lvh3_3,st4_1_3,infmi,ventrate,qrs,afib,vpbs,truposmi,...,ess_s1,shhs1_cf,date02,weight,waist,height,hf15,ahi_c0h4,ahi_o0h4,ahi_a0h4
0,200001.0,-748.2,0.6,0.2,0.0,64.0,19.0,0.0,0.0,0.0,...,6.0,1.0,-48.0,65.0,86.0,178.0,0.0,1.278296,1.597870,1.597870
1,200002.0,-75.8,0.2,0.0,0.0,58.8,15.4,0.0,0.0,0.0,...,14.0,1.0,-111.0,93.0,107.0,168.0,0.0,14.505495,19.780220,19.780220
2,200003.0,-1071.8,0.0,0.0,0.0,69.6,21.2,0.0,0.0,0.0,...,5.0,1.0,-7.0,51.0,82.0,145.0,0.0,4.184100,5.020921,5.020921
3,200004.0,-728.6,0.0,0.0,0.0,63.2,20.2,0.0,0.0,0.4,...,11.0,1.0,-24.0,64.0,85.0,180.0,0.0,0.199336,1.395349,1.395349
4,200005.0,-241.8,0.0,0.0,0.0,59.8,-5.2,0.0,0.0,0.0,...,7.0,1.0,-89.0,56.0,76.0,155.0,0.0,2.756757,2.918919,3.081081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,205800.0,-854.0,0.0,0.0,0.0,62.2,29.6,0.0,0.0,0.0,...,7.0,1.0,-168.0,70.5,99.0,166.0,0.0,10.743551,44.977238,44.977238
5800,205801.0,-755.0,0.0,0.0,2.0,55.6,19.0,0.0,0.2,0.4,...,12.0,1.0,-95.0,83.6,99.0,176.0,0.0,13.198483,13.805310,14.260430
5801,205802.0,-768.0,0.0,0.0,0.0,67.2,36.2,0.0,0.0,0.0,...,17.0,1.0,-83.0,75.0,91.0,175.0,0.0,2.019231,3.605769,3.605769
5802,205803.0,-755.0,0.0,0.0,0.0,61.4,53.2,0.0,0.0,0.0,...,4.0,1.0,-89.0,76.8,93.0,176.0,0.0,1.186650,2.818294,2.966625


In [38]:
import os

os.makedirs('../data/processed/threshold/', exist_ok=True) 

# Preprocessing for Threshold Prediction

In [None]:
# Make different datasets with a dummy variable for each threshold
# remove the ahi columns in the new dataframes

dfs = []
for dataset in feature_selection_datasets:
    if dataset.endswith('hf15.csv'):
        hf = pd.read_csv(feature_selection_folder + dataset)
        hf_ahi = pd.merge(hf, shhs1, on='nsrrid', how='inner')
        hf_ahi = hf_ahi[hf_ahi['hf15'] != 8]
        for threshold in range(1, 10):
            for threshold_2 in range(1, 5):
                new_df = hf_ahi.copy()
                new_df['CSA'] = 0
                new_df['CSA'][(new_df['ahi_c0h4'] >= threshold) & (new_df['ahi_c0h4'] > new_df['ahi_o0h4']*(1/threshold_2))] = 1
                new_df.drop(['ahi_c0h4', 'ahi_o0h4', 'ahi_a0h4'], axis=1, inplace=True)
                # drop na
                new_df.dropna(inplace=True)
                # save the new dataframe
                dataset_name = dataset.split('.')[0]
                new_df.to_csv('../data/processed/threshold/' + dataset_name + '_threshold_' + str(threshold) + '_' + str(threshold_2) + '.csv', index=False)


# Preprocessing for Cheap Feature Selection

In [62]:
os.makedirs('../data/processed/cheap_features', exist_ok=True)

In [63]:
features_df = pd.read_csv('../data/interim/shhs-data-dictionary-0.20.0-variables.csv')

In [64]:
features_df.head(3)

Unnamed: 0,folder,id,display_name,description,type,units,domain,labels,calculation,commonly_used,forms
0,Administrative/Interim,calldt,Interim Follow-up: Days from index date to call,,numeric,days from index date,,interim_shhs,,,
1,Administrative/Interim,cmplbp,Interim Follow-up: Completed blood pressure (BP),,choices,,complete2,interim_shhs,,,
2,Administrative/Interim,cmplcvd,Interim Follow-up: Completed cardiovascular di...,,choices,,complete4,interim_shhs,,,


In [65]:
cheap_features = ['Anthropometry', 'Clinical Data', 'Demographics', 'General Health', 'Lifestyle and Behavioral Health', 'Medical History', 'Sleep Treatment']
# cheap_features = ['Anthropometry', 'Demographics', 'General Health', 'Lifestyle and Behavioral Health', 'Medical History', 'Sleep Treatment']

In [66]:
abreviations = { 'Anthropometry': 'Ant',
                'Clinical Data': 'Cli',
                'Demographics': 'Dem',
                'General Health': 'Gen',
                'Lifestyle and Behavioral Health': 'Lif',
                'Medical History': 'Med',
                'Sleep Treatment': 'Tre'}

In [67]:
from utils import *

In [68]:
# make subsets of cheap_features
subsets = subsets(cheap_features)    
subsets = [list(subset) for subset in subsets]

In [69]:
shhs1 = pd.read_csv('../data/raw/shhs1-dataset-0.20.0.csv')
cahi = shhs1[['nsrrid', 'ahi_c0h4']]

  shhs1 = pd.read_csv('../data/raw/shhs1-dataset-0.20.0.csv')


Creating all the possible subset of feature sets for the cheap feature selection.

In [73]:
# make a list of datasets with the given features
datasets = []
for subset in subsets:
    cheap_features_labels = features_df[features_df['folder'].str.startswith(tuple(subset))]['id'].values
    match_columns = shhs1.columns.intersection(cheap_features_labels)
    features = match_columns.copy()
    match_columns = match_columns.tolist() + ['nsrrid']
    dataset = shhs1[match_columns].copy()
    dataset = pd.merge(dataset, cahi, on='nsrrid', how='inner')

    # if a column is missing more than 50% of the values, drop it
    dataset.dropna(thresh=dataset.shape[0]*0.5, axis=1, inplace=True)

    features = dataset.columns.tolist()

    # normalize x
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(dataset[features])

    imputer = KNNImputer(n_neighbors=5)  # You can change the number of neighbors if needed
    x_imputed_scaled = imputer.fit_transform(x_scaled)

    # reverse scaling
    x_imputed = scaler.inverse_transform(x_imputed_scaled)

    # Convert the result back to DataFrame
    x_imputed = pd.DataFrame(x_imputed, columns=features)

    x_imputed['nsrrid'] = dataset['nsrrid']

    dataset_name = [abreviations[feature] for feature in subset]
    dataset_name = '_'.join(dataset_name)

    x_imputed.to_csv(f'../data/processed/cheap_features/{dataset_name}.csv', index=False)
    