# This contains the methods and models for the paper "Has machine learning over-promised in healthcare?"

It contains the models: 
- ozkan (Model A in the paper): PCA with nearest neighbours https://doi.org/10.3390/e18040115
- caliskan (Model B in the paper): stacked auto encoder https://electricajournal.org/Content/files/sayilar/58/3311-3318.pdf
- ulhaq (Model C in the paper): SVM with feature selection https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8672565

## Load and prepare/preprocess data
OPD is the Oxford Parkinson's Disease dataset, retrieved from UCI. It was originally created by Max Little and is used extensively in PD classification research (https://archive.ics.uci.edu/ml/datasets/parkinsons).

mPower is the larger dataset, from the mPower study, a Parkinson's mobile application developed by Sage Bionetworks and described on Synapse (doi:10.7303/syn4993293).

In [None]:
import pandas as pd
import dataHandler as dh
from os.path import join

# Read the two OPD CSV files from the Data folder: the voice samples and the participant table.
OPD_samples = pd.read_csv(join('Data','OPD_data.csv'))
OPD_participants = pd.read_csv(join('Data','OPD_participants.csv'))

# Heads-up for anyone reading the summary produced below.
print('One of the participants (S31) does not have demographic data (i.e., age and gender), which is why their submissions are missing from counts that include anything to do with gender or age\n')

# Summarise the dataset (see dataHandler.OPD_summary for what is reported).
dh.OPD_summary(OPD_samples, OPD_participants)

In [None]:
import dataLoader as dl
import mPower_data_adjustments as mda

# Load the original submission data, the features extracted from them, and the information needed to filter them
submissions = dl.pickleLoad(join('Data','submissions_Full.pickle'))
features = dl.pickleLoad(join('Data','combinedFeatures.pickle'))
d2 = dl.pickleLoad(join('Data','d2Feature5.pickle'))
rms_energy = dl.pickleLoad(join('Data','rms_energy_notNormalised.pickle'))

# Rename the key column so that the merges below can join on 'recordId'
features.rename(columns={'recordID':'recordId'},inplace=True)
# NOTE(review): the comparison is against the string 'nan'; float NaN values would pass this filter - confirm how d2 is stored
d2 = d2[d2['d2'] != 'nan'] #drop nans in d2

# Filter out bad submissions and see how many remain.
# NOTE(review): the number in each trailing comment is presumably an earlier threshold that was tried - confirm
rms_energy = rms_energy[rms_energy['rms_1_mean'] > 300] #250
rms_energy = rms_energy[rms_energy['rms_1_std'] < 2000] #2500
rms_energy = rms_energy[rms_energy['energy_1_mean'] > 50000] #45000
features = features[features['Degree of voice breaks (%)'] < 30] #40
features = features[features['Fraction of locally unvoiced frames (%)'] < 30]#40

# Combine the filtered tables on 'recordId' and see how many remain.
# Only the 'recordId' column of rms_energy is carried into the merge; the first two merges rely on the pandas default join type.
mPower_samples = pd.merge(rms_energy['recordId'],submissions,on='recordId')
mPower_samples = pd.merge(mPower_samples,features,on='recordId')
mPower_samples = mPower_samples.merge(d2[['recordId','d2']],on='recordId',how='inner')
mPower_samples = mPower_samples[~(mPower_samples['gender'] == 'Prefer not to answer')] #retains only male and female

# Remove samples that I found were bad, by looking at the 10 most extreme values for all 22 features and listening
mPower_samples = mda.remove_bad_samples(mPower_samples)

# Recalculate spread1 and spread2; the original values were wrong.
mPower_samples = mda.recalculate_spreads(mPower_samples)

# Adjust the scale of several mPower features to be on the same scale as OPD.
# For example, Jitter (%) is currently stored as 2 (percent), whereas the OPD features would hold 0.02
mPower_samples = mda.adjust_feature_scale(mPower_samples)

# Build the participant table: one row per distinct healthCode, restricted to the demographic and diagnosis columns
mPower_participants = mPower_samples.drop_duplicates(subset=['healthCode'])[['healthCode', 'age', 'diagnosis-year','gender','onset-year','professional-diagnosis']]

# Print summary of the dataset
dh.mPower_summary(mPower_samples, mPower_participants)

## Load and evaluate the models (UNMODIFIED)

In [None]:
from ozkan_model import ozkan
from caliskan_model import caliskan
from ulhaq_model import ulhaq

import itertools
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import torch.nn as nn
from datetime import date

import evaluation
import resultsHandler as rh

### with OPD

In [None]:
# Evaluation layout for the unmodified OPD experiment.
repetitions = 30
kfold_splits = 10

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]

######################## to_numpy method ########################
def to_numpy(OPD_samples):
    """Split the OPD sample table into a feature matrix and a label vector.

    OPD_samples must be indexable by column name and by a list of column
    names (a pandas DataFrame in this notebook).

    Returns (X, y): X holds the 22 feature columns in the order listed
    below, y holds the 'status' column.
    """
    pitch_cols = ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)']
    jitter_cols = ['MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP']
    shimmer_cols = ['MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
                    'MDVP:APQ', 'Shimmer:DDA']
    noise_cols = ['NHR', 'HNR']
    remaining_cols = ['RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

    # Concatenation order fixes the column order of X.
    columns = pitch_cols + jitter_cols + shimmer_cols + noise_cols + remaining_cols

    return OPD_samples[columns].to_numpy(), OPD_samples['status'].to_numpy()
#################################################################

# ---- Global settings ----
# Every scaler class is paired with every preprocessing method.
preprocessing_methods = ['X only']
preprocessors = [StandardScaler, MinMaxScaler]
global_settings = [(scaler, method) for scaler in preprocessors for method in preprocessing_methods]

# Each *_settings list below is the Cartesian product of the value lists
# defined just above it, with the first list varying slowest.

# ---- Ozkan settings ----
components = list(range(2, 15, 3))
ks = list(range(1, 12, 2))
ozkan_settings = [*itertools.product(components, ks)]

# ---- Caliskan settings ----
lrses = [[rate] * 4 for rate in (0.003, 0.03)]
epochses = [[count] * 4 for count in (400, 200, 100)]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, second] for second in (nn.Tanh, nn.Sigmoid, nn.ReLU)]
latent_sizes = [4, 6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# ---- Ul-Haq settings ----
kernels = ['rbf', 'linear']
gammas = ['scale', 'auto', 0.4, 0.3, 0.2, 0.09, 0.075, 0.04]
Cs = [1, 5, 10]
num_featureses = list(range(6, 21, 2))  # best at 10 in paper
ulhaq_settings = [*itertools.product(kernels, gammas, Cs, num_featureses)]

In [None]:
import warnings
# Run the unmodified evaluation on OPD with all warnings silenced for the duration of the call.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Every setting grid defined above is passed in together with the three model constructors.
    # The two returned objects are presumably the repeated k-fold results (n_splits folds) and the
    # repeated train/test results (training_split=0.7) - see evaluation.unmodified to confirm.
    repeated_kfold_results,repeated_traintest_results = evaluation.unmodified('OPD',OPD_samples,OPD_participants,to_numpy,
                                             global_settings,ozkan_settings,caliskan_settings,ulhaq_settings,
                                             ozkan_method=ozkan,caliskan_method=caliskan,ulhaq_method=ulhaq,
                                             seeds=seeds,repetitions=repetitions,verbose_odds=0,#0.025
                                             n_splits=kfold_splits,training_split=0.7)

In [None]:
# Print both result sets, limited to the top 3 entries (see resultsHandler.print_results for the ranking used).
rh.print_results(repeated_kfold_results,top=3)
rh.print_results(repeated_traintest_results,top=3)

# Tag the saved files with today's date: day, full month name, two-digit year (e.g. 05March24).
d = date.today().strftime("%d%B%y")
# NOTE(review): the second file name has no space before the date, unlike the first - confirm whether that is intended.
dl.pickleSave(repeated_kfold_results,join('Results','repeated_kfold with OPD unmodified ' + d + '.pickle'))
dl.pickleSave(repeated_traintest_results,join('Results','repeated_traintest with OPD unmodified' + d + '.pickle'))

### with mPower

In [None]:
# Evaluation layout for the unmodified mPower experiment.
repetitions = 10
kfold_splits = 10

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]

######################## to_numpy method ########################

def to_numpy(mPower_samples):
    """Split the mPower sample table into a float feature matrix and an integer label vector.

    mPower_samples must be indexable by column name and by a list of column
    names (a pandas DataFrame in this notebook).

    Returns (X, y): X holds the 22 feature columns, in the order listed
    below, cast to float; y holds 'professional-diagnosis' as int64.
    """
    pitch_cols = ['Mean pitch (Hz)', 'Minimum pitch (Hz)', 'Maximum pitch (Hz)']
    jitter_cols = ['Jitter (local) (%)', 'Jitter (local, absolute)', 'Jitter (rap) (%)',
                   'Jitter (ppq5) (%)', 'Jitter (ddp) (%)']
    shimmer_cols = ['Shimmer (local) (%)', 'Shimmer (local, dB) (dB)', 'Shimmer (apq3) (%)',
                    'Shimmer (apq5) (%)', 'Shimmer (apq11) (%)', 'Shimmer (dda) (%)']
    noise_cols = ['Mean noise-to-harmonics ratio', 'Mean harmonics-to-noise ratio (dB)']
    remaining_cols = ['spread1 (negative entropy of F0)', 'spread2 (standard error of F0)',
                      'PPE', 'DFA', 'RPDE', 'd2']

    # Concatenation order fixes the column order of X.
    columns = pitch_cols + jitter_cols + shimmer_cols + noise_cols + remaining_cols

    feature_table = mPower_samples[columns]
    X = feature_table.to_numpy().astype(float)

    # Multiplying by 1 turns the diagnosis flags into numbers before the int64 conversion.
    diagnosis = mPower_samples['professional-diagnosis']
    y = (diagnosis * 1).to_numpy(dtype='int64')

    return X, y

#################################################################

# ---- Global settings ----
# Every scaler class is paired with every preprocessing method.
preprocessing_methods = ['X only']
preprocessors = [StandardScaler, MinMaxScaler]
global_settings = [(scaler, method) for scaler in preprocessors for method in preprocessing_methods]

# Each *_settings list below is the Cartesian product of the value lists
# defined just above it, with the first list varying slowest.

# ---- Ozkan settings ----
components = [5, 8, 12, 16]
ks = [1, 5, 9, 11]
ozkan_settings = [*itertools.product(components, ks)]

# ---- Caliskan settings ----
lrses = [[rate] * 4 for rate in (0.03, 0.003)]
epochses = [[50, 50, 50, 50]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, second] for second in (nn.Sigmoid, nn.ReLU)]
latent_sizes = [4, 6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# ---- Ul-Haq settings ----
kernels = ['rbf', 'linear']
gammas = ['scale', 'auto', 0.2, 0.1, 0.005, 0.0001]
Cs = [1, 10]
num_featureses = list(range(5, 21, 5))  # best at 10 in paper
ulhaq_settings = [*itertools.product(kernels, gammas, Cs, num_featureses)]

In [None]:
# Run the unmodified evaluation on mPower, passing every setting grid defined above and the three model constructors.
# The two returned objects are presumably the repeated k-fold results (n_splits folds) and the
# repeated train/test results (training_split=0.7) - see evaluation.unmodified to confirm.
repeated_kfold_results,repeated_traintest_results = evaluation.unmodified('mPower',mPower_samples,mPower_participants,to_numpy,
                                             global_settings,ozkan_settings,caliskan_settings,ulhaq_settings,
                                             ozkan_method=ozkan,caliskan_method=caliskan,ulhaq_method=ulhaq,
                                             seeds=seeds,repetitions=repetitions,verbose_odds=0,#0.025
                                             n_splits=kfold_splits,training_split=0.7)

In [None]:
# Print both result sets, limited to the top 2 entries (see resultsHandler.print_results for the ranking used).
rh.print_results(repeated_kfold_results,top=2)
rh.print_results(repeated_traintest_results,top=2)

# Tag the saved files with today's date: day, full month name, two-digit year (e.g. 05March24).
d = date.today().strftime("%d%B%y")
# NOTE(review): the second file name has no space before the date, unlike the first - confirm whether that is intended.
dl.pickleSave(repeated_kfold_results,join('Results','repeated_kfold with mPower unmodified ' + d + '.pickle'))
dl.pickleSave(repeated_traintest_results,join('Results','repeated_traintest with mPower unmodified' + d + '.pickle'))

## Evaluate the models (MODIFIED)

### with OPD

In [None]:
# Evaluation layout for the modified OPD experiment.
repetitions = 30
kfold_splits = 8

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]


######################## to_numpy method ########################

def to_numpy(OPD_samples):
    """Return the OPD feature matrix and label vector as NumPy arrays.

    OPD_samples must be indexable by column name and by a list of column
    names (a pandas DataFrame in this notebook).

    Returns (X, y): X holds the 22 feature columns in the order listed
    below, y holds the 'status' column.
    """
    pitch_cols = ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)']
    jitter_cols = ['MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP']
    shimmer_cols = ['MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
                    'MDVP:APQ', 'Shimmer:DDA']
    noise_cols = ['NHR', 'HNR']
    remaining_cols = ['RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']

    # Concatenation order fixes the column order of X.
    columns = pitch_cols + jitter_cols + shimmer_cols + noise_cols + remaining_cols

    return OPD_samples[columns].to_numpy(), OPD_samples['status'].to_numpy()

#################################################################


#The model we will be using for Ozkan is "ozkan PCA_14 k_1 MinMaxScaler X only"
#This was the best model for 10fold CV, and best for 70/30 split

#The model we will be using for Caliskan is "caliskan ReLU Sigmoid latent:6, epochs:400, lr:0.0030 MinMaxScaler X only"
#This was the 2nd best model for 10fold CV, and best for 70/30 split

#The model we will be using for Ul-Haq is "ulhaq rbf gamma:0.2000 C:5 num_features:14 StandardScaler X only"
#This was the 2nd best model for both 10fold CV and 70/30 split

# ---- Global settings ----
# Every scaler class is paired with every preprocessing method.
preprocessing_methods = ['X only']
preprocessors = [MinMaxScaler, StandardScaler]
global_settings = [(scaler, method) for scaler in preprocessors for method in preprocessing_methods]

# A single configuration per model. Each *_settings list is still built as
# the Cartesian product of the value lists above it, so it holds one tuple.

# ---- Ozkan settings ----
components = [14]
ks = [1]
ozkan_settings = [*itertools.product(components, ks)]

# ---- Caliskan settings ----
lrses = [[0.003, 0.003, 0.003, 0.003]]
epochses = [[400, 400, 400, 400]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, nn.Sigmoid]]
latent_sizes = [6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# ---- Ul-Haq settings ----
kernels = ['rbf']
gammas = [0.2]
Cs = [5]
num_featureses = [14]  # best at 10 in paper
ulhaq_settings = [*itertools.product(kernels, gammas, Cs, num_featureses)]

In [None]:
# Run the modified OPD evaluation with the single configuration chosen per model above.
# The two returned objects are presumably the repeated k-fold results (n_splits folds) and the
# repeated train/test results (training_split=0.7) - see evaluation.modified_OPD to confirm.
repeated_kfold_results,repeated_traintest_results = evaluation.modified_OPD(OPD_samples,OPD_participants,to_numpy,
                                             global_settings,ozkan_settings,caliskan_settings,ulhaq_settings,
                                             ozkan_method=ozkan,caliskan_method=caliskan,ulhaq_method=ulhaq,
                                             seeds=seeds,repetitions=repetitions,verbose_odds=0.005,#0.025
                                             n_splits=kfold_splits,training_split=0.7)

In [None]:
# Print both result sets, limited to the top 5 entries (see resultsHandler.print_results for the ranking used).
rh.print_results(repeated_kfold_results,top=5)
rh.print_results(repeated_traintest_results,top=5)

# Tag the saved files with today's date: day, full month name, two-digit year (e.g. 05March24).
d = date.today().strftime("%d%B%y")
# NOTE(review): the second file name has no space before the date, unlike the first - confirm whether that is intended.
dl.pickleSave(repeated_kfold_results,join('Results','repeated_kfold with OPD modified ' + d + '.pickle'))
dl.pickleSave(repeated_traintest_results,join('Results','repeated_traintest with OPD modified' + d + '.pickle'))

### with mPower

In [None]:
# Evaluation layout for the modified mPower experiment.
repetitions = 10
kfold_splits = 5

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]


######################## to_numpy method ########################

def to_numpy(mPower_samples):
    """Split the mPower sample table into a float feature matrix and an integer label vector.

    mPower_samples must be indexable by column name and by a list of column
    names (a pandas DataFrame in this notebook).

    Returns (X, y): X holds the 22 feature columns, in the order listed
    below, cast to float; y holds 'professional-diagnosis' as int64.
    """
    features = ['Mean pitch (Hz)','Minimum pitch (Hz)','Maximum pitch (Hz)','Jitter (local) (%)','Jitter (local, absolute)','Jitter (rap) (%)','Jitter (ppq5) (%)','Jitter (ddp) (%)',
                'Shimmer (local) (%)','Shimmer (local, dB) (dB)','Shimmer (apq3) (%)','Shimmer (apq5) (%)','Shimmer (apq11) (%)','Shimmer (dda) (%)',
                'Mean noise-to-harmonics ratio','Mean harmonics-to-noise ratio (dB)','spread1 (negative entropy of F0)','spread2 (standard error of F0)','PPE','DFA','RPDE','d2']

    # Cast to float so that object-dtype columns do not turn X into an object array.
    # This matches the to_numpy used for the unmodified mPower evaluation.
    X = mPower_samples[features].to_numpy().astype(float)
    # Multiplying by 1 turns the diagnosis flags into numbers before the int64 conversion.
    y = (mPower_samples['professional-diagnosis']*1).to_numpy(dtype='int64')

    return X, y

#################################################################


#The model we will be using for Ozkan is "ozkan PCA_16 k_11 StandardScaler X only"
#This was the best model for both 10fold CV and 70/30 split

#The model we will be using for Caliskan is "caliskan ReLU Sigmoid latent:6, epochs:50, lr:0.0030 StandardScaler X only"
#This was the best model for both 10fold CV and 70/30 split

#The model we will be using for Ulhaq is "ulhaq rbf gamma:scale C:10 num_features:20 StandardScaler X only"
#This was the 2nd best model for 10fold CV and best for 70/30 split


# ---- Global settings ----
# Every scaler class is paired with every preprocessing method.
preprocessing_methods = ['X only']
preprocessors = [StandardScaler]
global_settings = [(scaler, method) for scaler in preprocessors for method in preprocessing_methods]

# A single configuration per model. Each *_settings list is still built as
# the Cartesian product of the value lists above it, so it holds one tuple.

# ---- Ozkan settings ----
components = [16]
ks = [11]
ozkan_settings = [*itertools.product(components, ks)]

# ---- Caliskan settings ----
lrses = [[0.003, 0.003, 0.003, 0.003]]
epochses = [[50, 50, 50, 50]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, nn.Sigmoid]]
latent_sizes = [6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# ---- Ul-Haq settings ----
kernels = ['rbf']
gammas = ['scale']
Cs = [10]
num_featureses = [20]  # best at 10 in paper
ulhaq_settings = [*itertools.product(kernels, gammas, Cs, num_featureses)]

In [None]:
# Run the modified mPower evaluation with the single configuration chosen per model above.
# The two returned objects are presumably the repeated k-fold results (n_splits folds) and the
# repeated train/test results (training_split=0.7) - see evaluation.modified_mPower to confirm.
# NOTE(review): the meaning of age_range=3 is defined inside evaluation.modified_mPower - confirm there.
repeated_kfold_results,repeated_traintest_results = evaluation.modified_mPower(mPower_samples,mPower_participants,to_numpy,
                                             global_settings,ozkan_settings,caliskan_settings,ulhaq_settings,
                                             ozkan_method=ozkan,caliskan_method=caliskan,ulhaq_method=ulhaq,
                                             seeds=seeds,repetitions=repetitions,verbose_odds=0.0025,#0.025
                                             n_splits=kfold_splits,training_split=0.7,age_range=3)

In [None]:
# Print both result sets, limited to the top entry (see resultsHandler.print_results for the ranking used).
rh.print_results(repeated_kfold_results,top=1)
rh.print_results(repeated_traintest_results,top=1)

# Tag the saved files with today's date: day, full month name, two-digit year (e.g. 05March24).
d = date.today().strftime("%d%B%y")
# NOTE(review): the second file name has no space before the date, unlike the first - confirm whether that is intended.
dl.pickleSave(repeated_kfold_results,join('Results','repeated_kfold with mPower modified ' + d + '.pickle'))
dl.pickleSave(repeated_traintest_results,join('Results','repeated_traintest with mPower modified' + d + '.pickle'))