This is an auxiliary experiment to show that the decrease in performance is not due to a reduction in the size of the training and test sets.

We do so by randomly subsampling each data set down to the size used by our modified evaluation method, while keeping the same models: the OPD data is reduced to 96 samples, and the mPower data to ~18,300 samples.

In [None]:
import pandas as pd
import dataHandler as dh
from os.path import join

# Read the OPD sample table and the OPD participant table from the Data folder.
OPD_samples, OPD_participants = (
    pd.read_csv(join('Data', csv_name))
    for csv_name in ('OPD_data.csv', 'OPD_participants.csv')
)

# Tell the reader why age/gender based counts do not add up to the full cohort.
print(
    'One of the participants (S31) does not have demographic data (i.e., age and gender), '
    'which is why their submissions are missing from counts that include anything to do with gender or age\n'
)

# Summarise the OPD data set (output format is defined in dataHandler).
dh.OPD_summary(OPD_samples, OPD_participants)

In [None]:
import dataLoader as dl
import mPower_data_adjustments as mda


# Inputs: the original submissions, the features extracted from them, the d2
# feature, and the RMS/energy statistics used for quality filtering.
submissions = dl.pickleLoad(join('Data', 'submissions_Full.pickle'))
features = dl.pickleLoad(join('Data', 'combinedFeatures.pickle'))
d2 = dl.pickleLoad(join('Data', 'd2Feature5.pickle'))
rms_energy = dl.pickleLoad(join('Data', 'rms_energy_notNormalised.pickle'))

# The feature table names its key 'recordID'; align it with the 'recordId'
# key used for every merge below.
features.rename(columns={'recordID': 'recordId'}, inplace=True)

# Drop rows whose d2 value is the string 'nan'.
d2 = d2[d2['d2'] != 'nan']

# Quality filters on the recordings (trailing numbers are the alternative
# thresholds noted by the original author).
rms_energy = rms_energy[
    (rms_energy['rms_1_mean'] > 300)            # 250
    & (rms_energy['rms_1_std'] < 2000)          # 2500
    & (rms_energy['energy_1_mean'] > 50000)     # 45000
]
features = features[
    (features['Degree of voice breaks (%)'] < 30)                    # 40
    & (features['Fraction of locally unvoiced frames (%)'] < 30)     # 40
]

# Inner-join everything on recordId so only recordings that passed every
# filter and have all features survive.
mPower_samples = (
    pd.merge(rms_energy['recordId'], submissions, on='recordId')
    .merge(features, on='recordId')
    .merge(d2[['recordId', 'd2']], on='recordId', how='inner')
)

# Retain only male and female participants.
mPower_samples = mPower_samples.loc[~(mPower_samples['gender'] == 'Prefer not to answer')]

# Remove samples found to be bad by listening to the 10 most extreme values
# of each of the 22 features.
mPower_samples = mda.remove_bad_samples(mPower_samples)

# spread1 and spread2 were originally computed incorrectly; recompute them.
mPower_samples = mda.recalculate_spreads(mPower_samples)

# Bring several mPower features onto the OPD scale
# (e.g. Jitter (%) is stored as 2 for 2%, whereas OPD stores 0.02).
mPower_samples = mda.adjust_feature_scale(mPower_samples)

# One row per participant (healthCode), keeping only the demographic columns.
mPower_participants = (
    mPower_samples
    .drop_duplicates(subset=['healthCode'])
    .loc[:, ['healthCode', 'age', 'diagnosis-year', 'gender', 'onset-year', 'professional-diagnosis']]
)

# Print a summary of the data set.
dh.mPower_summary(mPower_samples, mPower_participants)

## Load and evaluate the models (UNMODIFIED)

In [None]:
from ozkan_model import ozkan
from caliskan_model import caliskan
from ulhaq_model import ulhaq

import itertools
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import torch.nn as nn
from datetime import date

import evaluation
import resultsHandler as rh

import numpy as np
import random

In [None]:
def initialise_results_df():
    """Create fresh, per-model result containers for one experiment.

    Returns:
        tuple: ``(kf_results, split_results)``. Both are dicts keyed by
        'ozkan', 'caliskan' and 'ulhaq'. The k-fold dict holds objects from
        ``rh.results_DataFrame()``; the train/test-split dict holds objects
        from ``rh.multiple_runs_DataFrame()``.
    """
    model_names = ('ozkan', 'caliskan', 'ulhaq')

    kf_results = {name: rh.results_DataFrame() for name in model_names}
    split_results = {name: rh.multiple_runs_DataFrame() for name in model_names}

    return kf_results, split_results
    
    
def process_results(kf_results_dict,kfold_results,split_results_dict,traintest_results,run_index=None):
    """Record one repetition's k-fold and train/test results per model.

    Each raw result is split per model with ``rh.separate_results``; row 0 of
    every model's 'run results' table is then written to ``.loc[run_index]``
    of the matching accumulator.

    Args:
        kf_results_dict: dict keyed by 'ozkan', 'caliskan', 'ulhaq' that
            accumulates the k-fold results (one row per repetition).
        kfold_results: raw k-fold output of a single repetition.
        split_results_dict: dict with the same keys that accumulates the
            train/test-split results.
        traintest_results: raw train/test-split output of a single repetition.
        run_index: row label under which this repetition is stored. When
            omitted, the module-level variable ``rep`` is used, which keeps
            existing callers (that rely on their loop variable being named
            ``rep``) working unchanged.

    Returns:
        tuple: ``(kf_results_dict, split_results_dict)``, the same objects
        that were passed in, updated in place.

    Raises:
        NameError: if ``run_index`` is omitted and no global ``rep`` exists.
    """
    if run_index is None:
        # Legacy path: fall back to the caller's global loop variable.
        try:
            run_index = rep
        except NameError:
            raise NameError(
                "process_results needs a run index: pass run_index=... "
                "or define a global 'rep'"
            ) from None

    model_names = ('ozkan', 'caliskan', 'ulhaq')
    targets = (
        (kf_results_dict, kfold_results),
        (split_results_dict, traintest_results),
    )

    for results_dict, raw_results in targets:
        # Unpack explicitly so an unexpected number of models fails loudly.
        ozkan_result, caliskan_result, ulhaq_result = rh.separate_results(raw_results)
        per_model = (ozkan_result, caliskan_result, ulhaq_result)
        for model_name, model_result in zip(model_names, per_model):
            results_dict[model_name].loc[run_index] = model_result['run results'].loc[0]

    return kf_results_dict,split_results_dict


def process_repeated_results(kf_results_dict,split_results_dict):
    """Collapse the per-repetition tables into one summary row per model.

    Args:
        kf_results_dict: dict keyed by 'ozkan', 'caliskan', 'ulhaq' holding
            the accumulated k-fold results.
        split_results_dict: dict with the same keys holding the accumulated
            train/test-split results.

    Returns:
        tuple: ``(repeated_kf_results, repeated_split_results)``. Each is an
        ``rh.results_DataFrame()`` with one row per model, filled by
        ``rh.process_multiple_kfold_runs`` and ``rh.process_multiple_runs``
        respectively.
    """
    model_names = ('ozkan', 'caliskan', 'ulhaq')

    repeated_kf_results = rh.results_DataFrame()
    repeated_split_results = rh.results_DataFrame()

    # All k-fold summaries first, then all split summaries.
    for model_name in model_names:
        repeated_kf_results.loc[model_name] = rh.process_multiple_kfold_runs(kf_results_dict[model_name])

    for model_name in model_names:
        repeated_split_results.loc[model_name] = rh.process_multiple_runs(split_results_dict[model_name])

    return repeated_kf_results,repeated_split_results

### with OPD
Using the best models found from previous grid search

In [None]:
# Protocol for the size-restricted OPD runs.
kfold_splits = 8    # passed as n_splits to the evaluation
repetitions = 30    # number of independent sub-sampling runs

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]

######################## to_numpy method ########################
def to_numpy(OPD_samples):
    """Split an OPD sample table into a feature matrix and a label vector.

    Args:
        OPD_samples: table indexable by column name that contains the 22
            feature columns listed below and a 'status' column.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the feature columns in the order
        listed (one row per sample) and ``y`` holds the 'status' column.
    """
    feature_columns = [
        'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
        'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
        'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
        'MDVP:APQ', 'Shimmer:DDA',
        'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE',
    ]

    feature_matrix = OPD_samples[feature_columns].to_numpy()
    labels = OPD_samples['status'].to_numpy()

    return feature_matrix, labels
#################################################################

# Global settings: every (scaler class, preprocessing-method label) pair.
# This cell evaluates all models with MinMaxScaler.
preprocessors = [MinMaxScaler]
preprocessing_methods = ['X only']
global_settings = [
    (scaler, method)
    for scaler in preprocessors
    for method in preprocessing_methods
]

# Ozkan settings: every (components, k) combination - a single one here.
components = [14]
ks = [1]
ozkan_settings = [(n_components, k) for n_components in components for k in ks]

# Caliskan settings: each list holds one candidate value, so the product
# yields a single tuple ordered as
# (lrs, epochs, rhos, lams, Bs, activations, latent_size).
lrses = [[0.003, 0.003, 0.003, 0.003]]
epochses = [[400, 400, 400, 400]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, nn.Sigmoid]]
latent_sizes = [6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# Ul-Haq settings: every (kernel, gamma, C, num_features) combination.
kernels = ['rbf']
gammas = [0.2]
Cs = [5]
num_featureses = [14]  # best at 10 in paper
ulhaq_settings = [
    (kernel, gamma, C, num_features)
    for kernel in kernels
    for gamma in gammas
    for C in Cs
    for num_features in num_featureses
]

In [None]:
# Accumulators for the MinMaxScaler runs (suffix _mm).
kf_results_dict_mm, split_results_dict_mm = initialise_results_df()

# NOTE: the loop variable must stay named `rep` - process_results reads it as
# a global to decide which row of the accumulators to fill.
for rep, seed in enumerate(seeds[:repetitions]):
    # Seed both RNGs so the sub-sample drawn below is repeatable per run.
    random.seed(seed)
    np.random.seed(seed)

    # Randomly shrink OPD to 96 samples and keep only the participants that
    # still have at least one sample.
    OPD_samples_reduced = OPD_samples.sample(n=96)
    OPD_participants_reduced = OPD_participants[
        OPD_participants['participant'].isin(OPD_samples_reduced['participant'])
    ]

    kfold_results, traintest_results = evaluation.unmodified(
        'OPD',
        OPD_samples_reduced,
        OPD_participants_reduced,
        to_numpy,
        global_settings,
        ozkan_settings,
        caliskan_settings,
        ulhaq_settings,
        ozkan_method=ozkan,
        caliskan_method=caliskan,
        ulhaq_method=ulhaq,
        seeds=[seed],
        repetitions=1,
        verbose_odds=0,  # 0.025
        n_splits=kfold_splits,
        training_split=0.7,
    )

    kf_results_dict_mm, split_results_dict_mm = process_results(
        kf_results_dict_mm, kfold_results, split_results_dict_mm, traintest_results
    )

# Summarise all repetitions into one row per model.
repeated_kf_results_mm, repeated_split_results_mm = process_repeated_results(
    kf_results_dict_mm, split_results_dict_mm
)

In [None]:
# Show the MinMaxScaler summaries: k-fold first, then train/test split.
for summary_table in (repeated_kf_results_mm, repeated_split_results_mm):
    display(summary_table)

Rather than writing extra code to make Ul-Haq use the standard scaler while Ozkan and Caliskan use min-max scaling within a single run, I computed the results with both scalers for all models and combined them afterwards.

In [None]:
# Protocol for the size-restricted OPD runs (StandardScaler pass).
kfold_splits = 8    # passed as n_splits to the evaluation
repetitions = 30    # number of independent sub-sampling runs

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]

######################## to_numpy method ########################
def to_numpy(OPD_samples):
    """Split an OPD sample table into a feature matrix and a label vector.

    Args:
        OPD_samples: table indexable by column name that contains the 22
            feature columns listed below and a 'status' column.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the feature columns in the order
        listed (one row per sample) and ``y`` holds the 'status' column.
    """
    feature_columns = [
        'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
        'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
        'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
        'MDVP:APQ', 'Shimmer:DDA',
        'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE',
    ]

    feature_matrix = OPD_samples[feature_columns].to_numpy()
    labels = OPD_samples['status'].to_numpy()

    return feature_matrix, labels
#################################################################

# Global settings: every (scaler class, preprocessing-method label) pair.
# This cell evaluates all models with StandardScaler.
preprocessors = [StandardScaler]
preprocessing_methods = ['X only']
global_settings = [
    (scaler, method)
    for scaler in preprocessors
    for method in preprocessing_methods
]

# Ozkan settings: every (components, k) combination - a single one here.
components = [14]
ks = [1]
ozkan_settings = [(n_components, k) for n_components in components for k in ks]

# Caliskan settings: each list holds one candidate value, so the product
# yields a single tuple ordered as
# (lrs, epochs, rhos, lams, Bs, activations, latent_size).
lrses = [[0.003, 0.003, 0.003, 0.003]]
epochses = [[400, 400, 400, 400]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, nn.Sigmoid]]
latent_sizes = [6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# Ul-Haq settings: every (kernel, gamma, C, num_features) combination.
kernels = ['rbf']
gammas = [0.2]
Cs = [5]
num_featureses = [14]  # best at 10 in paper
ulhaq_settings = [
    (kernel, gamma, C, num_features)
    for kernel in kernels
    for gamma in gammas
    for C in Cs
    for num_features in num_featureses
]

In [None]:
# Accumulators for the StandardScaler runs (suffix _ss).
kf_results_dict_ss, split_results_dict_ss = initialise_results_df()

# NOTE: the loop variable must stay named `rep` - process_results reads it as
# a global to decide which row of the accumulators to fill.
for rep, seed in enumerate(seeds[:repetitions]):
    # Seed both RNGs so the sub-sample drawn below is repeatable per run.
    random.seed(seed)
    np.random.seed(seed)

    # Randomly shrink OPD to 96 samples and keep only the participants that
    # still have at least one sample.
    OPD_samples_reduced = OPD_samples.sample(n=96)
    OPD_participants_reduced = OPD_participants[
        OPD_participants['participant'].isin(OPD_samples_reduced['participant'])
    ]

    kfold_results, traintest_results = evaluation.unmodified(
        'OPD',
        OPD_samples_reduced,
        OPD_participants_reduced,
        to_numpy,
        global_settings,
        ozkan_settings,
        caliskan_settings,
        ulhaq_settings,
        ozkan_method=ozkan,
        caliskan_method=caliskan,
        ulhaq_method=ulhaq,
        seeds=[seed],
        repetitions=1,
        verbose_odds=0,  # 0.025
        n_splits=kfold_splits,
        training_split=0.7,
    )

    kf_results_dict_ss, split_results_dict_ss = process_results(
        kf_results_dict_ss, kfold_results, split_results_dict_ss, traintest_results
    )

# Summarise all repetitions into one row per model.
repeated_kf_results_ss, repeated_split_results_ss = process_repeated_results(
    kf_results_dict_ss, split_results_dict_ss
)

In [None]:
# Show the StandardScaler summaries: k-fold first, then train/test split.
for summary_table in (repeated_kf_results_ss, repeated_split_results_ss):
    display(summary_table)

In [None]:
# Combine: take the MinMaxScaler summaries as the base and overwrite the
# Ul-Haq row with the one from the StandardScaler runs.
import copy

repeated_kf_results = copy.copy(repeated_kf_results_mm)
repeated_kf_results.loc['ulhaq'] = repeated_kf_results_ss.loc['ulhaq']

repeated_split_results = copy.copy(repeated_split_results_mm)
repeated_split_results.loc['ulhaq'] = repeated_split_results_ss.loc['ulhaq']

for summary_table in (repeated_kf_results, repeated_split_results):
    display(summary_table)

# Persist both combined summaries in the Results folder.
for summary_table, file_name in (
    (repeated_kf_results, 'repeated_kfold restricted OPD 27Jan22.pickle'),
    (repeated_split_results, 'repeated_traintest restricted OPD 27Jan22.pickle'),
):
    dl.pickleSave(summary_table, join('Results', file_name))

### with mPower
Using the best models found from previous grid search

In [None]:
# Protocol for the size-restricted mPower runs.
kfold_splits = 5    # passed as n_splits to the evaluation
repetitions = 10    # number of independent sub-sampling runs

# One seed per repetition: 0, 1, ..., repetitions - 1.
seeds = [*range(repetitions)]

######################## to_numpy method ########################

def to_numpy(mPower_samples):
    """Split an mPower sample table into a feature matrix and a label vector.

    Args:
        mPower_samples: table indexable by column name that contains the 22
            feature columns listed below and a 'professional-diagnosis'
            column.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the feature columns in the order
        listed (one row per sample) and ``y`` is the 'professional-diagnosis'
        column multiplied by 1 and converted to an int64 array.
    """
    feature_columns = [
        'Mean pitch (Hz)', 'Minimum pitch (Hz)', 'Maximum pitch (Hz)',
        'Jitter (local) (%)', 'Jitter (local, absolute)', 'Jitter (rap) (%)',
        'Jitter (ppq5) (%)', 'Jitter (ddp) (%)',
        'Shimmer (local) (%)', 'Shimmer (local, dB) (dB)', 'Shimmer (apq3) (%)',
        'Shimmer (apq5) (%)', 'Shimmer (apq11) (%)', 'Shimmer (dda) (%)',
        'Mean noise-to-harmonics ratio', 'Mean harmonics-to-noise ratio (dB)',
        'spread1 (negative entropy of F0)', 'spread2 (standard error of F0)',
        'PPE', 'DFA', 'RPDE', 'd2',
    ]

    feature_matrix = mPower_samples[feature_columns].to_numpy()

    # Multiplying by 1 turns the diagnosis flags into numbers before the cast.
    diagnosis = mPower_samples['professional-diagnosis']
    labels = (diagnosis * 1).to_numpy(dtype='int64')

    return feature_matrix, labels

#################################################################


#The model we will be using for Ozkan is "ozkan PCA_16 k_11 StandardScaler X only"
#This was the best model for both 10fold CV and 70/30 split

#The model we will be using for Caliskan is "caliskan ReLU Sigmoid latent:6, epochs:50, lr:0.0030 StandardScaler X only"
#This was the best model for both 10fold CV and 70/30 split

#The model we will be using for Ulhaq is "ulhaq rbf gamma:scale C:10 num_features:20 StandardScaler X only"
#This was the 2nd best model for 10fold CV and best for 70/30 split


# Global settings: every (scaler class, preprocessing-method label) pair.
# All mPower models are evaluated with StandardScaler.
preprocessors = [StandardScaler]
preprocessing_methods = ['X only']
global_settings = [
    (scaler, method)
    for scaler in preprocessors
    for method in preprocessing_methods
]

# Ozkan settings: every (components, k) combination - a single one here.
components = [16]
ks = [11]
ozkan_settings = [(n_components, k) for n_components in components for k in ks]

# Caliskan settings: each list holds one candidate value, so the product
# yields a single tuple ordered as
# (lrs, epochs, rhos, lams, Bs, activations, latent_size).
lrses = [[0.003, 0.003, 0.003, 0.003]]
epochses = [[50, 50, 50, 50]]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, nn.Sigmoid]]
latent_sizes = [6]
caliskan_settings = [*itertools.product(
    lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes,
)]

# Ul-Haq settings: every (kernel, gamma, C, num_features) combination.
kernels = ['rbf']
gammas = ['scale']
Cs = [10]
num_featureses = [20]  # best at 10 in paper
ulhaq_settings = [
    (kernel, gamma, C, num_features)
    for kernel in kernels
    for gamma in gammas
    for C in Cs
    for num_features in num_featureses
]

In [None]:
# Accumulators for the mPower runs.
kf_results_dict, split_results_dict = initialise_results_df()

# NOTE: the loop variable must stay named `rep` - process_results reads it as
# a global to decide which row of the accumulators to fill.
for rep, seed in enumerate(seeds[:repetitions]):
    # Seed both RNGs so the sub-sample drawn below is repeatable per run.
    random.seed(seed)
    np.random.seed(seed)

    # Randomly shrink mPower to 18300 samples, then rebuild the participant
    # table from the samples that remain (one row per healthCode).
    mPower_samples_reduced = mPower_samples.sample(n=18300)
    mPower_participants_reduced = (
        mPower_samples_reduced
        .drop_duplicates(subset=['healthCode'])
        .loc[:, ['healthCode', 'age', 'diagnosis-year', 'gender', 'onset-year', 'professional-diagnosis']]
    )

    kfold_results, traintest_results = evaluation.unmodified(
        'mPower',
        mPower_samples_reduced,
        mPower_participants_reduced,
        to_numpy,
        global_settings,
        ozkan_settings,
        caliskan_settings,
        ulhaq_settings,
        ozkan_method=ozkan,
        caliskan_method=caliskan,
        ulhaq_method=ulhaq,
        seeds=[seed],
        repetitions=1,
        verbose_odds=0,  # 0.025
        n_splits=kfold_splits,
        training_split=0.7,
    )

    kf_results_dict, split_results_dict = process_results(
        kf_results_dict, kfold_results, split_results_dict, traintest_results
    )

# Summarise all repetitions into one row per model.
repeated_kf_results, repeated_split_results = process_repeated_results(
    kf_results_dict, split_results_dict
)

In [None]:
# Show the mPower summaries: k-fold first, then train/test split.
for summary_table in (repeated_kf_results, repeated_split_results):
    display(summary_table)

# Persist both summaries in the Results folder.
for summary_table, file_name in (
    (repeated_kf_results, 'repeated_kfold restricted mPower 26Jan22.pickle'),
    (repeated_split_results, 'repeated_traintest restricted mPower 26Jan22.pickle'),
):
    dl.pickleSave(summary_table, join('Results', file_name))


We can see that although there is a decrease in performance, it is tiny compared to the decrease caused by all the modifications. Thus, we can conclude that the performance decrease is not due to the reduction in the number of training and test samples.