# This notebook determines the generalisation performance of the chosen models

## Load and prepare/preprocess data
OPD is the Oxford Parkinson's Disease dataset, retrieved from UCI. It was originally created by Max Little and is used extensively in PD classification research (https://archive.ics.uci.edu/ml/datasets/parkinsons).

mPower is the larger dataset, from the mPower study, a Parkinson's mobile application developed by Sage Bionetworks and described in Synapse (doi:10.7303/syn4993293).

In [None]:
import pandas as pd
import dataHandler as dh
from os.path import join

# Read the OPD sample table and the OPD participant table from the Data directory.
OPD_samples, OPD_participants = (
    pd.read_csv(join('Data', csv_name))
    for csv_name in ('OPD_data.csv', 'OPD_participants.csv')
)

# Explain up front why some of the counts in the summary below look short.
print('One of the participants (S31) does not have demographic data (i.e., age and gender), which is why their submissions are missing from counts that include anything to do with gender or age\n')

# Summarise the OPD data using the project helper.
dh.OPD_summary(OPD_samples, OPD_participants)

In [None]:
import dataLoader as dl
import mPower_data_adjustments as mda

# Load the original submission data, the features extracted from them, and the
# information needed to filter them.
# NOTE(review): dl.pickleLoad presumably unpickles these files; unpickling can
# execute arbitrary code, so only load pickles that come from a trusted source.
submissions = dl.pickleLoad(join('Data','submissions_Full.pickle'))
features = dl.pickleLoad(join('Data','combinedFeatures.pickle'))
d2 = dl.pickleLoad(join('Data','d2Feature5.pickle'))
rms_energy = dl.pickleLoad(join('Data','rms_energy_notNormalised.pickle'))

# Use the same key spelling ('recordId') as the other tables so they can be merged on it.
features = features.rename(columns={'recordID':'recordId'})

# Drop rows without a usable d2 value. Comparing against the string 'nan' alone
# misses genuinely missing values (float NaN / None compare unequal to 'nan'),
# so both representations are excluded.
d2 = d2[d2['d2'].notna() & (d2['d2'] != 'nan')]

# Filter out bad submissions. Each frame keeps only the rows that pass all of
# its thresholds. The trailing numbers are carried over from the original
# notes (presumably alternative thresholds that were tried - confirm).
rms_energy = rms_energy[
    (rms_energy['rms_1_mean'] > 300)           # 250
    & (rms_energy['rms_1_std'] < 2000)         # 2500
    & (rms_energy['energy_1_mean'] > 50000)    # 45000
]
features = features[
    (features['Degree of voice breaks (%)'] < 30)                 # 40
    & (features['Fraction of locally unvoiced frames (%)'] < 30)  # 40
]

# Combine the tables: start from the recordIds that survived the rms/energy
# filter, then inner-join the submissions, the filtered features and the d2
# values on 'recordId'. Finally drop rows whose gender is
# 'Prefer not to answer' (per the original note, this is intended to leave
# only male and female).
mPower_samples = (
    rms_energy[['recordId']]
    .merge(submissions, on='recordId')
    .merge(features, on='recordId')
    .merge(d2[['recordId', 'd2']], on='recordId', how='inner')
    .loc[lambda merged: ~(merged['gender'] == 'Prefer not to answer')]
)

# Clean-up steps from mPower_data_adjustments, applied in this order:
#   1. remove_bad_samples   - remove samples the author found to be bad by looking at
#                             the 10 most extreme values for all 22 features and listening
#   2. recalculate_spreads  - recalculate spread1 and spread2 (the original was wrong)
#   3. adjust_feature_scale - put several mPower features on the same scale as OPD,
#                             e.g. Jitter (%) is currently 2% but would be 0.02 in OPD
mPower_samples = (
    mPower_samples
    .pipe(mda.remove_bad_samples)
    .pipe(mda.recalculate_spreads)
    .pipe(mda.adjust_feature_scale)
)

# Participant table: the first row found for each healthCode, reduced to the
# participant-level columns.
mPower_participants = (
    mPower_samples
    .drop_duplicates(subset=['healthCode'])
    .loc[:, ['healthCode', 'age', 'diagnosis-year', 'gender', 'onset-year', 'professional-diagnosis']]
)

# Print a summary of the dataset.
dh.mPower_summary(mPower_samples, mPower_participants)

## Load and evaluate the models

In [None]:
from ozkan_model import ozkan
from caliskan_model import caliskan
from ulhaq_model import ulhaq

import itertools
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import torch.nn as nn
from datetime import date
import time

import evaluation
import resultsHandler as rh

### Define the search space for gridsearch. 
This is narrower than the search space in our gridsearch notebook, because the main purpose here is comparison, not finding the best model. We used the earlier gridsearch notebook to choose a reasonable search space.

In [None]:
# kfold_splits is passed to the evaluation calls as n_splits;
# test_repetitions is passed through under its own name.
kfold_splits, test_repetitions = 5, 10

###################### Seeds ######################
# One seed per test repetition: 0, 1, ..., test_repetitions - 1.
test_seeds = [*range(test_repetitions)]
###################################################

######################## to_numpy method ########################

def to_numpy(mPower_samples):
    """Split a samples frame into a feature matrix and a label vector.

    Parameters
    ----------
    mPower_samples : pandas.DataFrame
        Must contain the 22 feature columns listed below and a
        'professional-diagnosis' column.

    Returns
    -------
    X : numpy.ndarray
        The feature columns, in the order listed below, one row per sample.
    y : numpy.ndarray
        'professional-diagnosis' multiplied by 1 and converted to int64
        (so boolean labels become 1/0).
    """
    feature_columns = [
        # pitch
        'Mean pitch (Hz)', 'Minimum pitch (Hz)', 'Maximum pitch (Hz)',
        # jitter
        'Jitter (local) (%)', 'Jitter (local, absolute)', 'Jitter (rap) (%)',
        'Jitter (ppq5) (%)', 'Jitter (ddp) (%)',
        # shimmer
        'Shimmer (local) (%)', 'Shimmer (local, dB) (dB)', 'Shimmer (apq3) (%)',
        'Shimmer (apq5) (%)', 'Shimmer (apq11) (%)', 'Shimmer (dda) (%)',
        # noise ratios
        'Mean noise-to-harmonics ratio', 'Mean harmonics-to-noise ratio (dB)',
        # remaining measures
        'spread1 (negative entropy of F0)', 'spread2 (standard error of F0)',
        'PPE', 'DFA', 'RPDE', 'd2',
    ]

    feature_matrix = mPower_samples.loc[:, feature_columns].to_numpy()

    # Multiplying by 1 turns boolean labels into integers before the int64 conversion.
    numeric_labels = mPower_samples['professional-diagnosis'] * 1
    return feature_matrix, numeric_labels.to_numpy(dtype='int64')

#################################################################


######################## Global settings ########################
# Settings shared by all models: every (preprocessor class, preprocessing method)
# pair. With one entry per list this is currently a single combination.
preprocessors = [StandardScaler]
preprocessing_methods = ['X only']
global_settings = [
    (scaler, method)
    for scaler in preprocessors
    for method in preprocessing_methods
]
##################################################################

######################## Ozkan settings ########################
# Grid for the Ozkan model: every (components, k) pairing, with k varying fastest.
# NOTE(review): what each value controls is defined in ozkan_model - confirm there.
components = [8, 12, 16]
ks = [5, 9, 11]
ozkan_settings = [(n_components, k) for n_components in components for k in ks]
################################################################

######################## Caliskan settings ########################
# Grid for the Caliskan model. Each *ses list holds the candidate values for one
# hyper-parameter; a candidate may itself be a list.
# NOTE(review): the meaning of the per-candidate list positions is defined in
# caliskan_model - confirm there.
lrses = [[rate] * 4 for rate in (0.003, 0.03)]
epochses = [[50] * 4]
rhoses = [[0.15, 0.25]]
lamses = [[0.03, 0.03]]
Bses = [[2, 2]]
activationses = [[nn.ReLU, second] for second in (nn.Sigmoid, nn.ReLU)]
latent_sizes = [4, 6]
# Full cartesian product, in the argument order below (last axis varies fastest).
caliskan_settings = [
    *itertools.product(lrses, epochses, rhoses, lamses, Bses, activationses, latent_sizes)
]
###################################################################

######################## Ul-Haq settings ########################
# Grid for the Ul-Haq model: every (kernel, gamma, C, num_features) combination,
# with num_features varying fastest.
kernels = ['rbf']
gammas = ['scale', 'auto', 0.2]
Cs = [1, 10]
num_featureses = [10, 15, 20]  # best at 10 in paper
ulhaq_settings = [
    (kernel, gamma, penalty, n_selected)
    for kernel in kernels
    for gamma in gammas
    for penalty in Cs
    for n_selected in num_featureses
]
#################################################################



In [None]:
# Run the project's "unmodified" generalisation evaluation for all three models.
# NOTE(review): the meaning of test_size, validation_repetitions and
# verbose_odds is defined in the evaluation module - confirm there.
unmodified_results = evaluation.generalisationPerformance_unmodified(
    # data, plus the frame -> (X, y) converter
    OPD_samples=OPD_samples,
    OPD_participants=OPD_participants,
    mPower_samples=mPower_samples,
    test_size=100,
    to_numpy=to_numpy,
    # hyper-parameter grids
    global_settings=global_settings,
    ozkan_settings=ozkan_settings,
    caliskan_settings=caliskan_settings,
    ulhaq_settings=ulhaq_settings,
    # model implementations
    ozkan_method=ozkan,
    caliskan_method=caliskan,
    ulhaq_method=ulhaq,
    # evaluation protocol
    test_seeds=test_seeds,
    validation_repetitions=1,
    test_repetitions=test_repetitions,
    verbose_odds=0,
    n_splits=kfold_splits,
)
unmodified_results

In [None]:
# Run the project's "twinned" generalisation evaluation for all three models.
# NOTE(review): the meaning of test_size, validation_repetitions, verbose_odds
# and age_range is defined in the evaluation module - confirm there.
twinned_results = evaluation.generalisationPerformance_twinned(
    # data, plus the frame -> (X, y) converter
    OPD_samples=OPD_samples,
    OPD_participants=OPD_participants,
    mPower_samples=mPower_samples,
    test_size=100,
    to_numpy=to_numpy,
    # hyper-parameter grids
    global_settings=global_settings,
    ozkan_settings=ozkan_settings,
    caliskan_settings=caliskan_settings,
    ulhaq_settings=ulhaq_settings,
    # model implementations
    ozkan_method=ozkan,
    caliskan_method=caliskan,
    ulhaq_method=ulhaq,
    # evaluation protocol
    test_seeds=test_seeds,
    validation_repetitions=1,
    test_repetitions=test_repetitions,
    verbose_odds=0,
    n_splits=kfold_splits,
    age_range=3,
)
twinned_results

In [None]:
# Display both result sets and pickle each one under Results/, with today's
# date in the file name. The date stamp is computed once so both files always
# carry the same stamp, and the display/save steps live in a single loop rather
# than two copies.
d = date.today().strftime("%d%B%y")

for heading, results, file_stem in (
    ('Unmodified results:\n', unmodified_results, 'unmodified generalisation '),
    ('Twinned results:\n', twinned_results, 'twinned generalisation '),
):
    print(heading)
    display(results)  # display() is provided by the IPython/Jupyter session
    dl.pickleSave(results, join('Results', file_stem + d + '.pickle'))