# Classification of Heart Rate Variability data (RR-series)

In [1]:
# import modules
using Pkg
using CSV
using DataFrames
using Ripserer
using Plots
using MultivariateStats
using Statistics
using MultivariateAnomalies
using LIBSVM
using GLM
using StatsBase
using Lathe
using MLBase
using ClassImbalance
using ROCAnalysis
using Lathe.preprocess: TrainTestSplit
using Random

## Loading data

In [2]:
# Read the control (CON) and stroke (STR) RR-interval tables; header=false, so the
# first row of each file is read as data.
Control = CSV.read("C:\\Users\\qwang\\OneDrive\\Desktop\\rank functions\\Data\\normalRRs_CON.csv", header=false)
Stroke = CSV.read("C:\\Users\\qwang\\OneDrive\\Desktop\\rank functions\\Data\\normalRRs_STR.csv", header=false)
# Split each table into its first column (presumably the subject identifier - confirm)
# and a Float64 matrix built from columns 2:501 (length reduced because of missing values).
Control_subjects = Control[:, 1];
Control_data = Matrix{Float64}(Control[:, 2:501]);
# Row 26 of the stroke table is removed before the table is split.
Stroke = Stroke[setdiff(1:end, 26), :];
Stroke_subjects = Stroke[:, 1];
Stroke_data = Matrix{Float64}(Stroke[:, 2:501]);

In [3]:
# Group sizes are the row counts of the two data matrices.
Control_size = size(Control_data, 1);
Stroke_size = size(Stroke_data, 1);
# Create uninitialised arrays of arrays, one slot per row of each data matrix.
Control_barcodes = Vector{Array{Float64}}(undef, Control_size);
Stroke_barcodes = Vector{Array{Float64}}(undef, Stroke_size);

## Define functions

In [4]:
# Copy an indexable collection of intervals (e.g. the persistence diagram returned by
# ripserer) into a length(result) x 2 Float64 matrix: column 1 takes element [1] of
# each interval and column 2 takes element [2].
function PD_to_array(result)
    n = length(result)
    output = Array{Float64}(undef, (n, 2))
    for k in 1:n
        interval = result[k]
        output[k, 1] = interval[1]
        output[k, 2] = interval[2]
    end
    return output
end

# For every row of RR intervals, run ripserer on the Cubical filtration of that row and
# keep the first diagram it returns, stored as a two-column array.
for i in 1:Control_size
    Control_barcodes[i] = PD_to_array(first(ripserer(Cubical(Control_data[i, :]))));
end
for j in 1:Stroke_size
    Stroke_barcodes[j] = PD_to_array(first(ripserer(Cubical(Stroke_data[j, :]))));
end

# Main entry point for converting a barcode to a rank function.
# The barcode values are mapped from [minval, maxval] onto ordinals with
# interval2ordinal, shifted up by one, and handed to barcode2rankfun, which returns
# a maxord x maxord matrix.
function B2rank(Barcode,minval,maxval,maxord)
    to_ordinal = interval2ordinal(minval, maxval, maxord)
    shifted = to_ordinal.(Barcode) .+ 1
    return barcode2rankfun(shifted, maxord)
end

# Build the gridmax x gridmax rank matrix of a barcode.
# B holds one interval per row (column 1 = start, column 2 = end); the values are used
# directly as matrix indices, so B is expected to be integer valued.
# For every interval, each entry M[r, c] with r and c inside the interval is
# incremented by one.
# Returns an all-zero matrix for an empty B. If any value of B exceeds gridmax
# (e.g. infinity from an essential cycle) the maximum and an error message are printed
# and `nothing` is returned.
function barcode2rankfun(B,gridmax)
    isempty(B) && return zeros(Int64, gridmax, gridmax)
    top = maximum(B)
    if top > gridmax
        print(top)
        print("Error: B may take values no greater than gridmax.")
        return
    end
    M = zeros(Int64, gridmax, gridmax)
    for p in 1:size(B, 1)
        lo, hi = B[p, 1], B[p, 2]
        for c in lo:hi, r in lo:hi
            M[r, c] += 1
        end
    end
    return M
end

# Return a function that rescales a value u from the interval [a, b] to [0, 1],
# multiplies by maxord and rounds up to an Int64.
function interval2ordinal(a,b,maxord)
    return u -> ceil.(Int64, ((u - a) / (b - a)) * maxord)
end

# Linearly rescale the values of B so that minimum(B) maps to lowerlim and maximum(B)
# maps to upperlim (changing range of interval).
#   B         array of numbers; returned unchanged when empty
#   lowerlim  lower end of the target range (default 0)
#   upperlim  upper end of the target range (default 1)
# Returns a new array with the shape of B.
# When all values of B are equal there is no range to stretch, so every entry maps to
# lowerlim instead of dividing by zero.
function expand2interval(B;lowerlim=0,upperlim=1) # changing range of interval
    if isempty(B)
        return B
    end
    minval, maxval = extrema(B)
    span = maxval - minval
    # Array-with-scalar subtraction/addition has to be broadcast (`.-`, `.+`);
    # the un-dotted forms throw a MethodError on Julia 1.x.
    unit = iszero(span) ? zero.(float.(B)) : (B .- minval) ./ span
    return unit .* (upperlim - lowerlim) .+ lowerlim
end

# Largest value of every barcode, leaving out its last row, then the overall maximum.
# NOTE(review): the last row presumably holds the infinite (essential) interval that is
# overwritten at the end of this cell - confirm.
maxs = vcat(
    Float64[maximum(a[1:(end-1), :]) for a in Control_barcodes],
    Float64[maximum(b[1:(end-1), :]) for b in Stroke_barcodes],
)
maxi = maximum(maxs);

# Smallest value of every barcode (last row left out again), then the overall minimum.
mins = vcat(
    Float64[minimum(a[1:(end-1), :]) for a in Control_barcodes],
    Float64[minimum(b[1:(end-1), :]) for b in Stroke_barcodes],
)
mini = minimum(mins);

# For ease of use, overwrite the end value in the last row of every barcode
# (the infinity) with maxi.
for group in (Control_barcodes, Stroke_barcodes), bc in group
    bc[end, 2] = maxi;
end


In [5]:
# Convert barcodes into rank functions (one 100 x 100 matrix per subject)
# gridsize = 100
f_mini = floor(mini);
c_maxi = ceil(maxi)+15 # add on some values to ensure max of barcodes does not go over gridmax when converted usually (c_maxi-f_mini)/gridmax would be sufficient
Control_rankfunc = Array{Float64}[B2rank(bc, f_mini, c_maxi, 100) for bc in Control_barcodes];
Stroke_rankfunc = Array{Float64}[B2rank(bc, f_mini, c_maxi, 100) for bc in Stroke_barcodes];

# Flatten every rank function matrix into a vector
Control_rankvec = Array{Float64}[vec(R) for R in Control_rankfunc];
Stroke_rankvec = Array{Float64}[vec(R) for R in Stroke_rankfunc];

# Collect the vectors as the columns of one d x (group size) matrix per group
d = length(Control_rankvec[1])
Control_rankmatrix_all = Array{Float64}(undef, d, Control_size);
Stroke_rankmatrix_all = Array{Float64}(undef, d, Stroke_size);
for (col, v) in enumerate(Control_rankvec)
    Control_rankmatrix_all[:, col] = v
end
for (col, v) in enumerate(Stroke_rankvec)
    Stroke_rankmatrix_all[:, col] = v
end

# Mean rank function of each group (mean over the columns)
Control_meanrank = mean(Control_rankmatrix_all, dims=2)
Stroke_meanrank = mean(Stroke_rankmatrix_all, dims=2)
# Centred rank matrices: every column minus the mean of its own group.
# NOTE(review): each group is centred with its own mean, so the centred features depend
# on group membership - confirm this is intended before they are used for classification.
Control_rankmatrix_centred = Control_rankmatrix_all .- Control_meanrank;
Stroke_rankmatrix_centred = Stroke_rankmatrix_all .- Stroke_meanrank;

In [8]:
# Put the control columns and the stroke columns side by side (control first),
# once for the centred matrices and once for the uncentred ones.
Rankmatrix_all_centred = [Control_rankmatrix_centred Stroke_rankmatrix_centred];
Rankmatrix_all = [Control_rankmatrix_all Stroke_rankmatrix_all];

# Some visualization


In [None]:
# Examples of RR-series: the first five rows of Stroke_data, one line per row.
plot(1:500, Stroke_data[1:5,:]', legend=true, ylabel="RR-intervals", xlabel="Time")

In [None]:
# One series from a healthy individual (row 8 of Control_data) and one series from a
# stroke patient (row 5 of Stroke_data) in the same plot.
plot(
    1:500,
    hcat(Control_data[8,:], Stroke_data[5,:]),
    label=["Healthy individual 1" "Stroke patient 1"],
    ylabel="RR-intervals",
    xlabel="Time",
)

In [None]:
# Compute the persistence result for the first control series (Cubical filtration)
# and plot it; `bars` is reused by the barcode plot in the next cell.
bars = ripserer(Cubical(Control_data[1,:]))
plot(bars)
# savefig("pd.png") 

In [None]:
# Plot `bars` (computed in the previous cell) as a barcode.
barcode(bars, linewidth=3)
# savefig("barcodes.png") 

In [None]:
# Heatmap of the first control rank function on a white-to-blue colour gradient;
# the 1:100 axes match the grid size of 100 passed to B2rank.
heatmap(1:100,1:100,Control_rankfunc[1],c=cgrad([:white, :blue]))

In [None]:
# Plot the first control rank function as a 3D surface with the PyPlot backend.
# Both axes run from f_mini to c_maxi in 100 steps.
pyplot()
x = range(f_mini, stop=c_maxi, length=100)
y = range(f_mini, stop=c_maxi, length=100)
surface(x, y, Control_rankfunc[1], camera=(-10,30))

# Train SVM on rank functions


In [9]:
# One label per rank function: 0 for control and 1 for stroke, as a single row in the
# same column order as Rankmatrix_all_centred.
labels = [zeros(1, Control_size) ones(1, Stroke_size)];
# Labels become the first row, above the centred rank functions.
Data_all = [labels; Rankmatrix_all_centred];

# Per-group data frames (one row per subject); the control frame supplies the
# feature column names used below.
Control_rank = DataFrame(Control_rankmatrix_centred');
Stroke_rank = DataFrame(Stroke_rankmatrix_centred');
# Final data frame: one row per subject, "labels" column first, then the features.
colnames = vcat(["labels"], names(Control_rank));
df = rename!(DataFrame(Data_all'), Symbol.(colnames));

In [74]:
# Repeated cross-validation of an SVM with svmtrain's default settings on `df`:
# 10 repetitions of 5-fold CV. Model performance is very variable between repetitions.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1])
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end

In [75]:
# Print the mean accuracy (%) of every repetition, then the mean over all repetitions.
print(ac) # mean fold accuracy per repetition
print("Average accuracy: \n", mean(ac))

[74.70588235294117, 73.3986928104575, 74.44444444444443, 76.66666666666666, 64.9673202614379, 72.09150326797385, 74.37908496732027, 80.13071895424837, 75.49019607843138, 79.34640522875817]Average accuracy: 
74.56209150326796

In [76]:
# Print the mean AUC of every repetition, then the mean over all repetitions.
print(aucs_rec) # mean fold AUC per repetition
print("Average AUC_ROC: \n", mean(aucs_rec))

[0.7584199134199133, 0.7296969696969697, 0.7322619047619048, 0.7691666666666667, 0.6906080031080031, 0.743531746031746, 0.7460042735042736, 0.790873015873016, 0.7328571428571429, 0.7913888888888889]Average AUC_ROC: 
0.7484808524808525

## Linear Kernel


In [10]:
# Repeated cross-validation of a linear-kernel SVM on `df`:
# 10 repetitions of 5-fold CV. Model performance is very variable between repetitions.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1], kernel=Kernel.Linear
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Per-repetition accuracies, then their mean and standard deviation.
print(ac, "Average accuracy on Linear Kernel: \n", mean(ac), "sd: \n", std(ac))
# Per-repetition AUCs, then their mean and standard deviation.
print(aucs_rec, "Average AUC_ROC on Linear Kernel: \n", mean(aucs_rec), "sd: \n", std(aucs_rec))

[34.37908496732026, 39.607843137254896, 29.2156862745098, 37.84313725490197, 36.66666666666667, 35.294117647058826, 41.11111111111111, 44.18300653594771, 49.28104575163398, 47.385620915032675]Average accuracy on Linear Kernel: 
39.496732026143796sd: 
6.172006009770957[0.35892857142857143, 0.40062409812409816, 0.28575396825396826, 0.40241647241647244, 0.36949494949494943, 0.37144688644688645, 0.42725829725829734, 0.46036075036075036, 0.4986330336330337, 0.5048096348096348]Average AUC_ROC on Linear Kernel: 
0.4079726662226662sd: 
0.06745815246922615

## Sigmoid Kernel

In [78]:
# Repeated cross-validation of a sigmoid-kernel SVM on `df`:
# 10 repetitions of 5-fold CV. Model performance is very variable between repetitions.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for k in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    shuffled_df = df[shuffle(axes(df, 1)), :];
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    # record accuracy and AUC of every fold
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        train_set = shuffled_df[folds_allocation[i],:];
        test_set = shuffled_df[Not(folds_allocation[i]),:];
        # Column 1 is the label; the other columns are transposed and used as features.
        model = svmtrain(convert(Array, train_set[:,2:end])',train_set[:,1], kernel= Kernel.Sigmoid)
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # accuracy (%) on the held-out rows
        test_accuracy[i] = sum(predicted_labels .== test_set[:,1])/length(test_set[:,1])*100;
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int},predicted_labels), convert(Array{Int},test_set[:,1]));
    end
    ac[k] = mean(test_accuracy);
    ac_list[:,k] = test_accuracy;
    aucs_rec[k] = mean(AUCs);
    auc_list[:,k] = AUCs;
end
# The messages name the kernel actually trained above (they used to say "Linear Kernel").
print(ac) # mean fold accuracy per repetition
print("Average accuracy on Sigmoid Kernel: \n", mean(ac))
print(aucs_rec) # mean fold AUC per repetition
print("Average AUC_ROC on Sigmoid Kernel: \n", mean(aucs_rec))

[25.62091503267974, 30.26143790849673, 23.333333333333332, 26.862745098039216, 26.79738562091503, 29.084967320261438, 32.48366013071895, 31.241830065359476, 24.640522875816995, 24.37908496732026]Average accuracy on Linear Kernel: 
27.470588235294116[0.2904700854700855, 0.3142352092352092, 0.22873015873015873, 0.32456349206349205, 0.26535714285714285, 0.28583333333333333, 0.3297727272727273, 0.31452380952380954, 0.261978021978022, 0.2625793650793651]Average AUC_ROC on Linear Kernel: 
0.2878043345543346

## Polynomial Kernel

In [11]:
# Repeated cross-validation of a polynomial-kernel SVM (no degree passed to svmtrain)
# on `df`: 10 repetitions of 5-fold CV.
# Model performance is very variable between repetitions.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for k in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    shuffled_df = df[shuffle(axes(df, 1)), :];
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    # record accuracy and AUC of every fold
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        train_set = shuffled_df[folds_allocation[i],:];
        test_set = shuffled_df[Not(folds_allocation[i]),:];
        # Column 1 is the label; the other columns are transposed and used as features.
        model = svmtrain(convert(Array, train_set[:,2:end])',train_set[:,1], kernel= Kernel.Polynomial)
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # accuracy (%) on the held-out rows
        test_accuracy[i] = sum(predicted_labels .== test_set[:,1])/length(test_set[:,1])*100;
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int},predicted_labels), convert(Array{Int},test_set[:,1]));
    end
    ac[k] = mean(test_accuracy);
    ac_list[:,k] = test_accuracy;
    aucs_rec[k] = mean(AUCs);
    auc_list[:,k] = AUCs;
end
# The messages name the kernel actually trained above (they used to say "Linear Kernel").
#print(ac) # mean fold accuracy per repetition
print("Average accuracy on Polynomial Kernel: \n", mean(ac), "sd: \n", std(ac))
#print(aucs_rec) # mean fold AUC per repetition
print("Average AUC_ROC on Polynomial Kernel: \n", mean(aucs_rec), "sd: \n", std(aucs_rec))

Average accuracy on Linear Kernel: 
82.75163398692811sd: 
2.2704932858435294Average AUC_ROC on Linear Kernel: 
0.8379918692418693sd: 
0.020340080521201325

In [13]:
# Repeated cross-validation of a degree-2 polynomial-kernel SVM on `df`:
# 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=2,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean and standard deviation of the accuracy, then of the AUC.
print("Average accuracy on Polynomial 2 Kernel: \n", mean(ac), "sd: \n", std(ac))
print("Average AUC_ROC on polynomial 2 Kernel: \n", mean(aucs_rec), "sd: \n", std(aucs_rec))

Average accuracy on Polynomial 2 Kernel: 
82.56862745098039sd: 
1.2309078987097506Average AUC_ROC on polynomial 2 Kernel: 
0.828917748917749sd: 
0.013832509896864133

In [14]:
# Repeated cross-validation of a degree-3 polynomial-kernel SVM on `df`:
# 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=3,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Per-repetition accuracies and their mean, then per-repetition AUCs and their mean.
print(ac, "Average accuracy on Polynomial 3 Kernel: \n", mean(ac))
print(aucs_rec, "Average AUC_ROC on polynomial 3 Kernel: \n", mean(aucs_rec))
# Standard deviations: accuracy first, AUC second.
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

[79.15032679738563, 83.72549019607843, 79.281045751634, 81.37254901960785, 77.97385620915034, 83.72549019607843, 82.41830065359476, 82.61437908496733, 80.3921568627451, 84.9673202614379]Average accuracy on Polynomial 3 Kernel: 
81.56209150326798[0.8, 0.8608585858585858, 0.807936507936508, 0.8269444444444444, 0.7903663003663004, 0.836926406926407, 0.8286111111111112, 0.8394444444444444, 0.8185714285714287, 0.8799825174825175]Average AUC_ROC on polynomial 3 Kernel: 
0.8289641747141747sd: 
2.311720346096856sd: 
0.027236522017148476

In [15]:
# Repeated cross-validation of a degree-3 polynomial-kernel SVM on `df`
# (same model settings as the previous cell, reported as "Polynomial Kernel"):
# 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=3,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Per-repetition accuracies and their mean, then per-repetition AUCs and their mean.
print(ac, "Average accuracy on Polynomial Kernel: \n", mean(ac))
print(aucs_rec, "Average AUC_ROC on polynomial Kernel: \n", mean(aucs_rec))
# Standard deviations: accuracy first, AUC second.
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

[81.30718954248366, 84.83660130718954, 79.01960784313727, 83.8562091503268, 81.43790849673202, 82.54901960784314, 82.61437908496733, 81.37254901960785, 83.79084967320262, 84.9673202614379]Average accuracy on Polynomial Kernel: 
82.57516339869281[0.8278751803751805, 0.8519769119769119, 0.7969733044733045, 0.845544733044733, 0.8144841269841269, 0.8133333333333332, 0.8322619047619048, 0.8199025974025973, 0.8454545454545453, 0.860108225108225]Average AUC_ROC on polynomial Kernel: 
0.8307914862914864sd: 
1.8543364083836058sd: 
0.019957116235096897

In [16]:
# Repeated cross-validation of a degree-5 polynomial-kernel SVM on `df`:
# 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=5,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on Polynomial 5 Kernel: \n", mean(ac))
print("Average AUC_ROC on polynomial 5 Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on Polynomial 5 Kernel: 
76.04575163398691Average AUC_ROC on polynomial 5 Kernel: 
0.7748165723165725sd: 
2.6058776080315282sd: 
0.020157481141709458

In [17]:
# Repeated cross-validation of a radial-basis-kernel SVM on `df`:
# 10 repetitions of 5-fold CV. The call also passes degree=3.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df[shuffle(axes(df, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.RadialBasis, degree=3,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on RBF Kernel: \n", mean(ac))
print("Average AUC_ROC on RBF Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on RBF Kernel: 
75.76470588235294Average AUC_ROC on RBF Kernel: 
0.7615501720501722sd: 
4.537012844288927sd: 
0.03730722474196066

# PCA & SVM

In [18]:
# Fit a PCA model (at most 200 output dimensions) on the centred rank functions of all
# subjects and project the same matrix onto the principal components.
# NOTE(review): the PCA is fitted on every subject, i.e. before any train/test split.
M = MultivariateStats.fit(PCA,Rankmatrix_all_centred,maxoutdim=200);# where 0.99 of variances preserved in the principal subspace
Rankmatrix_pc = MultivariateStats.transform(M,Rankmatrix_all_centred);
# Display the size of the projected data (components x subjects).
size(Rankmatrix_pc)

(30, 86)

In [19]:
# The data is reduced to 30 principal components (see the size displayed above),
# which according to the PCA cell still retain 99% of the variance.
# Put the labels on top of the component scores and turn the result into a data frame
# with one row per subject: "labels" first, then the component columns.
data_pc = [labels; Rankmatrix_pc]
df_pc = DataFrame(data_pc')
# Reuse the generated names of all but the last column for the component columns.
colnames = vcat(["labels"], names(df_pc)[1:end-1])
rename!(df_pc, Symbol.(colnames))
first(df_pc, 5)

Unnamed: 0_level_0,labels,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.0,-172.589,182.782,68.8925,-18.6114,-164.719,60.3388,-54.9248,88.8277
2,0.0,258.017,-28.9971,-3.27234,3.7357,-80.6344,-7.26449,-27.8529,5.42965
3,0.0,341.043,-67.9044,-13.7642,-27.3186,-32.242,22.2072,-19.9863,11.9234
4,0.0,-319.083,-666.266,-71.3241,251.432,-68.6331,125.647,-32.2654,-9.06935
5,0.0,-176.036,207.247,60.4541,19.6871,-236.378,9.38392,-55.6536,-115.045


In [20]:
# Repeated cross-validation of an SVM with svmtrain's default settings on the PCA
# scores in `df_pc`: 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df_pc[shuffle(axes(df_pc, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1])
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy: \n", mean(ac))
print("Average AUC_ROC: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy: 
47.44444444444444Average AUC_ROC: 
0.5066666666666666sd: 
5.41651372639803sd: 
0.016101529717988283

In [21]:
# Repeated cross-validation of a linear-kernel SVM on the PCA scores in `df_pc`:
# 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df_pc[shuffle(axes(df_pc, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1], kernel=Kernel.Linear
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on Linear Kernel: \n", mean(ac))
print("Average AUC_ROC on Linear Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on Linear Kernel: 
66.84313725490196Average AUC_ROC on Linear Kernel: 
0.6807392607392607sd: 
7.941829440804971sd: 
0.08163499727277132















In [22]:
# Repeated cross-validation of a radial-basis-kernel SVM on the PCA scores in `df_pc`:
# 10 repetitions of 5-fold CV. The call also passes degree=3.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df_pc[shuffle(axes(df_pc, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.RadialBasis, degree=3,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on RBF Kernel: \n", mean(ac))
print("Average AUC_ROC on RBF Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on RBF Kernel: 
50.31372549019608Average AUC_ROC on RBF Kernel: 
0.5016666666666667sd: 
4.304450758722694sd: 
0.005270462766947316

In [27]:
# Repeated cross-validation of a degree-2 polynomial-kernel SVM on the PCA scores in
# `df_pc`: 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df_pc[shuffle(axes(df_pc, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=2,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on Polynomial 2 Kernel: \n", mean(ac))
print("Average AUC_ROC on Polynomial 2 Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on Polynomial 2 Kernel: 
84.19607843137254Average AUC_ROC on Polynomial 2 Kernel: 
0.8422443944943945sd: 
1.7848443432019958sd: 
0.024060892499139355

In [31]:
# Repeated cross-validation of a degree-3 polynomial-kernel SVM on the PCA scores in
# `df_pc`: 10 repetitions of 5-fold CV.
iter = 10
ac = zeros(iter)                          # mean fold accuracy (%) per repetition
ac_list = Array{Float64}(undef,5,iter);   # accuracy (%) of every fold
aucs_rec = zeros(iter)                    # mean fold AUC per repetition
auc_list = Array{Float64}(undef,5,iter);  # AUC of every fold
for rep in 1:iter
    # New random row order, then a fresh set of Kfold index vectors, per repetition.
    perm_df = df_pc[shuffle(axes(df_pc, 1)), :]
    train_splits = collect(Kfold(Control_size + Stroke_size, 5))
    fold_acc = zeros(5)
    fold_auc = zeros(5)
    for fold in 1:5
        # Rows listed in the split are fitted on; all remaining rows are scored.
        fit_rows = perm_df[train_splits[fold], :]
        eval_rows = perm_df[Not(train_splits[fold]), :]
        true_labels = eval_rows[:, 1]
        # Column 1 is the label; the other columns are transposed and used as features.
        svm_model = svmtrain(
            convert(Array, fit_rows[:, 2:end])', fit_rows[:, 1],
            kernel=Kernel.Polynomial, degree=3,
        )
        pred_labels = first(svmpredict(svm_model, convert(Array, eval_rows[:, 2:end])'))
        fold_acc[fold] = sum(pred_labels .== true_labels) / length(true_labels) * 100
        # NOTE(review): the AUC is fed the predicted class labels, not decision values.
        fold_auc[fold] = MultivariateAnomalies.auc(
            convert(Array{Int}, pred_labels), convert(Array{Int}, true_labels)
        )
    end
    ac[rep] = mean(fold_acc)
    ac_list[:, rep] = fold_acc
    aucs_rec[rep] = mean(fold_auc)
    auc_list[:, rep] = fold_auc
end
# Mean accuracy, mean AUC, then the standard deviations (accuracy first, AUC second).
print("Average accuracy on Polynomial 3 Kernel: \n", mean(ac))
print("Average AUC_ROC on Polynomial 3 Kernel: \n", mean(aucs_rec))
print("sd: \n", std(ac), "sd: \n", std(aucs_rec))

Average accuracy on Polynomial 3 Kernel: 
82.01307189542483Average AUC_ROC on Polynomial 3 Kernel: 
0.8287987845487844sd: 
2.0388246091677513sd: 
0.02020303788495808

In [32]:
# Degree-5 polynomial-kernel SVM on `df_pc`, evaluated with 10 repetitions of
# 5-fold cross-validation. Column 1 of the data frame supplies the labels and
# columns 2:end supply the features.
iter = 10
ac = zeros(iter)
ac_list = Array{Float64}(undef, 5, iter);    # rows: folds, columns: repetitions
aucs_rec = zeros(iter)
auc_list = Array{Float64}(undef, 5, iter);   # rows: folds, columns: repetitions
for k in eachindex(ac)
    # permute the rows, then draw the 5-fold partition for this repetition
    row_order = shuffle(axes(df_pc, 1))
    shuffled_df = df_pc[row_order, :];
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    # per-fold metrics of this repetition
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in eachindex(test_accuracy)
        # folds_allocation[i] selects the training rows; every other row is tested on
        train_set = shuffled_df[folds_allocation[i], :];
        test_set = shuffled_df[Not(folds_allocation[i]), :];
        model = svmtrain(
            convert(Array, train_set[:,2:end])',
            train_set[:,1],
            kernel=Kernel.Polynomial,
            degree=5,
        )
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # element-wise agreement between predictions and true labels
        hits = predicted_labels .== test_set[:,1]
        test_accuracy[i] = sum(hits)/length(hits)*100;
        # AUC is evaluated on the predicted labels (decision_values stays unused)
        AUCs[i] = MultivariateAnomalies.auc(
            convert(Array{Int}, predicted_labels),
            convert(Array{Int}, test_set[:,1]),
        );
    end
    # keep both the fold-level values and their mean for this repetition
    ac_list[:,k] = test_accuracy;
    auc_list[:,k] = AUCs;
    ac[k] = mean(test_accuracy);
    aucs_rec[k] = mean(AUCs);
end
# one print call; the items are emitted back to back exactly as separate calls would
print(
    "Average accuracy on Polynomial 5 Kernel: \n", mean(ac),
    "Average AUC_ROC on Polynomial 5 Kernel: \n", mean(aucs_rec),
    "sd: \n", std(ac),
    "sd: \n", std(aucs_rec),
)

Average accuracy on Polynomial 5 Kernel: 
75.50980392156863Average AUC_ROC on Polynomial 5 Kernel: 
0.7735602453102453sd: 
2.1910431779326003sd: 
0.023204564002515724

# Basis & SVM

In [35]:
using Wavelets

## Haar

In [42]:
# Haar-wavelet features: every column of `Rankmatrix_all_centred` is reshaped to a
# 100 x 100 matrix, transformed with `dwt` using the Haar wavelet, and the flattened
# coefficients are stored in the matching column of `Rankmatrix_W_haar`.
# The output dimensions and the loop range are taken from `Rankmatrix_all_centred`
# itself instead of the previously hard-coded 10000 x 86, so the cell does not need
# editing when the number of subjects changes.
Rankmatrix_W_haar = Array{Float64}(undef, size(Rankmatrix_all_centred));
for i in axes(Rankmatrix_all_centred, 2)
    # reshape requires each column to hold exactly 100*100 values
    rank_grid = reshape(Rankmatrix_all_centred[:,i], 100,100);
    # deliberately not called `transform`: packages loaded by this notebook may
    # export a function of that name
    haar_coeffs = dwt(rank_grid, wavelet(WT.haar));
    Rankmatrix_W_haar[:,i] = vec(haar_coeffs);
end
# put the labels on top as an extra row, then transpose so that the data frame has
# the label in column 1 and the wavelet coefficients in the remaining columns
data_W_haar = vcat(labels, Rankmatrix_W_haar);
df_W = DataFrame(data_W_haar');
# name the first column "labels" and shift the auto-generated names one to the right
colnames = vcat(["labels"],names(df_W)[1:end-1]);
rename!(df_W, Symbol.(colnames));

In [43]:
# Linear-kernel SVM on the Haar-wavelet data frame `df_W`: 10 repetitions of 5-fold
# cross-validation, with column 1 as the label and columns 2:end as features.
iter = 10
ac = zeros(iter)                              # per-repetition mean accuracy (%)
ac_list = Array{Float64}(undef, 5, iter);     # per-fold accuracy, one column per repetition
aucs_rec = zeros(iter)                        # per-repetition mean AUC
auc_list = Array{Float64}(undef, 5, iter);    # per-fold AUC, one column per repetition
for k in axes(ac_list, 2)
    # reshuffle the rows before every repetition
    shuffled_df = df_W[shuffle(axes(df_W, 1)), :];
    # 5-fold partition over the total number of subjects
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in axes(ac_list, 1)
        # rows listed in folds_allocation[i] are trained on, the rest are tested on
        train_set = shuffled_df[folds_allocation[i], :];
        test_set = shuffled_df[Not(folds_allocation[i]), :];
        test_labels = test_set[:,1];
        model = svmtrain(convert(Array, train_set[:,2:end])', train_set[:,1], kernel= Kernel.Linear)
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # accuracy in percent on the held-out rows
        n_correct = sum(predicted_labels .== test_labels)
        test_accuracy[i] = n_correct/length(test_labels)*100;
        # AUC from the predicted labels (decision_values is not used)
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int}, predicted_labels), convert(Array{Int}, test_labels));
    end
    ac[k] = mean(test_accuracy);
    aucs_rec[k] = mean(AUCs);
    ac_list[:,k] = test_accuracy;
    auc_list[:,k] = AUCs;
end
# summary over the repetitions; print does not append a newline after the values
print(stdout, "Average accuracy on Linear Kernel: \n", mean(ac))
print(stdout, "Average AUC_ROC on Linear Kernel: \n", mean(aucs_rec))
print(stdout, "sd: \n", std(ac))        # sd of the per-repetition accuracies
print(stdout, "sd: \n", std(aucs_rec))  # sd of the per-repetition AUCs

Average accuracy on Linear Kernel: 
39.65359477124183Average AUC_ROC on Linear Kernel: 
0.40374916749916745sd: 
6.6773413335374485sd: 
0.06894689884314317

In [44]:
# RBF-kernel SVM on the Haar-wavelet data frame `df_W`: 10 repetitions of 5-fold
# cross-validation, with column 1 as the label and columns 2:end as features.
iter = 10
ac = zeros(iter)                            # mean test accuracy (%) of each repetition
ac_list = Array{Float64}(undef,5,iter);     # per-fold accuracies, one column per repetition
aucs_rec = zeros(iter)                      # mean AUC of each repetition
auc_list = Array{Float64}(undef,5,iter);    # per-fold AUCs, one column per repetition
for k in 1:iter
    # first shuffle dataframe to ensure an even mix of control and stroke patients
    shuffled_df = df_W[shuffle(axes(df_W, 1)), :];
    # choose 5 folds for cross validation
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    # record accuracy and AUC
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in 1:5
        # folds_allocation[i] is used as the training rows, all other rows as test rows
        train_set = shuffled_df[folds_allocation[i],:];
        test_set = shuffled_df[Not(folds_allocation[i]),:];
        # `degree` only applies to the polynomial kernel, so the `degree=3` that used to
        # be passed here had no effect on the RBF model and has been dropped
        model = svmtrain(convert(Array, train_set[:,2:end])',train_set[:,1], kernel= Kernel.RadialBasis)
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # accuracy on test set, in percent
        test_accuracy[i] = sum(predicted_labels .== test_set[:,1])/length(test_set[:,1])*100;
        # NOTE(review): AUC is computed from the hard predicted labels, not from
        # decision_values -- confirm this is the intended definition
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int},predicted_labels), convert(Array{Int},test_set[:,1]));
    end
    ac[k] = mean(test_accuracy);
    ac_list[:,k] = test_accuracy;
    aucs_rec[k] = mean(AUCs);
    auc_list[:,k] = AUCs;
end
# mean over the repetitions of the per-repetition accuracy
print("Average accuracy on RBF Kernel: \n", mean(ac))
# mean over the repetitions of the per-repetition AUC
print("Average AUC_ROC on RBF Kernel: \n", mean(aucs_rec))
# standard deviation of the per-repetition accuracies, then of the per-repetition AUCs
print("sd: \n", std(ac))
print("sd: \n", std(aucs_rec))

Average accuracy on RBF Kernel: 
74.98692810457516Average AUC_ROC on RBF Kernel: 
0.7557225552225553sd: 
2.8319171924731066sd: 
0.02473558808802646

In [60]:
# Degree-2 polynomial-kernel SVM on the Haar-wavelet data frame `df_W`:
# 10 repetitions, each with a fresh shuffle and a fresh 5-fold split.
# Column 1 is used as the label, all remaining columns as features.
iter = 10
ac = zeros(iter)                          # mean test accuracy (%) per repetition
aucs_rec = zeros(iter)                    # mean AUC per repetition
ac_list = Array{Float64}(undef,5,iter);   # fold-level accuracies, one column per repetition
auc_list = Array{Float64}(undef,5,iter);  # fold-level AUCs, one column per repetition
for k in 1:iter
    # new random row order for this repetition
    shuffled_df = df_W[shuffle(axes(df_W, 1)), :];
    tot_num = Control_size + Stroke_size
    # each element of folds_allocation serves as the training rows of one fold
    folds_allocation = collect(Kfold(tot_num, 5))
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for (i, train_rows) in enumerate(folds_allocation)
        train_set = shuffled_df[train_rows, :];
        test_set = shuffled_df[Not(train_rows), :];   # rows left out of training
        true_labels = test_set[:,1];
        # svmtrain/svmpredict receive matrices with one data-frame row per column
        train_features = convert(Array, train_set[:,2:end])';
        test_features = convert(Array, test_set[:,2:end])';
        model = svmtrain(train_features, train_set[:,1], kernel= Kernel.Polynomial, degree=2)
        (predicted_labels, decision_values) = svmpredict(model, test_features);
        # percentage of test rows whose predicted label equals the true label
        test_accuracy[i] = count(predicted_labels .== true_labels)/length(true_labels)*100;
        # AUC is computed from the hard label predictions (decision_values is not used)
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int},predicted_labels), convert(Array{Int},true_labels));
    end
    ac[k] = mean(test_accuracy);
    ac_list[:,k] = test_accuracy;
    aucs_rec[k] = mean(AUCs);
    auc_list[:,k] = AUCs;
end
# report mean and standard deviation over the repetitions
# (print adds no newline between the items)
for (caption, value) in (
    ("Average accuracy on Polynomial 2 Kernel: \n", mean(ac)),
    ("Average AUC_ROC on Polynomial 2 Kernel: \n", mean(aucs_rec)),
    ("sd: \n", std(ac)),        # sd of the per-repetition accuracies
    ("sd: \n", std(aucs_rec)),  # sd of the per-repetition AUCs
)
    print(caption, value)
end

Average accuracy on Polynomial 2 Kernel: 
83.95424836601308Average AUC_ROC on Polynomial 2 Kernel: 
0.8424294594294593sd: 
3.0147735338680683sd: 
0.0317995839414399

In [52]:
# Degree-3 polynomial-kernel SVM on the Haar-wavelet data frame `df_W`, evaluated
# with 10 repetitions of 5-fold cross-validation. Column 1 of the data frame
# supplies the labels and columns 2:end supply the features.
iter = 10
ac = zeros(iter)
ac_list = Array{Float64}(undef, 5, iter);    # rows: folds, columns: repetitions
aucs_rec = zeros(iter)
auc_list = Array{Float64}(undef, 5, iter);   # rows: folds, columns: repetitions
for k in eachindex(ac)
    # permute the rows, then draw the 5-fold partition for this repetition
    row_order = shuffle(axes(df_W, 1))
    shuffled_df = df_W[row_order, :];
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    # per-fold metrics of this repetition
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in eachindex(test_accuracy)
        # folds_allocation[i] selects the training rows; every other row is tested on
        train_set = shuffled_df[folds_allocation[i], :];
        test_set = shuffled_df[Not(folds_allocation[i]), :];
        model = svmtrain(
            convert(Array, train_set[:,2:end])',
            train_set[:,1],
            kernel=Kernel.Polynomial,
            degree=3,
        )
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # element-wise agreement between predictions and true labels
        hits = predicted_labels .== test_set[:,1]
        test_accuracy[i] = sum(hits)/length(hits)*100;
        # AUC is evaluated on the predicted labels (decision_values stays unused)
        AUCs[i] = MultivariateAnomalies.auc(
            convert(Array{Int}, predicted_labels),
            convert(Array{Int}, test_set[:,1]),
        );
    end
    # keep both the fold-level values and their mean for this repetition
    ac_list[:,k] = test_accuracy;
    auc_list[:,k] = AUCs;
    ac[k] = mean(test_accuracy);
    aucs_rec[k] = mean(AUCs);
end
# one print call; the items are emitted back to back exactly as separate calls would
print(
    "Average accuracy on Polynomial 3 Kernel: \n", mean(ac),
    "Average AUC_ROC on Polynomial 3 Kernel: \n", mean(aucs_rec),
    "sd: \n", std(ac),
    "sd: \n", std(aucs_rec),
)

Average accuracy on Polynomial 3 Kernel: 
80.33333333333334Average AUC_ROC on Polynomial 3 Kernel: 
0.816205932955933sd: 
1.2352960388530003sd: 
0.01320480530812734

In [53]:
# Degree-5 polynomial-kernel SVM on the Haar-wavelet data frame `df_W`: 10 repetitions
# of 5-fold cross-validation, with column 1 as the label and columns 2:end as features.
iter = 10
ac = zeros(iter)                              # per-repetition mean accuracy (%)
ac_list = Array{Float64}(undef, 5, iter);     # per-fold accuracy, one column per repetition
aucs_rec = zeros(iter)                        # per-repetition mean AUC
auc_list = Array{Float64}(undef, 5, iter);    # per-fold AUC, one column per repetition
for k in axes(ac_list, 2)
    # reshuffle the rows before every repetition
    shuffled_df = df_W[shuffle(axes(df_W, 1)), :];
    # 5-fold partition over the total number of subjects
    tot_num = Control_size + Stroke_size
    folds_allocation = collect(Kfold(tot_num, 5))
    test_accuracy = zeros(5);
    AUCs = zeros(5);
    for i in axes(ac_list, 1)
        # rows listed in folds_allocation[i] are trained on, the rest are tested on
        train_set = shuffled_df[folds_allocation[i], :];
        test_set = shuffled_df[Not(folds_allocation[i]), :];
        test_labels = test_set[:,1];
        model = svmtrain(convert(Array, train_set[:,2:end])', train_set[:,1], kernel= Kernel.Polynomial, degree=5)
        (predicted_labels, decision_values) = svmpredict(model, convert(Array, test_set[:,2:end])');
        # accuracy in percent on the held-out rows
        n_correct = sum(predicted_labels .== test_labels)
        test_accuracy[i] = n_correct/length(test_labels)*100;
        # AUC from the predicted labels (decision_values is not used)
        AUCs[i] = MultivariateAnomalies.auc(convert(Array{Int}, predicted_labels), convert(Array{Int}, test_labels));
    end
    ac[k] = mean(test_accuracy);
    aucs_rec[k] = mean(AUCs);
    ac_list[:,k] = test_accuracy;
    auc_list[:,k] = AUCs;
end
# summary over the repetitions; print does not append a newline after the values
print(stdout, "Average accuracy on Polynomial 5 Kernel: \n", mean(ac))
print(stdout, "Average AUC_ROC on Polynomial 5 Kernel: \n", mean(aucs_rec))
print(stdout, "sd: \n", std(ac))        # sd of the per-repetition accuracies
print(stdout, "sd: \n", std(aucs_rec))  # sd of the per-repetition AUCs

Average accuracy on Polynomial 5 Kernel: 
76.62091503267973Average AUC_ROC on Polynomial 5 Kernel: 
0.7861602286602287sd: 
2.039581089359529sd: 
0.015267319314436633