<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD-Replication/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# files read by the later cells are reachable on the local filesystem.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
# Today's date; not referenced by any cell visible in this part of the notebook.
current_date = date.today()
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation and data-conversion warnings.
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report

In [5]:
# Load the spiral-drawing feature table. Column names are supplied explicitly
# (so the first line of the file is read as data) and the first column,
# 'Image', becomes the index.
spiral_df = pd.read_csv(
    '/content/drive/My Drive/Data/HandPD-Replication/Spiral_HandPD.txt',
    sep=' ',
    names=[
        'Image', 'Label',
        'RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT',
        'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff',
    ],
    index_col=0,
)

In [6]:
# Preview the first rows of the spiral table to sanity-check the parsed columns.
spiral_df.head()

Unnamed: 0_level_0,Label,RMS,Max_dist,Min_dist,SD,MRT,Max_ET,Min_ET,SD_ET,HT_ET_Diff
Image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,3521.258301,6247.052734,30801.99219,0.014133,26.785328,176.600113,0.00213,1781.795898,0.25
2,1,4098.876465,6032.535156,34369.70313,0.022838,26.529615,168.352737,0.08496,1443.217529,0.273585
3,1,3854.601807,6453.114746,34709.44531,0.000251,23.670755,180.8983,0.009303,1621.75,0.256329
4,1,4069.221924,6844.231445,32181.26367,0.000168,23.456329,179.116043,0.021419,1454.390137,0.249221
5,1,4104.271973,6949.925293,36444.95313,0.004731,22.488258,188.25621,0.0,1553.536499,0.214511


In [7]:
# Load the meander-drawing feature table. Column names are supplied explicitly
# (so the first line of the file is read as data) and the first column,
# 'Image', becomes the index.
meander_df = pd.read_csv(
    '/content/drive/My Drive/Data/HandPD-Replication/Meander_HandPD.txt',
    sep=' ',
    names=[
        'Image', 'Label',
        'RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT',
        'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff',
    ],
    index_col=0,
)

In [8]:
# Preview the first rows of the meander table to sanity-check the parsed columns.
meander_df.head()

Unnamed: 0_level_0,Label,RMS,Max_dist,Min_dist,SD,MRT,Max_ET,Min_ET,SD_ET,HT_ET_Diff
Image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,3176.216064,7098.378906,46569.03516,0.000672,21.280848,224.197754,0.156795,802.821106,0.065831
2,1,2751.015869,6263.803711,44059.59766,0.0,22.056967,212.937836,0.009198,939.975647,0.126183
3,1,3050.623779,6548.623047,40298.10938,2.6e-05,22.451719,223.401764,0.0,997.580139,0.06051
4,1,2594.598877,6989.15918,54217.63281,0.0,30.559263,233.222504,0.367697,2060.858887,0.149533
5,1,3310.786865,6060.232422,35212.75781,0.000667,18.138407,196.811325,0.073079,562.886475,0.166667


## **SVM IMPLEMENTATION**

In [112]:
# Split the spiral table into the nine drawing features (X) and the class label (y).
spiral_feature_cols = ['RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff']
X_spiral = spiral_df[spiral_feature_cols]

# Double-bracket selection keeps y as a one-column DataFrame rather than a Series.
y_spiral = spiral_df[['Label']]

In [113]:
# Normalizing X_spiral with the paper's method: subtract each feature's mean
# absolute deviation, then divide by its standard deviation.
#
# DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so
# the mean absolute deviation is computed explicitly here; the result is the
# same value the old `X_spiral.mad(axis=0)` call produced.
#
# NOTE(review): a conventional z-score subtracts the mean, not the mean
# absolute deviation -- confirm which one the paper intends.
# NOTE(review): these statistics use every spiral row, i.e. they are computed
# before the train/test split performed in a later cell.
# NOTE: this cell overwrites X_spiral, so running it twice normalizes twice.
avgdev = (X_spiral - X_spiral.mean(axis=0)).abs().mean(axis=0)
stddev = X_spiral.std(axis=0)

X_spiral = X_spiral.sub(avgdev).divide(stddev)

In [114]:
# Keep only the control rows (Label == 1) of meander_df, then split them into
# the nine drawing features and a one-column label frame.
meander_controls = meander_df.loc[meander_df["Label"] == 1]

X_meander = meander_controls[['RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff']]
y_meander = meander_controls[['Label']]

In [115]:
# Normalizing X_meander with the paper's method: subtract each feature's mean
# absolute deviation, then divide by its standard deviation.
#
# DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so
# the mean absolute deviation is computed explicitly here; the result is the
# same value the old `X_meander.mad(axis=0)` call produced.
#
# NOTE(review): a conventional z-score subtracts the mean, not the mean
# absolute deviation -- confirm which one the paper intends.
# NOTE(review): these statistics use only the selected meander rows and are
# computed before the train/test split performed in a later cell.
# NOTE: this cell overwrites X_meander, so running it twice normalizes twice.
avgdev = (X_meander - X_meander.mean(axis=0)).abs().mean(axis=0)
stddev = X_meander.std(axis=0)

X_meander = X_meander.sub(avgdev).divide(stddev)

In [116]:
# Stack the spiral samples on top of the selected meander samples, row-wise.
# The original 'Image' index values are kept, so the combined index can repeat.
X = pd.concat([X_spiral, X_meander], axis=0)
y = pd.concat([y_spiral, y_meander], axis=0)

In [117]:
# Hold out 20% of the samples for testing. The split is stratified on the
# class label and uses a fixed seed so it is repeatable.
y_label = y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y_label,
)

In [118]:
# Replace the shuffled 'Image' index of both feature splits with a fresh 0..n-1 index.
X_train, X_test = (
    X_train.reset_index(drop=True),
    X_test.reset_index(drop=True),
)

In [119]:
# Replace the shuffled 'Image' index of both label splits with a fresh 0..n-1 index.
y_train, y_test = (
    y_train.reset_index(drop=True),
    y_test.reset_index(drop=True),
)

In [120]:
# Class balance of the training split.
y_train.value_counts()

Label
2        237
1        115
dtype: int64

In [121]:
# Class balance of the test split.
y_test.value_counts()

Label
2        59
1        29
dtype: int64

In [122]:
# RBF-kernel SVM with class weights balanced against the class frequencies.
# probability=True enables predict_proba; random_state pins the internal
# shuffling used for those probability estimates so they are reproducible.
clf = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=0)
# scikit-learn expects a 1-D target. y_train is a one-column DataFrame, so it
# is flattened here instead of relying on an implicit (and silenced)
# column-vector conversion.
clf.fit(X_train, y_train.to_numpy().ravel())

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

Note: inspecting the model's weights could show which features are most informative, but such weights are only available and interpretable for a linear kernel; an RBF kernel has no directly interpretable feature weights. A linear kernel, however, predicts poorly on this data (51% accuracy), which suggests the classes are not linearly separable in these features.

In [123]:
# Predicted class labels for the held-out test set.
y_pred = clf.predict(X_test)
# Per-class probability estimates for the same test samples.
y_proba = clf.predict_proba(X_test)

In [124]:
# Accuracy of the fitted classifier on the held-out test set.
clf.score(X_test, y_test)

0.8636363636363636

In [125]:
# Wrap the predictions in a pandas Series.
y_pred = pd.Series(y_pred)

In [126]:
# Debug check: show the container type of y_test.
type(y_test)

pandas.core.frame.DataFrame

In [127]:
# Label 1 is the control class and label 2 the PD class. Passing the labels
# explicitly pins the row/column order that `target_names` relies on and
# guarantees a 2x2 confusion matrix for the TN/FP/FN/TP unpacking that
# follows, even if one class were missing from the test split or predictions.
class_labels = [1, 2]
target_names = ['Control', 'PD']

# Per-class precision/recall/F1 as a dict, then reshaped so each class or
# average is a row and each metric a column.
report_dict = classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=target_names,
    output_dict=True,
)
results = pd.DataFrame(report_dict).transpose()

# Rows are true classes, columns are predicted classes, both in class_labels order.
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

In [128]:
# Display the classification report table.
results

Unnamed: 0,precision,recall,f1-score,support
Control,1.0,0.586207,0.73913,29.0
PD,0.830986,1.0,0.907692,59.0
accuracy,0.863636,0.863636,0.863636,0.863636
macro avg,0.915493,0.793103,0.823411,88.0
weighted avg,0.886684,0.863636,0.852144,88.0


In [129]:
# Display the confusion matrix computed from y_test and y_pred.
conf_mat

array([[17, 12],
       [ 0, 59]])

In [130]:
# Flatten the 2x2 confusion matrix row by row. The first class (Control) is
# treated as "negative" and the second class (PD) as "positive".
TN, FP, FN, TP = conf_mat.ravel()

# Denominators shared by the rates below.
actual_pos = TP + FN
actual_neg = TN + FP
predicted_pos = TP + FP
predicted_neg = TN + FN

TPR = TP / actual_pos      # sensitivity, hit rate, recall, or true positive rate
TNR = TN / actual_neg      # specificity or true negative rate
PPV = TP / predicted_pos   # precision or positive predictive value
NPV = TN / predicted_neg   # negative predictive value
FPR = FP / actual_neg      # fall out or false positive rate
FNR = FN / actual_pos      # false negative rate
FDR = FP / predicted_pos   # false discovery rate

# Report the raw counts first, then the headline rates.
for caption, value in (
    ("TP: ", TP),
    ("TN: ", TN),
    ("FP: ", FP),
    ("FN: ", FN),
    ("Sensitivity: ", TPR),
    ("Specificity: ", TNR),
    ("NPV: ", NPV),
    ("PPV: ", PPV),
):
    print(caption, value)

TP:  59
TN:  17
FP:  12
FN:  0
Sensitivity:  1.0
Specificity:  0.5862068965517241
NPV:  1.0
PPV:  0.8309859154929577
