<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD-Replication/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report

In [4]:
spiral_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/Spiral_HandPD.txt', delimiter=' ', index_col=0, names=['Patient', 'Label', 'RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff'])

In [5]:
spiral_df.head()

Unnamed: 0_level_0,Label,RMS,Max_dist,Min_dist,SD,MRT,Max_ET,Min_ET,SD_ET,HT_ET_Diff
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,3521.258301,6247.052734,30801.99219,0.014133,26.785328,176.600113,0.00213,1781.795898,0.25
2,1,4098.876465,6032.535156,34369.70313,0.022838,26.529615,168.352737,0.08496,1443.217529,0.273585
3,1,3854.601807,6453.114746,34709.44531,0.000251,23.670755,180.8983,0.009303,1621.75,0.256329
4,1,4069.221924,6844.231445,32181.26367,0.000168,23.456329,179.116043,0.021419,1454.390137,0.249221
5,1,4104.271973,6949.925293,36444.95313,0.004731,22.488258,188.25621,0.0,1553.536499,0.214511


##**SVM IMPLEMENTATION**

DAY 1 Process
1. Import extracted features. No normalization procedures performed (doesn't seem as if extracted features have been normalized yet either). Run model. Complete overfitting - predicts all to be class for 81% accuracy.
2. Try normalization with StandardScaler from sklearn. Didn't work very well. Same overfitting problem.
3. Tried weighting classes with class_weight='balanced' property since we had many more PD data than non-PD (class-balance). Better results - not overfitting so extremely. 62% accuracy, with 24 false negatives (and 4 false positives)

Next Steps: Try the paper's normalization method. Try to see why we have: 10/14 control rows predicted right. 36/60 PD rows predicted right. Also, important to consider than while we have 368 images for spirals, each patient drew 4, so technically we are predicting per 1 image, not per patient. (And we might be unable to predict per patient anyway, since we think we may lack their indexes/don't know which images belong to a specific patient.)

In [14]:
X = spiral_df[['RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff']]
y = spiral_df['Label']

In [15]:
# scaler = StandardScaler().fit(X_train)
# using the paper's suggested normalization method: (fi - ui)/oi

avgdev = X.mad(axis=0)
stddev = X.std(axis=0)

In [16]:
X.sub(avgdev)
X.divide(stddev)

Unnamed: 0_level_0,RMS,Max_dist,Min_dist,SD,MRT,Max_ET,Min_ET,SD_ET,HT_ET_Diff
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4.729041,8.290428,9.106523,0.023608,9.812717,30.333172,0.073786,8.346760,5.324048
2,5.504781,8.005743,10.161307,0.038149,9.719037,28.916587,2.943137,6.760702,5.826319
3,5.176721,8.563891,10.261751,0.000419,8.671703,31.071437,0.322269,7.597031,5.458832
4,5.464955,9.082940,9.514301,0.000281,8.593149,30.765313,0.741985,6.813039,5.307459
5,5.512027,9.223206,10.774849,0.007903,8.238499,32.335246,0.000000,7.277487,4.568268
...,...,...,...,...,...,...,...,...,...
364,7.511675,10.613706,10.753051,0.000317,9.878282,30.757736,2.076824,8.831221,4.230553
365,10.725576,9.949403,8.842628,4.365838,8.161527,34.998053,1.156054,7.811570,5.140454
366,6.248353,9.655240,11.670796,0.401849,7.921156,34.132516,0.000000,6.884646,6.388858
367,6.962033,10.669915,10.445904,0.034589,9.980789,31.276993,0.628465,8.357919,4.919698


In [17]:
y_label = y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2, stratify = y_label)

In [19]:
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [20]:
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [21]:
y_train.value_counts()

2    236
1     58
Name: Label, dtype: int64

In [22]:
y_test.value_counts()

2    60
1    14
Name: Label, dtype: int64

In [23]:
clf = SVC(kernel='rbf', probability=True, class_weight='balanced')
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [24]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

In [25]:
clf.score(X_test, y_test)

0.7567567567567568

In [26]:
y_pred = pd.Series(y_pred)

In [27]:
type(y_test)

pandas.core.series.Series

In [28]:
target_names = ['Control', 'PD']
results = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
results = pd.DataFrame(results).transpose()
conf_mat = confusion_matrix(y_test, y_pred)

In [29]:
results

Unnamed: 0,precision,recall,f1-score,support
Control,0.357143,0.357143,0.357143,14.0
PD,0.85,0.85,0.85,60.0
accuracy,0.756757,0.756757,0.756757,0.756757
macro avg,0.603571,0.603571,0.603571,74.0
weighted avg,0.756757,0.756757,0.756757,74.0


In [30]:
conf_mat

array([[ 5,  9],
       [ 9, 51]])

In [31]:
TN, FP, FN, TP = conf_mat.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("TP: ", TP)
print("TN: ", TN)
print("FP: ", FP)
print("FN: ", FN)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)

TP:  51
TN:  5
FP:  9
FN:  9
Sensitivity:  0.85
Specificity:  0.35714285714285715
NPV:  0.35714285714285715
PPV:  0.85
