<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD-Replication/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report

In [3]:
spiral_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/Spiral_HandPD.txt', delimiter=' ', index_col=0, names=['Image', 'Label', 'RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff'])

In [4]:
# extracting necessary columns from spiral df
X_spiral = spiral_df[['RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff']]
y_spiral = spiral_df['Label']

y_spiral = pd.DataFrame(y_spiral)

In [5]:
meander_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/Meander_HandPD.txt', delimiter=' ', index_col=0, names=['Image', 'Label', 'RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff'])

In [6]:
# choosing only control patients from meander_df
X_meander = meander_df[meander_df["Label"] == 1]
X_meander = X_meander[['RMS', 'Max_dist', 'Min_dist', 'SD', 'MRT', 'Max_ET', 'Min_ET', 'SD_ET', 'HT_ET_Diff']]

y_meander = meander_df[meander_df["Label"] == 1]
y_meander = y_meander[['Label']]

##**Preprocessing**

###**Normalization**

fi' = (fi - avg)/std

In [29]:
# normalization with the paper's method (formula above)
def feature_normalization(df):
  avg_dev = df.mad(axis = 0)
  std_dev = df.std(axis = 0)

  df = df.sub(avg_dev)
  df = df.divide(std_dev)

  return df

In [30]:
# normalizing X_spiral
X_spiral = feature_normalization(X_spiral)

In [31]:
# normalizing X_meander
X_meander = feature_normalization(X_meander)

###Train-Test-Split

In [32]:
# creating our final dataset with control and PD Spiral patients and control Meander patients
X = pd.concat([X_spiral, X_meander])
y = pd.concat([y_spiral, y_meander])

In [33]:
# Train-Test Split, 80% train, 20% test, stratification across PD diagnosis (label)
y_label = y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2, stratify = y_label)

In [34]:
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [35]:
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

###**Quick Check**

In [36]:
y_train.value_counts()

Label
2        237
1        115
dtype: int64

In [37]:
y_test.value_counts()

Label
2        59
1        29
dtype: int64

##**SVM Implementation**

##**Training**

In [38]:
clf = SVC(kernel = 'rbf', probability = True, class_weight = 'balanced')
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

To note, it may be useful to look at the weights for this model (see which features are most valuable), but these are only relevant for a linear kernel. RBF kernel does not have relevant/interpretable weights. And of course, linear kernel is not very helpful and good in predicting (51% accuracy). The features are not linearly separable.

In [39]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

In [41]:
y_pred = pd.Series(y_pred)

##**Results**

In [40]:
clf.score(X_test, y_test)

0.8636363636363636

In [43]:
target_names = ['Control', 'PD']
results = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
results = pd.DataFrame(results).transpose()
conf_mat = confusion_matrix(y_test, y_pred)

In [44]:
results

Unnamed: 0,precision,recall,f1-score,support
Control,1.0,0.586207,0.73913,29.0
PD,0.830986,1.0,0.907692,59.0
accuracy,0.863636,0.863636,0.863636,0.863636
macro avg,0.915493,0.793103,0.823411,88.0
weighted avg,0.886684,0.863636,0.852144,88.0


In [45]:
conf_mat

array([[17, 12],
       [ 0, 59]])

In [46]:
TN, FP, FN, TP = conf_mat.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("TP: ", TP)
print("TN: ", TN)
print("FP: ", FP)
print("FN: ", FN)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)

TP:  59
TN:  17
FP:  12
FN:  0
Sensitivity:  1.0
Specificity:  0.5862068965517241
NPV:  1.0
PPV:  0.8309859154929577


Can we somehow get the weights for the model to learn which features are considered most useful?