<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD/SVM_Exact_Replication_Experiment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [38]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Results/Scoring
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.model_selection import ShuffleSplit

In [8]:
#NewSpiral is from the new HandPD dataset. The new one also comes with images and such, but we are using the data that has the exact same features from old HandPd dataset, except also includes patient ID.
spiral_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewSpiral.csv')

In [9]:
spiral_df.shape

(264, 16)

In [10]:
spiral_df['CLASS_TYPE'].value_counts()

1    140
2    124
Name: CLASS_TYPE, dtype: int64

In [11]:
meander_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewMeander.csv')

In [12]:
meander_df['ID_PATIENT'][meander_df['_ID_EXAM'] == "P26"] = 32

In [13]:
spiral_df['ID_PATIENT'][spiral_df['_ID_EXAM'] == "P26"] = 32

##**Preprocessing**

###Normalization

fi' = (fi - avg)/std

In [55]:
# normalization with the paper's method (formula above)
def feature_normalization(df):
  df_dup = df.copy()

  df1 = df_dup[['RMS', 'MAX_BETWEEN_ET_HT',
       'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT',
       'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]

  df2 = df_dup[['_ID_EXAM', 'IMAGE_NAME', 'ID_PATIENT', 'CLASS_TYPE', 'GENDER',
       'RIGH/LEFT-HANDED', 'AGE']]
  avg_dev = df1.mad(axis = 0)
  std_dev = df1.std(axis = 0)

  df1 = df1.sub(avg_dev)
  df1 = df1.divide(std_dev)

  return df2, df1

In [56]:
# normalizing spiral_df
spiral_df = pd.concat(feature_normalization(spiral_df), axis = 1)

In [57]:
spiral_df.columns

Index(['_ID_EXAM', 'IMAGE_NAME', 'ID_PATIENT', 'CLASS_TYPE', 'GENDER',
       'RIGH/LEFT-HANDED', 'AGE', 'RMS', 'MAX_BETWEEN_ET_HT',
       'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT',
       'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT'],
      dtype='object')

In [58]:
X_spiral = spiral_df[['RMS', 'MAX_BETWEEN_ET_HT',
       'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT',
       'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]

In [59]:
y_spiral = spiral_df['CLASS_TYPE'] # 1 bracket so we make series instead of dataframe

In [60]:
# normalizing X_meander
meander_df =  pd.concat(feature_normalization(meander_df), axis = 1)

##**SVM Implementation**

###**Training with Cross-Val**

In [40]:
clf = SVC(kernel = 'rbf', probability = True, class_weight = 'balanced')
cv = ShuffleSplit(n_splits=20, test_size = 0.25, random_state=0)

scores = cross_val_score(clf, X_spiral, y_spiral, cv=cv)

In [41]:
scores

array([0.6969697 , 0.62121212, 0.59090909, 0.65151515, 0.59090909,
       0.65151515, 0.60606061, 0.65151515, 0.62121212, 0.6969697 ,
       0.59090909, 0.66666667, 0.63636364, 0.63636364, 0.60606061,
       0.6969697 , 0.53030303, 0.62121212, 0.63636364, 0.74242424])

In [62]:
scores.mean()

0.6371212121212121

In [63]:
scores.std()

0.046816710774182965

In [61]:
# using cross_val_predict to get confusion matrix

y_pred = cross_val_predict(clf, X_spiral, y_spiral, cv=20)
conf_mat = confusion_matrix(y_spiral, y_pred)

print (conf_mat)

[[ 54  86]
 [  7 117]]


##**Results**

#### Deviance of probabilites from 100/0 depending on class

In [None]:
# flattening y_meander_proba & removing the second percentage we don't need

y_meander_proba = [x for list in y_meander_proba for x in list]
y_meander_proba = y_meander_proba[0::2] # takes every other element starting from the first element

In [None]:
testing = y_meander_proba[i for i in y_meander_proba if i < 0.2775]

SyntaxError: ignored

In [None]:
len(y_meander_proba)

In [None]:
# calculating standard dev 

proba1 = [i for i in y_meander_proba if i > 0.77]
proba2 = [i for i in y_meander_proba if i < 0.77]

In [None]:
len(proba2)

In [None]:
print(proba2)

In [None]:
meanval =0

for i in proba1:
  meanval += (i)

meanval /= len(proba1)

meanval

In [None]:
len(proba2)

#### Y Meander Results

In [None]:
# y_meander_test = y_meander_test['CLASS_TYPE'].astype(str).astype(int)

In [None]:
y_meander_test.dtypes

CLASS_TYPE    int64
dtype: object

In [None]:
clf.score(X_meander_test, y_meander_test)

0.6212121212121212

In [None]:
target_names = ['Control', 'PD']
results = classification_report(y_meander_test, y_meander_pred, target_names = target_names, output_dict=True)
results = pd.DataFrame(results).transpose()
conf_mat = confusion_matrix(y_meander_test, y_meander_pred)

In [None]:
results

Unnamed: 0,precision,recall,f1-score,support
Control,0.916667,0.314286,0.468085,35.0
PD,0.555556,0.967742,0.705882,31.0
accuracy,0.621212,0.621212,0.621212,0.621212
macro avg,0.736111,0.641014,0.586984,66.0
weighted avg,0.747054,0.621212,0.579778,66.0


In [None]:
conf_mat

array([[11, 24],
       [ 1, 30]])

In [None]:
TN, FP, FN, TP = conf_mat.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("TP: ", TP)
print("TN: ", TN)
print("FP: ", FP)
print("FN: ", FN)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)

TP:  30
TN:  11
FP:  24
FN:  1
Sensitivity:  0.967741935483871
Specificity:  0.3142857142857143
NPV:  0.9166666666666666
PPV:  0.5555555555555556


Can we somehow get the weights for the model to learn which features are considered most useful?