<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD/SVM_Exact_Replication_Experiment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report

In [3]:
spiral_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewSpiral.csv')

In [4]:
meander_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewSpiral.csv')

In [5]:
spiral_df['ID_PATIENT'][spiral_df['_ID_EXAM'] == "P26"] = 32
meander_df['ID_PATIENT'][meander_df['_ID_EXAM'] == "P26"] = 32

In [6]:
spiral_df.columns

Index(['_ID_EXAM', 'IMAGE_NAME', 'ID_PATIENT', 'CLASS_TYPE', 'GENDER',
       'RIGH/LEFT-HANDED', 'AGE', 'RMS', 'MAX_BETWEEN_ET_HT',
       'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT',
       'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT'],
      dtype='object')

In [7]:
# extracting necessary columns from spiral df
X_spiral = spiral_df[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]
y_spiral = spiral_df[['CLASS_TYPE']]

In [8]:
# extracting necessary columns from meander df
X_meander = meander_df[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]
y_meander = meander_df[['CLASS_TYPE']]

##**Preprocessing**

###**Normalization**

fi' = (fi - avg)/std

In [9]:
# normalization with the paper's method (formula above)
def feature_normalization(df):
  avg_dev = df.mad(axis = 0)
  std_dev = df.std(axis = 0)

  df = df.sub(avg_dev)
  df = df.divide(std_dev)

  return df

In [10]:
# normalizing X_spiral
X_spiral = feature_normalization(X_spiral)

In [11]:
# normalizing X_meander
X_meander = feature_normalization(X_meander)

###Train-Test-Split

In [12]:
def data_split_experiment_3(X,y):# Train-Test Split, 80% train, 20% test, stratification across PD diagnosis (label)
  y_label = y.to_numpy()
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.25, stratify =y_label)

  X_train = X_train.reset_index(drop = True)
  X_test = X_test.reset_index(drop = True)  

  y_train = y_train.reset_index(drop = True)
  y_test = y_test.reset_index(drop = True)

  return X_train, X_test, y_train, y_test

1. try without stratified label perhaps because we want to avoid shuffle for experiment 3? even if it shuffles, it should choose the right patient since it shuffles together. It then resets for easiness, but right patient should be preserved.

In [13]:
X_spiral_train, X_spiral_test, y_spiral_train, y_spiral_test = data_split_experiment_3(X_spiral,y_spiral)

###**Quick Check**

In [14]:
y_train.value_counts()

NameError: ignored

In [None]:
y_test.value_counts()

##**SVM Implementation**

In [None]:
score_arr = []

for i in range(20):

  X_spiral = feature_normalization(X_spiral)
  X_meander = feature_normalization(X_meander)

  X_meander_train, X_meander_test, y_meander_train, y_meander_test = data_split_experiment_3(X_meander,y_meander)
  X_spiral_train, X_spiral_test, y_spiral_train, y_spiral_test = data_split_experiment_3(X_spiral,y_spiral)

  clf = SVC(kernel = 'rbf', probability = True, class_weight = 'balanced')
  clf.fit(X_meander_train, y_meander_train)

  meander_score = clf.score(X_meander_test, y_meander_test)

  clf1 = SVC(kernel = 'rbf', probability = True, class_weight = 'balanced')
  clf1.fit(X_spiral_train, y_spiral_train)

  spiral_score = clf1.score(X_spiral_test, y_spiral_test)


  if (spiral_score > meander_score): # consider recall and such too? will there be one with highest recall, precision, and accuracy?
    score = spiral_score
    y_pred = clf.predict(X_spiral_test)
    conf_mat = confusion_matrix(y_spiral_test, y_pred)
  else:
    score = meander_score
    y_pred = clf.predict(X_meander_test)
    conf_mat = confusion_matrix(y_meander_test, y_pred)

  score_arr.append(score)
  print(conf_mat)
  print('')

In [25]:
score_arr

[0.6212121212121212,
 0.6060606060606061,
 0.6060606060606061,
 0.6060606060606061,
 0.6060606060606061,
 0.5909090909090909,
 0.5909090909090909,
 0.5909090909090909,
 0.5909090909090909,
 0.5909090909090909,
 0.5909090909090909,
 0.5757575757575758,
 0.5606060606060606,
 0.5606060606060606,
 0.5454545454545454,
 0.5454545454545454,
 0.5454545454545454,
 0.5454545454545454,
 0.5606060606060606,
 0.5606060606060606]

In [29]:
np.mean(score_arr)

0.5795454545454545

In [30]:
np.std(score_arr)

0.023896682266041698

In [19]:
x

[1, 1]

###**Training**

##**Results**

In [None]:
clf.score(X_test, y_test)

In [None]:
y_test.dtypes

In [None]:
target_names = ['Control', 'PD']
results = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
results = pd.DataFrame(results).transpose()
conf_mat = confusion_matrix(y_test, y_pred)

In [None]:
results

In [None]:
conf_mat

In [None]:
TN, FP, FN, TP = conf_mat.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("TP: ", TP)
print("TN: ", TN)
print("FP: ", FP)
print("FN: ", FN)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)

Can we somehow get the weights for the model to learn which features are considered most useful?