<a href="https://colab.research.google.com/github/PDNow-Research/PDNow/blob/main/HandPD/Baseline/Baseline_SVM_%26_LogReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report

In [3]:
spiral_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewSpiral.csv')
meander_df = pd.read_csv('/content/drive/My Drive/Data/HandPD-Replication/NewMeander.csv')

In [4]:
meander_df.shape

(264, 16)

In [5]:
spiral_df['CLASS_TYPE'].value_counts()

1    140
2    124
Name: CLASS_TYPE, dtype: int64

## Fixing Duplicates

Don't run this unless re-importing data.

### Identifying Duplicates

In [6]:
spiral_df.shape

(264, 16)

In [7]:
id = spiral_df['ID_PATIENT']

In [8]:
x = np.unique(id, return_counts=True)

In [9]:
x

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  31,  38,  47,  53,
         59,  76,  78,  80,  86,  98, 102, 104, 127, 138, 143, 149, 153,
        157, 165, 176, 181, 186, 187, 192, 196, 203, 207, 218, 224, 230,
        235, 247, 253, 261, 268, 273, 281, 297, 299, 301, 305]),
 array([4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8,
        8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]))

In [10]:
meander_df.shape

(264, 16)

In [11]:
id1 = spiral_df['ID_PATIENT']

In [12]:
x1 = np.unique(id1, return_counts=True)

In [13]:
x1

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  31,  38,  47,  53,
         59,  76,  78,  80,  86,  98, 102, 104, 127, 138, 143, 149, 153,
        157, 165, 176, 181, 186, 187, 192, 196, 203, 207, 218, 224, 230,
        235, 247, 253, 261, 268, 273, 281, 297, 299, 301, 305]),
 array([4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8,
        8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]))

Upon looking closer, we see there are duplicates in 5, 23, and 31, each case in which two different patients have the same ID. We'll have to change these manually. Fortunately, these duplicates exist in both datasets, so we can concatenate the two and change these only once.

### Changing Duplicates to ID of 500, 501, 502

In [14]:
data_all = pd.concat((spiral_df, meander_df))

In [15]:
data_all.shape, (264 *2) # Good :)

((528, 16), 528)

In [16]:
data_all.head()

Unnamed: 0,_ID_EXAM,IMAGE_NAME,ID_PATIENT,CLASS_TYPE,GENDER,RIGH/LEFT-HANDED,AGE,RMS,MAX_BETWEEN_ET_HT,MIN_BETWEEN_ET_HT,STD_DEVIATION_ET_HT,MRT,MAX_HT,MIN_HT,STD_HT,CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT
0,H1,sp1-H1.jpg,59,1,M,R,58,14900.77267,13207.24121,40856.39894,0.087041,56.97832,192.73717,0.026662,4394.30709,0.120773
1,H2,sp1-H2.jpg,76,1,M,R,60,6076.106534,9440.754502,36137.73814,0.065872,39.597318,169.753376,0.266595,2031.447046,0.172414
2,H3,sp1-H3.jpg,301,1,F,R,56,12220.46234,12247.2633,39871.73569,0.0,46.18381,192.536776,0.007857,3843.754479,0.123894
3,H4,sp1-H4.jpg,102,1,M,R,24,4810.072073,4904.029458,32262.38975,0.044374,31.48579,200.401229,0.11453,1586.941716,0.162011
4,H5,sp1-H5.jpg,305,1,M,R,79,5458.875925,5911.591981,31795.46792,0.040906,47.231442,134.897863,0.026703,2251.866128,0.202899


In [17]:
# Fixing the errors in 5

# Both patients have different Exam IDs, H20 vs P25
data_all[data_all["ID_PATIENT"] == 5]

data_all["ID_PATIENT"][data_all["_ID_EXAM"] == 'P25'] = 500
data_all[(data_all["_ID_EXAM"] == 'P25')]

Unnamed: 0,_ID_EXAM,IMAGE_NAME,ID_PATIENT,CLASS_TYPE,GENDER,RIGH/LEFT-HANDED,AGE,RMS,MAX_BETWEEN_ET_HT,MIN_BETWEEN_ET_HT,STD_DEVIATION_ET_HT,MRT,MAX_HT,MIN_HT,STD_HT,CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT
163,P25,mea1-P25.jpg,500,2,M,R,56,4302.294432,7112.289057,38432.54467,0.008281,24.719923,178.186833,0.006865,1595.973219,0.244373
194,P25,mea2-P25.jpg,500,2,M,R,56,4663.05319,6529.079545,36974.76448,0.007655,27.645273,175.520033,0.14144,1611.986226,0.280277
225,P25,mea3-P25.jpg,500,2,M,R,56,4042.898486,6039.829996,30143.20222,0.017548,21.859079,184.11795,0.019396,1440.945103,0.13355
256,P25,mea4-P25.jpg,500,2,M,R,56,5564.3893,6594.615573,38032.49238,0.015983,19.820385,177.608053,0.038987,1446.30497,0.232026
163,P25,mea1-P25.jpg,500,2,M,R,56,9182.846795,8636.994876,73318.05243,0.05229,22.097903,184.176975,0.077042,866.209635,0.128205
194,P25,mea2-P25.jpg,500,2,M,R,56,4264.606794,6307.535371,28726.66317,0.023481,21.221035,197.514503,0.153504,954.338581,0.243333
225,P25,mea3-P25.jpg,500,2,M,R,56,5604.846099,6643.504858,29059.69876,0.0,20.961555,213.44187,0.0,781.796205,0.112676
256,P25,mea4-P25.jpg,500,2,M,R,56,4329.851866,7130.966577,42329.49826,0.045973,16.953067,145.771101,0.0,469.381393,0.162712


In [18]:
# Fixing the errors in 23

# Both patients have different Exam IDs, H38 vs P3
data_all[data_all["ID_PATIENT"] == 23]

data_all["ID_PATIENT"][data_all["_ID_EXAM"] == 'P3'] = 501
data_all[(data_all["_ID_EXAM"] == 'P3')]

Unnamed: 0,_ID_EXAM,IMAGE_NAME,ID_PATIENT,CLASS_TYPE,GENDER,RIGH/LEFT-HANDED,AGE,RMS,MAX_BETWEEN_ET_HT,MIN_BETWEEN_ET_HT,STD_DEVIATION_ET_HT,MRT,MAX_HT,MIN_HT,STD_HT,CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT
142,P3,mea1-P3.jpg,501,2,M,R,38,4605.06806,6940.764086,36201.60163,0.006727,23.25223,173.62247,0.011017,1531.662606,0.236667
173,P3,mea2-P3.jpg,501,2,M,R,38,4907.351889,6989.718387,33353.58716,0.003516,25.44598,171.125323,0.018792,1656.167938,0.286184
204,P3,mea3-P3.jpg,501,2,M,R,38,4289.031878,6881.157664,30235.2244,0.0,23.787134,170.993405,0.0,1650.085919,0.289389
235,P3,mea4-P3.jpg,501,2,M,R,38,6240.661807,7416.196105,35299.12915,0.03269,24.843456,170.936138,0.035313,1617.511364,0.21875
142,P3,mea1-P3.jpg,501,2,M,R,38,2905.484532,4976.359434,27854.32217,4e-06,18.156801,211.149234,0.121348,485.567401,0.13961
173,P3,mea2-P3.jpg,501,2,M,R,38,2970.152525,5841.126448,34993.26235,0.001843,19.526983,163.882916,0.0,597.913048,0.097403
204,P3,mea3-P3.jpg,501,2,M,R,38,4724.371455,6165.996087,37788.3049,0.0,21.355,182.017144,0.152095,899.713743,0.18
235,P3,mea4-P3.jpg,501,2,M,R,38,4850.07672,7218.410231,34100.62597,0.0,21.315119,209.26753,0.017976,901.423884,0.177632


In [19]:
# Fixing the errors in 31

# Both patients have different Exam IDs (and then just Right/Left-handed!!), P2 vs P26
data_all[data_all["ID_PATIENT"] == 31]

data_all["ID_PATIENT"][data_all["_ID_EXAM"] == 'P26'] = 502
data_all[(data_all["_ID_EXAM"] == 'P26')]

Unnamed: 0,_ID_EXAM,IMAGE_NAME,ID_PATIENT,CLASS_TYPE,GENDER,RIGH/LEFT-HANDED,AGE,RMS,MAX_BETWEEN_ET_HT,MIN_BETWEEN_ET_HT,STD_DEVIATION_ET_HT,MRT,MAX_HT,MIN_HT,STD_HT,CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT
164,P26,mea1-P26.jpg,502,2,F,L,59,4587.166408,6561.231877,37033.96196,0.018378,22.328549,184.8065,0.057158,1388.31596,0.266871
195,P26,mea2-P26.jpg,502,2,F,L,59,6210.596735,7365.102809,37813.01828,0.002359,20.958406,160.395258,0.024497,1281.094994,0.128205
226,P26,mea3-P26.jpg,502,2,F,L,59,6544.571924,8612.729092,36154.08597,0.013198,27.237956,180.306815,0.003577,1761.99889,0.243421
257,P26,mea4-P26.jpg,502,2,F,L,59,4921.267695,6948.380564,35304.08954,0.017636,25.699685,182.482156,0.055527,1698.122391,0.235915
164,P26,mea1-P26.jpg,502,2,F,L,59,3920.970004,6238.937338,38250.87273,0.001111,20.249795,198.848794,0.077745,824.315791,0.298077
195,P26,mea2-P26.jpg,502,2,F,L,59,6258.718,4848.219744,27252.50816,53.220445,17.078864,170.677569,0.005952,605.873333,0.323944
226,P26,mea3-P26.jpg,502,2,F,L,59,6534.143743,8216.197515,62196.78628,1.6e-05,21.01735,210.416386,0.0,774.2606,0.101124
257,P26,mea4-P26.jpg,502,2,F,L,59,5606.216748,6911.43788,33526.4098,0.0,19.501336,133.597316,0.0,580.637851,0.179402


### Check

In [20]:
ids = data_all['ID_PATIENT']

In [21]:
len(ids)

528

In [22]:
xs = np.unique(ids, return_counts=True)

In [23]:
xs, len(xs[0])

((array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,
          15,  16,  17,  18,  19,  20,  21,  22,  23,  31,  38,  47,  53,
          59,  76,  78,  80,  86,  98, 102, 104, 127, 138, 143, 149, 153,
         157, 165, 176, 181, 186, 187, 192, 196, 203, 207, 218, 224, 230,
         235, 247, 253, 261, 268, 273, 281, 297, 299, 301, 305, 500, 501,
         502]),
  array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8])),
 66)

Everything looks great! Now we have 66 unique patient IDs we can use. 66 is the nuber of patients we have.

##**Preprocessing**

###Normalization

fi' = (fi - avg)/std

Only run once (unless restart runtime ofc)

In [24]:
# normalization with the paper's method (formula above)
def feature_normalization(df):

  avg_dev = df.mad(axis = 0)
  std_dev = df.std(axis = 0)

  df = df.sub(avg_dev)
  df = df.divide(std_dev)

  return df

In [25]:
data_all['CLASS_TYPE'].value_counts()

1    280
2    248
Name: CLASS_TYPE, dtype: int64

In [26]:
data_all.columns

Index(['_ID_EXAM', 'IMAGE_NAME', 'ID_PATIENT', 'CLASS_TYPE', 'GENDER',
       'RIGH/LEFT-HANDED', 'AGE', 'RMS', 'MAX_BETWEEN_ET_HT',
       'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT',
       'STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT'],
      dtype='object')

In [27]:
# normalizing data_all

# new df
data_2 = data_all.copy()
normalize_cols = data_2[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT','STD_HT', 'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]

data_all[['RMS', 'MAX_BETWEEN_ET_HT','MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT',
          'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
          'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']] = feature_normalization(normalize_cols)

In [28]:
data_all.head()

Unnamed: 0,_ID_EXAM,IMAGE_NAME,ID_PATIENT,CLASS_TYPE,GENDER,RIGH/LEFT-HANDED,AGE,RMS,MAX_BETWEEN_ET_HT,MIN_BETWEEN_ET_HT,STD_DEVIATION_ET_HT,MRT,MAX_HT,MIN_HT,STD_HT,CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT
0,H1,sp1-H1.jpg,59,1,M,R,58,7.143607,7.83846,3.299772,-0.143373,4.797496,8.766534,-0.270699,4.828854,0.860394
1,H2,sp1-H2.jpg,76,1,M,R,60,2.471925,5.392804,2.842963,-0.14459,3.118298,7.628783,1.066993,1.819921,1.584295
2,H3,sp1-H3.jpg,301,1,F,R,56,5.72468,7.215127,3.204447,-0.148378,3.754626,8.756614,-0.375542,4.127764,0.904144
3,H4,sp1-H4.jpg,102,1,M,R,24,1.8017,2.447016,2.467794,-0.145827,2.334635,9.145922,0.219189,1.253875,1.438466
4,H5,sp1-H5.jpg,305,1,M,R,79,2.14517,3.101247,2.422591,-0.146026,3.855839,5.903356,-0.27047,2.100608,2.011632


###Train-Test-Split

In [29]:
# creating our final datasets
X = data_all[data_all.columns.drop('CLASS_TYPE')]
y = data_all['CLASS_TYPE']

In [30]:
# Train-Test Split, 80% train, 20% test, stratification across PD diagnosis (label)
y_label = y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2, stratify = y_label, shuffle = True)

In [31]:
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [32]:
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

###Quick Check

In [33]:
y_train.value_counts()

1    224
2    198
Name: CLASS_TYPE, dtype: int64

In [34]:
y_test.value_counts()

1    56
2    50
Name: CLASS_TYPE, dtype: int64

##**SVM Implementation**

###Training

In [35]:
X_train.columns

Index(['_ID_EXAM', 'IMAGE_NAME', 'ID_PATIENT', 'GENDER', 'RIGH/LEFT-HANDED',
       'AGE', 'RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT',
       'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT'],
      dtype='object')

In [36]:
clf = SVC(kernel = 'rbf', probability = True, class_weight = None, C = 1.0)
clf.fit(X_train[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT',
       'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']], y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [37]:
y_pred = clf.predict(X_test[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT',
       'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']])
y_proba = clf.predict_proba(X_test[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT',
       'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']])

In [38]:
y_pred = pd.Series(y_pred)

##**Results**

In [39]:
clf.score(X_test[['RMS', 'MAX_BETWEEN_ET_HT', 'MIN_BETWEEN_ET_HT',
       'STD_DEVIATION_ET_HT', 'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
       'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']], y_test)

0.7169811320754716

In [40]:
target_names = ['Control', 'PD']
results = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
results = pd.DataFrame(results).transpose()
conf_mat = confusion_matrix(y_test, y_pred)

In [41]:
results

Unnamed: 0,precision,recall,f1-score,support
Control,0.861111,0.553571,0.673913,56.0
PD,0.642857,0.9,0.75,50.0
accuracy,0.716981,0.716981,0.716981,0.716981
macro avg,0.751984,0.726786,0.711957,106.0
weighted avg,0.758161,0.716981,0.709803,106.0


In [42]:
conf_mat

array([[31, 25],
       [ 5, 45]])

### Patient Results

One important thing to consider is that right now, we have a bunch of Patient IDs, few of which have all 8 of their images. It may be better to perform train test split by patients, for 11 patients in one group and 55 in the other, so that we can have all possible information about a patient (that we will have when running the model).

In [43]:
patient_ids = np.unique(X_test['ID_PATIENT'])
patient_ids

array([  1,   5,   6,   8,  11,  12,  13,  14,  16,  17,  18,  19,  20,
        21,  22,  23,  31,  38,  47,  53,  59,  76,  78,  80,  86,  98,
       102, 127, 138, 143, 149, 153, 165, 176, 181, 186, 187, 192, 196,
       203, 207, 218, 224, 230, 247, 253, 261, 268, 273, 281, 297, 299,
       301, 305, 500, 501, 502])

In [44]:
len(patient_ids)

57

In [45]:
data_results = X_test.copy() # we only get predictions for X_test
data_results['PRED'] = y_pred
data_results['LABEL'] = y_test

In [46]:
final_results = pd.DataFrame(columns = ['ID_PATIENT', 'PRED', 'LABEL', 'ONE_PRED', 'TWO_PRED', 'NUM_ROWS'])

In [47]:
for i in range(len(patient_ids)):
  selected_rows = data_results[data_results['ID_PATIENT'] == patient_ids[i]]
  not_pd = selected_rows[selected_rows['PRED'] == 1]
  yes_pd = selected_rows[selected_rows['PRED'] == 2]

  label = selected_rows['LABEL'].to_list()[0]

  if (not_pd.shape[0] > yes_pd.shape[0]):
    pred = 1
  elif (yes_pd.shape[0] > not_pd.shape[0]):
    pred = 2
  else:
    pred = 2 # There are slightly less PD predictions, so we'll just choose this.

  add_to_df = [patient_ids[i], pred, label, len(not_pd), len(yes_pd), len(selected_rows)]

  final_results.loc[i] = add_to_df

In [48]:
final_results.head()

Unnamed: 0,ID_PATIENT,PRED,LABEL,ONE_PRED,TWO_PRED,NUM_ROWS
0,1,2,1,0,1,1
1,5,2,1,1,2,3
2,6,1,1,2,1,3
3,8,1,1,1,0,1
4,11,1,1,4,0,4


In [49]:
correct = 0
wrong = 0

for i in range(len(final_results)):
  if (final_results.loc[i]["PRED"] == final_results.loc[i]["LABEL"]):
    correct += 1
  else:
    wrong += 1

In [50]:
correct/(correct+wrong)

0.7192982456140351

## **Cross-Val**

In [51]:
cross_val_X = data_all[['RMS', 'MAX_BETWEEN_ET_HT','MIN_BETWEEN_ET_HT', 'STD_DEVIATION_ET_HT',
          'MRT', 'MAX_HT', 'MIN_HT', 'STD_HT',
          'CHANGES_FROM_NEGATIVE_TO_POSITIVE_BETWEEN_ET_HT']]
cross_val_y = data_all['CLASS_TYPE']

In [52]:
from sklearn.model_selection import cross_validate
clf = SVC(kernel = 'rbf', probability = True, class_weight = None, C = 1.0)

In [65]:
cv_results = cross_validate(clf, cross_val_X, cross_val_y, cv=5)

In [66]:
cv_results

{'fit_time': array([0.04075146, 0.0327971 , 0.03322959, 0.03315496, 0.03541303]),
 'score_time': array([0.00295949, 0.00275183, 0.0025053 , 0.00272655, 0.00348353]),
 'test_score': array([0.75471698, 0.68867925, 0.72641509, 0.76190476, 0.73333333])}

In [68]:
scores = np.array([0.75471698, 0.68867925, 0.72641509, 0.76190476, 0.73333333])

In [73]:
np.std(scores)

0.025748448154667026

In [72]:
np.mean(scores)

0.733009882