In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier


In [17]:
path = "/content/drive/MyDrive/Colab_folders/detecting-parkinsons-disease-ml-code/"
#read csv file of parkison's dataset
df=pd.read_csv(path + "parkinsons.csv")
# name is just an ID and status is our output, dropping the columns to get features
features = df.drop(['name','status'],axis=1)
output = df.loc[:,'status']

#scale all the datas in the range between -1,1
scaler= MinMaxScaler((-1,1))
features_c=scaler.fit_transform(features)

#split the dataset into training and testing sets where 20% data for testing purpose.
x_train,x_test,y_train,y_test=train_test_split(features_c, output , test_size=0.2,random_state=10)

In [12]:
df.columns[1:-1]

Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2'],
      dtype='object')

In [18]:
#initialize the random forest classifier and fit the datas
model= RandomForestClassifier(random_state=2)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [19]:
#predict the output for x_test
y_pred=model.predict(x_test)

#calculate accuracy,root mean squared error
print("Accuracy :",accuracy_score(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))


Accuracy : 0.9487179487179487
Mean Absolute Error: 0.05128205128205128
Root Mean Squared Error: 0.22645540682891915


Finding most predictive features for the presence of parkinson's disease

In [13]:
feature_impact = pd.Series(model.feature_importances_ ,index=list(df.columns[1:-1])).sort_values(ascending=False)
print(feature_impact)

D2                  0.116868
MDVP:Fo(Hz)         0.102234
DFA                 0.096962
MDVP:Flo(Hz)        0.050816
MDVP:APQ            0.049628
MDVP:Shimmer        0.047240
Shimmer:APQ5        0.046345
Jitter:DDP          0.044867
spread1             0.044098
MDVP:Fhi(Hz)        0.041760
spread2             0.041196
NHR                 0.037837
status              0.037450
Shimmer:APQ3        0.032785
MDVP:Jitter(%)      0.031643
MDVP:RAP            0.031087
Shimmer:DDA         0.026648
MDVP:Shimmer(dB)    0.024573
RPDE                0.024486
MDVP:Jitter(Abs)    0.024407
HNR                 0.023573
MDVP:PPQ            0.023498
dtype: float64
