Importing the Dependencies

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection & Analysis

In [11]:
# loading the data from csv file to a Pandas DataFrame
parkinsons_data = pd.read_csv("../Datasets/parkinsons.csv")

In [12]:
# printing the first 5 rows of the dataframe
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [13]:
# number of rows and columns in the dataframe
parkinsons_data.shape

(4500, 24)

In [14]:
# getting more information about the dataset
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4500 non-null   object 
 1   MDVP:Fo(Hz)       4500 non-null   float64
 2   MDVP:Fhi(Hz)      4500 non-null   float64
 3   MDVP:Flo(Hz)      4500 non-null   float64
 4   MDVP:Jitter(%)    4500 non-null   float64
 5   MDVP:Jitter(Abs)  4500 non-null   float64
 6   MDVP:RAP          4500 non-null   float64
 7   MDVP:PPQ          4500 non-null   float64
 8   Jitter:DDP        4500 non-null   float64
 9   MDVP:Shimmer      4500 non-null   float64
 10  MDVP:Shimmer(dB)  4500 non-null   float64
 11  Shimmer:APQ3      4500 non-null   float64
 12  Shimmer:APQ5      4500 non-null   float64
 13  MDVP:APQ          4500 non-null   float64
 14  Shimmer:DDA       4500 non-null   float64
 15  NHR               4500 non-null   float64
 16  HNR               4500 non-null   float64


In [15]:
# checking for missing values in each column
parkinsons_data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [16]:
# getting some statistical measures about the data
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,...,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0
mean,154.120913,198.920148,108.819876,0.006173,4.3e-05,0.003361,0.00354,0.009952,0.03066,0.29006,...,0.047591,0.026126,21.915129,0.758222,0.498781,0.719061,-5.697655,0.226257,2.377713,0.207076
std,45.099208,83.50882,43.149941,0.004012,4.6e-05,0.002474,0.002475,0.007168,0.018683,0.195145,...,0.030809,0.029294,4.442255,0.428208,0.104675,0.055696,1.102378,0.083952,0.379197,0.090129
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,115.046,137.86925,72.49,0.00334,1.2e-05,0.00161,0.001798,0.004897,0.016178,0.146,...,0.023465,0.00611,18.93225,1.0,0.419043,0.677974,-6.526489,0.167127,2.108052,0.138469
50%,147.623,176.409,94.719,0.00503,2.5e-05,0.00267,0.0028,0.00797,0.025185,0.23,...,0.03839,0.016315,21.9635,1.0,0.504802,0.720917,-5.786559,0.22378,2.352084,0.193939
75%,189.31775,234.595,134.6555,0.007763,5.6e-05,0.00428,0.004472,0.01268,0.040155,0.37425,...,0.063613,0.035162,24.90025,1.0,0.585065,0.762278,-4.954768,0.281975,2.623098,0.261962
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [17]:
# distribution of target Variable
parkinsons_data['status'].value_counts()

status
1    3412
0    1088
Name: count, dtype: int64

1  --> Parkinson's Positive

0 --> Healthy


In [18]:
# grouping the data based on the target variable (mean of numeric features only)
numeric_cols = parkinsons_data.select_dtypes(include=[np.number]).columns
parkinsons_data.groupby('status')[numeric_cols].mean()

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,169.091509,210.014951,121.993122,0.005267,2.9e-05,0.002892,0.002973,0.008579,0.025089,0.232055,...,0.040402,0.021494,22.843018,0.0,0.47369,0.707856,-6.202986,0.192323,2.285128,0.166121
1,149.347171,195.382297,104.619263,0.006463,4.8e-05,0.003511,0.003721,0.01039,0.032437,0.308556,...,0.049883,0.027603,21.619249,1.0,0.506781,0.722634,-5.536517,0.237078,2.407236,0.220135


Data Pre-Processing

Separating the features & Target

In [19]:
X = parkinsons_data.drop(columns=['name','status'], axis=1)
Y = parkinsons_data['status']

In [20]:
print(X)

      MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0         119.992       157.302        74.997         0.00784   
1         122.400       148.650       113.819         0.00968   
2         116.682       131.111       111.555         0.01050   
3         116.676       137.871       111.366         0.00997   
4         116.014       141.781       110.655         0.01284   
...           ...           ...           ...             ...   
4495      221.907       329.148        75.307         0.00439   
4496      195.597       273.792       104.612         0.00962   
4497      136.916       141.204        92.535         0.00307   
4498       91.772       167.139        65.865         0.01394   
4499      227.301       304.594        95.647         0.00520   

      MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
0             0.000070   0.00370   0.00554     0.01109       0.04374   
1             0.000080   0.00465   0.00696     0.01394       0.06134   
2  

In [21]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
4495    1
4496    1
4497    1
4498    1
4499    0
Name: status, Length: 4500, dtype: int64


Splitting the data to training data & Test data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(4500, 22) (3600, 22) (900, 22)


Model Training

Support Vector Machine Model

In [24]:
model = svm.SVC(kernel='linear')

In [25]:
# training the SVM model with training data
model.fit(X_train, Y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


Model Evaluation

Accuracy Score

In [26]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [27]:
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.7538888888888889


In [28]:
# accuracy score on training data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [29]:
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.7755555555555556


Building a Predictive System

In [30]:
input_data = (197.07600,206.89600,192.05500,0.00289,0.00001,0.00166,0.00168,0.00498,0.01098,0.09700,0.00563,0.00680,0.00802,0.01689,0.00339,26.77500,0.422229,0.741367,-7.348300,0.177551,1.743867,0.085569)

# changing input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)


if (prediction[0] == 0):
  print("The Person does not have Parkinsons Disease")

else:
  print("The Person has Parkinsons")


[1]
The Person has Parkinsons




Saving the trained model

In [31]:
import pickle

In [32]:
filename = 'parkinsons_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [33]:
# loading the saved model
loaded_model = pickle.load(open('parkinsons_model.sav', 'rb'))

In [34]:
for column in X.columns:
  print(column)

MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE
