In [4]:
#Importing pandas under the abbreviation 'pd'
import pandas as pd

#Importing numpy under the abbreviation 'np'
import numpy as np

#Import train_test_split from the sci-kit library to split data into training and test data
from sklearn.model_selection import train_test_split

#Importing xgboost (Extreme Gradient Boosting) as the abbreviation 'xgb'
import xgboost as xgb

#Importing methods to calculate error and accuracy of the predictions
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
#Uploading the dataset 

from google.colab import files
uploaded = files.upload()

Saving parkinsons.data to parkinsons.data


In [6]:
#Creating the dataset into a pandas dataframe
train_data = pd.read_csv("parkinsons.data")

#Checking the first 5 lines of the dataframe
train_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [7]:
#Removing the target variable 'Status' from the training data and assigning the
#rest of the columns to a vector X

X = train_data.drop('status',axis=1)
print(X.head())

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   MDVP:APQ  Shimmer:DDA      NHR     HNR      RPDE       DFA   spread1  \
0   0.02971

In [8]:
#Since the 'name' column in X has categorical data and xgb will not run on 
#categorical data, we must change it. Since the 'name' column only acts as the 
#id of the patients, we'll just replace it with numbers going through 1 to 195
#Since we can easily replace dataframe columns with a list, we'll just create a 
#list containing the numbers

def createList(r1, r2):
    return [item for item in range(r1, r2+1)]
     
# Driver Code
r1, r2 = 1, 195
ID = createList(r1, r2)

In [9]:
#Replacing the 'name' column with a list containing numbers 1 to 195
X['name'] = ID

In [10]:
X.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.02971,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.04368,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0359,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.03772,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.04465,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [11]:
#Assigning the target variable values from the training data to y

y = train_data[['status']]
print(y.head())

   status
0       1
1       1
2       1
3       1
4       1


In [12]:
#Splitting the data into training and test datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [13]:
#Creating regression matrices

train_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
test_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [14]:
#Defining hyperparameters

params = {"objective": "reg:squarederror", "tree_method": "hist"}
n= 100

#evals can be used to see the error in training and validation after every round
evals = [(train_reg, "train"), (test_reg, "validation")]

In [15]:
#Preparing the model based on the hyperparamaters

model = xgb.train(params = params, dtrain = train_reg, num_boost_round = n, evals = evals)

[0]	train-rmse:0.35612	validation-rmse:0.38638
[1]	train-rmse:0.25403	validation-rmse:0.32164
[2]	train-rmse:0.18155	validation-rmse:0.28953
[3]	train-rmse:0.13000	validation-rmse:0.27679
[4]	train-rmse:0.09338	validation-rmse:0.27346
[5]	train-rmse:0.06737	validation-rmse:0.27453
[6]	train-rmse:0.04894	validation-rmse:0.27692
[7]	train-rmse:0.03593	validation-rmse:0.27640
[8]	train-rmse:0.02668	validation-rmse:0.27484
[9]	train-rmse:0.02014	validation-rmse:0.27389
[10]	train-rmse:0.01544	validation-rmse:0.27386
[11]	train-rmse:0.01190	validation-rmse:0.27323
[12]	train-rmse:0.00930	validation-rmse:0.27308
[13]	train-rmse:0.00735	validation-rmse:0.27298
[14]	train-rmse:0.00599	validation-rmse:0.27239
[15]	train-rmse:0.00486	validation-rmse:0.27254
[16]	train-rmse:0.00412	validation-rmse:0.27262
[17]	train-rmse:0.00334	validation-rmse:0.27223
[18]	train-rmse:0.00272	validation-rmse:0.27248
[19]	train-rmse:0.00224	validation-rmse:0.27267
[20]	train-rmse:0.00186	validation-rmse:0.27280
[2

In [16]:
#Redefining hyperparameters

params = {"objective": "reg:squarederror", "tree_method": "hist"}
n= 28  #redifing hyperparameters as after 28th round, the error doesnt change
evals = [(train_reg, "train"), (test_reg, "validation")]

In [17]:
#Preparing the model based on the redefined hyperparameters

model = xgb.train(params = params, dtrain = train_reg, num_boost_round = n)

In [18]:
#Making predictions using the model

preds = model.predict(test_reg)

In [19]:
#Using root mean square error to find the error percentage in the predictions
#as compared to the y_test 

rmse = mean_squared_error(y_test, preds, squared = False)
print(f"Error in prediction: {rmse:.3f}")

Error in prediction: 0.273


In [20]:
#XGBoost Cross Validation to tune hyperparameters

params = {"objective": "reg:squarederror", "tree_method": "hist"}
n= 28

In [21]:
#Using k fold cross validation (k=5) to further define the model

results = xgb.cv(params, train_reg, num_boost_round = n, nfold = 5, early_stopping_rounds = 20)

In [22]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.357302,0.000681,0.380889,0.019596
1,0.255824,0.001038,0.309056,0.039505
2,0.183631,0.001228,0.266655,0.059496
3,0.132161,0.001324,0.23782,0.076207
4,0.095426,0.00139,0.221414,0.088672


In [23]:
#Taking the minimum of test-rmse-mean to find the best score

best_rmse = results['test-rmse-mean'].min()
best_rmse

0.20003958407824957

Finally **classifying the data** into 2 categories - 
1. Positive class (prediction = 1) if patient suffers from Parkinson's disease.
2. Negative class (prediction = 0) if the patient does not suffer from Parkinson's disease

In [24]:
#Instantiating an instance of XGBoost as model

model = xgb.XGBClassifier()

In [25]:
#Fitting the model on the training data

model.fit(X_train, y_train)

In [26]:
#Using the fitted model to make predictions on the test data and saving the 
#predictions to a variable called y_preds

y_preds = model.predict(X_test)

In [28]:
#Evaluating the model's predictions by using methods from sci-kit library

accuracy = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)
confusion_mat = confusion_matrix(y_test, y_preds)

In [29]:
#Printing the final values for the model

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion_mat)

Accuracy: 0.9591836734693877

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.97      0.97      0.97        37

    accuracy                           0.96        49
   macro avg       0.94      0.94      0.94        49
weighted avg       0.96      0.96      0.96        49


Confusion Matrix:
[[11  1]
 [ 1 36]]


Classifying the same dataset using the **Random Forest Classifier**

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Create a Random Forest classifier
rf_model = RandomForestClassifier()

In [32]:
# Train the model
rf_model.fit(X_train, y_train)

  rf_model.fit(X_train, y_train)


In [34]:
# Make predictions on the test set
rf_y_preds = rf_model.predict(X_test)

In [35]:
# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_y_preds)
rf_report = classification_report(y_test, rf_y_preds)
rf_confusion_mat = confusion_matrix(y_test, rf_y_preds)

In [36]:
# Print the results
print("Random Forest Accuracy:", rf_accuracy)
print("\nRandom Forest Classification Report:")
print(rf_report)
print("\nRandom Forest Confusion Matrix:")
print(rf_confusion_mat)

Random Forest Accuracy: 0.9795918367346939

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.97      1.00      0.99        37

    accuracy                           0.98        49
   macro avg       0.99      0.96      0.97        49
weighted avg       0.98      0.98      0.98        49


Random Forest Confusion Matrix:
[[11  1]
 [ 0 37]]


With respect to the above conclusions, we used decision tree optimizers for both the scenarios - one being the XGBoost and the other being Random Forest Classifier. Although, XGBoost is considered to more accurate usually, for this dataset we got a higher accuracy with the Random Forest Classifier.

Classifying the same dataset using **SVM**

In [37]:
from sklearn.svm import SVC

In [38]:
# Create an SVM classifier
svm_model = SVC()

In [39]:
# Train the model
svm_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [40]:
# Make predictions on the test set
svm_y_pred = svm_model.predict(X_test)

In [41]:
# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
svm_confusion_mat = confusion_matrix(y_test, svm_y_pred)

In [42]:
# Print the results
print("SVM Accuracy:", svm_accuracy)
print("\nSVM Classification Report:")
print(svm_report)
print("\nSVM Confusion Matrix:")
print(svm_confusion_mat)

SVM Accuracy: 0.8367346938775511

SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.33      0.50        12
           1       0.82      1.00      0.90        37

    accuracy                           0.84        49
   macro avg       0.91      0.67      0.70        49
weighted avg       0.87      0.84      0.80        49


SVM Confusion Matrix:
[[ 4  8]
 [ 0 37]]


Out of the 3 machine learning models, SVM performed the worst for classifying this dataset.