In [None]:
import numpy as np
import pandas as pd
import os, sys
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
!wget -O perkinsons.zip "https://archive.ics.uci.edu/static/public/174/parkinsons.zip"
!unzip perkinsons.zip

In [None]:
# Read the data.
# The archive is extracted into the current working directory, so load the file
# relative to it instead of assuming the Colab-only '/content' location.
df = pd.read_csv('parkinsons.data')
df.head()

In [None]:
# Summary statistics of df as reported by DataFrame.describe()

df.describe()


In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Get the features and labels.
# Drop the identifier ('name') and the target ('status') by label instead of by
# position, and ask for a float array: `.values` on a frame that still contains
# the string 'name' column produces an object-dtype array.
# NOTE(review): assumes every remaining column is numeric - confirm against df.dtypes.
features = df.drop(columns=['name', 'status']).to_numpy(dtype=float)
labels = df['status'].to_numpy()

In [None]:
# Inspect the feature matrix extracted from df
features

In [None]:
# Class balance: how many samples carry label 1 and how many carry label 0
print(np.count_nonzero(labels == 1), np.count_nonzero(labels == 0))

In [None]:
# Bar plot of the class counts in `labels`

import matplotlib.pyplot as plt
import seaborn as sns

# One bar per class: label 0 is shown as 'Healthy', label 1 as 'Perkinsons'
sns.barplot(
    x=['Healthy', 'Perkinsons'],
    y=[labels[labels == 0].shape[0], labels[labels == 1].shape[0]],
    palette='Set2',
)

# Axis labels and title, then render
plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Barplot for Labels Variable')
plt.show()


In [None]:
# Distribution of the MDVP:Fo(Hz) column, drawn with 100 bins
df.loc[:, 'MDVP:Fo(Hz)'].hist(bins=100)
plt.title('Histogram of MDVP:Fo(Hz)')
plt.xlabel('MDVP:Fo(Hz)')
plt.ylabel('Frequency')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

# Map MDVP:Fo(Hz) onto a uniform distribution with a quantile transform.
# The transformer expects a 2-D array, hence the single-column reshape.
data = df['MDVP:Fo(Hz)'].values.reshape(-1, 1)

# n_quantiles defaults to 1000; with fewer samples than that scikit-learn emits a
# warning and clamps the value, so cap it explicitly at the number of rows.
quantile_transformer = QuantileTransformer(
    n_quantiles=min(1000, len(data)),
    output_distribution='uniform',
)

# Fit and transform the data
data_uniform = quantile_transformer.fit_transform(data)


# Plot the original data and transformed data in histograms
plt.figure(figsize=(10, 5))

# Plot original data
plt.subplot(1, 2, 1)
plt.hist(df['MDVP:Fo(Hz)'], bins=20, color='blue', alpha=0.7)
plt.title('Original Data')
plt.xlabel('MDVP:Fo(Hz)')
plt.ylabel('Frequency')

# Plot transformed data
plt.subplot(1, 2, 2)
plt.hist(data_uniform, bins=20, alpha=0.7, color='green', edgecolor='black')
plt.title('Uniform Distribution of MDVP:Fo(Hz)')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Number of rows in the transformed array
len(data_uniform)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

# Map MDVP:Fo(Hz) onto a normal distribution with a quantile transform.
# The transformer expects a 2-D array, hence the single-column reshape.
data = df['MDVP:Fo(Hz)'].values.reshape(-1, 1)

# n_quantiles defaults to 1000; with fewer samples than that scikit-learn emits a
# warning and clamps the value, so cap it explicitly at the number of rows.
quantile_transformer = QuantileTransformer(
    n_quantiles=min(1000, len(data)),
    output_distribution='normal',
)

# Fit and transform the data
data_normal = quantile_transformer.fit_transform(data)


# Plot the original data and transformed data in histograms
plt.figure(figsize=(10, 5))

# Plot original data
plt.subplot(1, 2, 1)
plt.hist(df['MDVP:Fo(Hz)'], bins=20, color='blue', alpha=0.7, edgecolor='black')
plt.title('Original Data')
plt.xlabel('MDVP:Fo(Hz)')
plt.ylabel('Frequency')

# Plot transformed data
plt.subplot(1, 2, 2)
plt.hist(data_normal, bins=20, alpha=0.7, color='green', edgecolor='black')
plt.title('Normal Distribution of MDVP:Fo(Hz)')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, normalize

# Box-Cox power transform of MDVP:Fo(Hz); the column is reshaped to 2-D for sklearn
power_transformer = PowerTransformer(method='box-cox')
data_transformed = power_transformer.fit_transform(
    df['MDVP:Fo(Hz)'].to_numpy().reshape(-1, 1)
)

# Side-by-side histograms: raw values (left) vs. transformed values (right)
plt.figure(figsize=(10, 5))

# Left panel: original data
plt.subplot(1, 2, 1)
plt.hist(df['MDVP:Fo(Hz)'], bins=20, color='blue', alpha=0.7, edgecolor='black')
plt.xlabel('MDVP:Fo(Hz)')
plt.ylabel('Frequency')
plt.title('Original Data')

# Right panel: transformed data
plt.subplot(1, 2, 2)
plt.hist(data_transformed, bins=20, color='green', alpha=0.7, edgecolor='black')
plt.xlabel('Transformed Values')
plt.ylabel('Frequency')
plt.title('Transformed Data (Gaussian)')

plt.tight_layout()
plt.show()


In [None]:
# heatmap of df

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(18, 18))
# Correlate the numeric columns only: the string 'name' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = df.select_dtypes(include='number').corr()

# Annotated heatmap on the full correlation range [-1, 1]
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='rocket_r')

# Set the title and show the plot
plt.title('Heatmap of Correlation Matrix')
plt.show()


In [None]:
# List the column labels of df
df.columns

In [None]:
# Pairwise plots (KDE on the diagonal) of the columns that remain after dropping
# the identifier and the features listed below, coloured by 'status'.
sns.pairplot(
    df.drop(columns=[
        "name", 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
        'MDVP:Jitter(Abs)', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)',
        'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA',
        'NHR', 'HNR', 'RPDE',
    ]),
    hue="status",
    diag_kind='kde',
)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer
def preprocess_data(features):
  """Quantile-transform every feature column towards a normal distribution.

  Fits a ``QuantileTransformer(output_distribution='normal')`` on ``features``
  and returns the transformed data.

  Args:
    features: 2-D array-like of samples (rows) by features (columns).

  Returns:
    The array produced by ``fit_transform`` on ``features``.

  Note:
    The transformer is fitted on whatever is passed in. Fitting it on the full
    dataset before a train/test split lets test-set statistics influence the
    training features; fit on the training portion only if that matters.
  """
  # n_quantiles defaults to 1000; with fewer samples than that scikit-learn
  # emits a warning and clamps the value, so cap it at the number of rows.
  quantile_transformer = QuantileTransformer(
      n_quantiles=min(1000, len(features)),
      output_distribution='normal',
  )

  # Fit and transform in one step
  return quantile_transformer.fit_transform(features)



In [None]:
# Replace the raw feature matrix with its quantile-transformed version
features = preprocess_data(features)

In [None]:
# Inspect the feature matrix after preprocess_data
features

#### KNN

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=12,
)

In [None]:
import time

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(123)

# The timed span covers fitting, prediction and metric computation
start_time_KNN = time.time()

# 3-nearest-neighbour classifier (fit returns the estimator itself)
modelKNN = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_predKNN = modelKNN.predict(x_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split
accuracy_KNN = accuracy_score(y_test, y_predKNN)
precision_KNN = precision_score(y_test, y_predKNN, average='weighted')
recall_KNN = recall_score(y_test, y_predKNN, average='weighted')
f1_KNN = f1_score(y_test, y_predKNN, average='weighted')

end_time_KNN = time.time()
time_KNN = end_time_KNN - start_time_KNN

# Report the scores as percentages
print('KNN - Accuracy  : %.2f' % (accuracy_KNN*100), '%')
print('KNN - F1-Score  : %.2f' % (f1_KNN*100), '%')
print('KNN - Recall    : %.2f' % (recall_KNN*100), '%')
print('KNN - Precision : %.2f' % (precision_KNN*100), '%')
print('KNN - Time Taken : %.2f' % time_KNN, 'seconds')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

def confusion_matrix(y_test, y_pred, model_name):
  """Plot the confusion matrix of a classifier as an annotated heatmap.

  Args:
    y_test: True class labels.
    y_pred: Predicted class labels, aligned with ``y_test``.
    model_name: Text appended to the plot title.

  Returns:
    None. The figure is displayed with ``plt.show()``.
  """
  # Imported locally under an alias: this function shadows scikit-learn's name
  # at module level, and a bare `import sklearn` does not by itself guarantee
  # that the `sklearn.metrics` submodule has been loaded.
  from sklearn.metrics import confusion_matrix as sk_confusion_matrix

  counts = sk_confusion_matrix(y_test, y_pred)

  # Tick labels, in the order of the matrix rows/columns
  # NOTE(review): assumes label 0 is the healthy class and both classes occur.
  class_names = ['Healthy', 'Perkinson']

  # fmt='d' prints the integer counts verbatim; the default '.2g' switches to
  # scientific notation once a cell reaches 100.
  sns.heatmap(counts, annot=True, fmt='d', cmap='Blues',
              xticklabels=class_names, yticklabels=class_names)

  plt.xlabel('Predicted Labels')
  plt.ylabel('Actual Labels')
  plt.title('Confusion Matrix for ' + model_name)

  plt.show()

In [None]:
# ROC curve and AUC for the KNN classifier

from sklearn.metrics import roc_curve, auc

# Column 1 of predict_proba on the test set is used as the score
y_probKNN = modelKNN.predict_proba(x_test)[:, 1]

# False/true positive rates across thresholds, and the area under that curve
fprKNN, tprKNN, thresholdsKNN = roc_curve(y_test, y_probKNN)
roc_aucKNN = auc(fprKNN, tprKNN)

# Draw the curve together with the diagonal reference line
lw = 2
plt.figure()
plt.plot(fprKNN, tprKNN, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_aucKNN)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic (ROC) Curve - KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc="lower right")
plt.show()



In [None]:
# Predict on the test split with the KNN model and plot its confusion matrix
y_pred = modelKNN.predict(x_test)
confusion_matrix(y_test, y_pred, 'knn')

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

np.random.seed(123)

import time
# The timed span covers fitting, prediction and metric computation
start_time_SVM = time.time()

# NOTE: despite the `_RBF` suffix in the variable name, a linear kernel is used.
# probability=True is required for the predict_proba call in the ROC cell.
modelSVM_RBF = SVC(kernel = 'linear', probability=True)
modelSVM_RBF.fit(x_train, y_train)

y_predSVM_RBF = modelSVM_RBF.predict(x_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split
accuracy_SVM = accuracy_score(y_test, y_predSVM_RBF)
precision_SVM = precision_score(y_test, y_predSVM_RBF, average='weighted')
recall_SVM = recall_score(y_test, y_predSVM_RBF, average='weighted')
f1_SVM = f1_score(y_test, y_predSVM_RBF, average='weighted')

end_time_SVM = time.time()
time_SVM = end_time_SVM - start_time_SVM

print('SVM - Accuracy  : %.2f' % (accuracy_SVM*100), '%')
print('SVM - F1-Score  : %.2f' % (f1_SVM*100), '%')
print('SVM - Recall    : %.2f' % (recall_SVM*100), '%')
print('SVM - Precision : %.2f' % (precision_SVM*100), '%')
# The timing line was previously labelled 'KNN'
print('SVM - Time Taken : %.2f' % time_SVM, 'seconds')


In [None]:
# ROC curve and AUC for the SVM classifier

# Column 1 of predict_proba on the test set is used as the score
y_probSVM = modelSVM_RBF.predict_proba(x_test)[:, 1]

# False/true positive rates across thresholds, and the area under that curve
fprSVM, tprSVM, thresholdsSVM = roc_curve(y_test, y_probSVM)
roc_aucSVM = auc(fprSVM, tprSVM)

# Draw the curve together with the diagonal reference line
lw = 2
plt.figure()
plt.plot(fprSVM, tprSVM, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_aucSVM)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic (ROC) Curve - SVM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc="lower right")
plt.show()


In [None]:
# Predict on the test split with the SVM model and plot its confusion matrix
y_pred = modelSVM_RBF.predict(x_test)
confusion_matrix(y_test, y_pred, 'SVM')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

np.random.seed(123)

import time
# The timed span covers fitting, prediction and metric computation
start_time_RFC = time.time()

modelRFC = RandomForestClassifier(n_estimators = 754, criterion = 'gini')
modelRFC.fit(x_train, y_train)

y_predRFC = modelRFC.predict(x_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split
accuracy_RFC = accuracy_score(y_test, y_predRFC)
precision_RFC = precision_score(y_test, y_predRFC, average='weighted')
recall_RFC = recall_score(y_test, y_predRFC, average='weighted')
f1_RFC = f1_score(y_test, y_predRFC, average='weighted')

end_time_RFC = time.time()
time_RFC = end_time_RFC - start_time_RFC

# Each label now prints its own metric (F1 and precision were swapped)
print('RFC - Accuracy  : %.2f' % (accuracy_RFC*100), '%')
print('RFC - F1-Score  : %.2f' % (f1_RFC*100), '%')
print('RFC - Recall    : %.2f' % (recall_RFC*100), '%')
print('RFC - Precision : %.2f' % (precision_RFC*100), '%')
print('RFC - Time Taken : %.2f' % time_RFC, 'seconds')


In [None]:
# ROC curve and AUC for the random forest classifier

# Column 1 of predict_proba on the test set is used as the score
y_probRFC = modelRFC.predict_proba(x_test)[:, 1]

# False/true positive rates across thresholds, and the area under that curve
fprRFC, tprRFC, thresholdsRFC = roc_curve(y_test, y_probRFC)
roc_aucRFC = auc(fprRFC, tprRFC)

# Draw the curve together with the diagonal reference line
lw = 2
plt.figure()
plt.plot(fprRFC, tprRFC, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_aucRFC)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic (ROC) Curve - Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc="lower right")
plt.show()


In [None]:
# Predict on the test split with the random forest and plot its confusion matrix
y_pred = modelRFC.predict(x_test)
confusion_matrix(y_test, y_pred, 'RF')

In [None]:
import time

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

np.random.seed(123)

# The timed span covers fitting, prediction and metric computation
start_time_HGBC = time.time()

# Histogram-based gradient boosting with default settings (fit returns the estimator)
modelHGBC = HistGradientBoostingClassifier().fit(x_train, y_train)
y_predHGBC = modelHGBC.predict(x_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split
accuracy_HGBC = accuracy_score(y_test, y_predHGBC)
precision_HGBC = precision_score(y_test, y_predHGBC, average='weighted')
recall_HGBC = recall_score(y_test, y_predHGBC, average='weighted')
f1_HGBC = f1_score(y_test, y_predHGBC, average='weighted')

end_time_HGBC = time.time()
time_HGBC = end_time_HGBC - start_time_HGBC

# Report the scores as percentages
print('HGBC - Accuracy  : %.2f' % (accuracy_HGBC*100), '%')
print('HGBC - F1-Score  : %.2f' % (f1_HGBC*100), '%')
print('HGBC - Recall    : %.2f' % (recall_HGBC*100), '%')
print('HGBC - Precision : %.2f' % (precision_HGBC*100), '%')
print('HGBC - Time Taken : %.2f' % time_HGBC, 'seconds')


In [None]:
# ROC curve and AUC for the HistGradientBoostingClassifier

# Column 1 of predict_proba on the test set is used as the score
y_probHGBC = modelHGBC.predict_proba(x_test)[:, 1]

# False/true positive rates across thresholds, and the area under that curve
fprHGBC, tprHGBC, thresholdsHGBC = roc_curve(y_test, y_probHGBC)
roc_aucHGBC = auc(fprHGBC, tprHGBC)

# Draw the curve together with the diagonal reference line
lw = 2
plt.figure()
plt.plot(fprHGBC, tprHGBC, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_aucHGBC)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic (ROC) Curve - HistGradientBoostingClassifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc="lower right")
plt.show()


In [None]:
# Predict on the test split with the HGBC model and plot its confusion matrix
y_pred = modelHGBC.predict(x_test)
confusion_matrix(y_test, y_pred, 'HGBC')

In [None]:
import time

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

np.random.seed(123)

# The timed span covers fitting, prediction and metric computation
start_time_GBM = time.time()

# LightGBM classifier with default settings (fit returns the estimator)
modelGBM = LGBMClassifier().fit(x_train, y_train)
y_pred_GBM = modelGBM.predict(x_test)

# Accuracy plus weighted-average precision / recall / F1 on the test split
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
precision_GBM = precision_score(y_test, y_pred_GBM, average='weighted')
recall_GBM = recall_score(y_test, y_pred_GBM, average='weighted')
f1_GBM = f1_score(y_test, y_pred_GBM, average='weighted')

end_time_GBM = time.time()
time_GBM = end_time_GBM - start_time_GBM

# Report the scores as percentages
print('Light-GBM - Accuracy  : %.2f' % (accuracy_GBM*100), '%')
print('Light-GBM - F1-Score  : %.2f' % (f1_GBM*100), '%')
print('Light-GBM - Recall    : %.2f' % (recall_GBM*100), '%')
print('Light-GBM - Precision : %.2f' % (precision_GBM*100), '%')
print('Light-GBM - Time Taken : %.2f' % time_GBM, 'seconds')

In [None]:
# ROC curve and AUC for the LightGBM classifier

# Column 1 of predict_proba on the test set is used as the score
y_prob_GBM = modelGBM.predict_proba(x_test)[:, 1]

# False/true positive rates across thresholds, and the area under that curve
fpr_GBM, tpr_GBM, thresholds_GBM = roc_curve(y_test, y_prob_GBM)
roc_auc_GBM = auc(fpr_GBM, tpr_GBM)

# Draw the curve together with the diagonal reference line
lw = 2
plt.figure()
plt.plot(fpr_GBM, tpr_GBM, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc_GBM)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic (ROC) Curve - LightGBM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc="lower right")
plt.show()



In [None]:
# Predict on the test split with the LightGBM model and plot its confusion matrix
y_pred = modelGBM.predict(x_test)
confusion_matrix(y_test, y_pred, 'LBGM')

In [None]:
from tabulate import tabulate

head = ["Classifier", "Accuracy", "Precision", "Recall", "f1_Accuracy"]

# One row per classifier; every score is scaled from a fraction to a percentage
mydata = [
    [name, *(score * 100 for score in scores)]
    for name, scores in (
        ("KNN", (accuracy_KNN, precision_KNN, recall_KNN, f1_KNN)),
        ("SVM", (accuracy_SVM, precision_SVM, recall_SVM, f1_SVM)),
        ("RFC", (accuracy_RFC, precision_RFC, recall_RFC, f1_RFC)),
        ("HGBC", (accuracy_HGBC, precision_HGBC, recall_HGBC, f1_HGBC)),
        ("Light-GBM", (accuracy_GBM, precision_GBM, recall_GBM, f1_GBM)),
    )
]

print(tabulate(mydata, headers=head, tablefmt="simple"))

In [None]:
# Plot accuracy (in percent) against classifiers

classifiers = ['KNN', 'SVM', 'RFC', 'HGBC', 'Light-GBM']
accuracies = np.array([accuracy_KNN, accuracy_SVM, accuracy_RFC, accuracy_HGBC, accuracy_GBM]) * 100

plt.figure(figsize=(10, 6))
plt.bar(classifiers, accuracies, color='blue')
# Zoom in on the 85-100 range when every model is inside it, but lower the floor
# when a model scores below 85 % so that its bar is not cut off entirely.
plt.ylim(max(0.0, min(85, accuracies.min() - 5)), 100)
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison of Different Classifiers')
plt.show()


In [None]:
# Build, compile and train a small fully connected Keras network on x_train / y_train.
import keras
from keras.callbacks import CSVLogger
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU
import time
import tensorflow as tf

# Seed NumPy and TensorFlow before any layer is created
np.random.seed(123)
tf.random.set_seed(123)

# Per-epoch metrics are written to 'pd.csv' (semicolon-separated, overwritten each run)
csv_logger = CSVLogger('pd.csv', append=False, separator=';')

classifier = Sequential()
# Input width is taken from the shape of a single training row
classifier.add(Dense(64, input_shape = x_train[1].shape))

# No activation argument is passed to this layer
classifier.add(Dense(64))
#classifier.add(LeakyReLU(alpha=0.15))
classifier.add(Dropout(rate=0.3))

classifier.add(Dense(128, activation='relu'))
#classifier.add(LeakyReLU(alpha=0.15))
classifier.add(Dropout(rate=0.1))

classifier.add(Dense(16))
classifier.add(LeakyReLU(alpha=0.15))
classifier.add(Dropout(rate=0.2))

# Single sigmoid unit for the binary output
classifier.add(Dense(1, activation='sigmoid'))

# Save the model only when the monitored validation accuracy improves
checkpoint_filepath = 'checkpoint.model.keras'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                            monitor='val_accuracy',
                                                            mode='max',
                                                            save_best_only=True)

# The timed span includes compile() as well as fit()
start_time = time.time()
classifier.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
# validation_split=0.2 supplies the validation data behind 'val_accuracy'
classifier.fit(x_train, y_train, validation_split= 0.2, epochs=100, batch_size=5, callbacks= [csv_logger, model_checkpoint_callback] )
end_time = time.time()

In [None]:
# Wall-clock duration between start_time and end_time of the training cell
time_taken = end_time - start_time
print("Time : %.3f" % time_taken, 'seconds')

In [None]:
# Reload the model saved by the ModelCheckpoint callback. Reuse the
# `checkpoint_filepath` it was written to instead of a hard-coded Colab path,
# so the cell also works when the working directory is not '/content'.
classifier = keras.models.load_model(checkpoint_filepath)

print("Evaluation on test data : ")
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with
results = classifier.evaluate(x_test, y_test, batch_size=5)
accuracyANN = results[1]*100
print("Test Accuracy : %.2f" % accuracyANN,'%')

In [None]:
# Threshold the network outputs at 0.5 to obtain boolean class predictions
test_pred = classifier.predict(x_test)
y_pred = np.greater(test_pred, 0.5)

In [None]:
# Inspect the thresholded predictions
y_pred

In [None]:
# Flatten the predictions to 1-D so they compare element-wise with y_test
y_pred = y_pred.reshape(-1)

In [None]:
# Percentage of test samples whose prediction equals the true label
test_acc = 100 * np.sum(np.equal(y_pred, y_test)) / len(y_test)
test_acc

In [None]:
# Plot the confusion matrix for the dense network, using the y_pred computed above
confusion_matrix(y_test, y_pred, 'Dense')