In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [15]:
data = pd.read_csv('DB_Voice_Features.csv')

print('Shape: ',data.shape)
data.head

Shape:  (195, 24)


<bound method NDFrame.head of                name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0    phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1    phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2    phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3    phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4    phon_R01_S01_5      116.014       141.781       110.655         0.01284   
..              ...          ...           ...           ...             ...   
190  phon_R01_S50_2      174.188       230.978        94.261         0.00459   
191  phon_R01_S50_3      209.516       253.017        89.488         0.00564   
192  phon_R01_S50_4      174.688       240.005        74.287         0.01360   
193  phon_R01_S50_5      198.764       396.961        74.904         0.00740   
194  phon_R01_S50_6      214.289       260.277        77.973         0.00567   

     MDVP

In [16]:
non_numeric_columns = ['name']
data = data.drop(non_numeric_columns, axis=1)

X = data.drop('status', axis=1)
y = data['status']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
knn_classifier_before = KNeighborsClassifier()

knn_classifier_before.fit(X_train, y_train)

knn_predictions_before = knn_classifier_before.predict(X_test)

accuracy_before = accuracy_score(y_test, knn_predictions_before)

print("KNN Accuracy (Before Optimization):", accuracy_before)
print("KNN Classification Report (Before Optimization):\n", classification_report(y_test, knn_predictions_before))

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'metric': ['euclidean', 'manhattan'],
}

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_knn_classifier = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], metric=best_params['metric'])

best_knn_classifier.fit(X_train, y_train)

knn_predictions_after = best_knn_classifier.predict(X_test)

accuracy_after = accuracy_score(y_test, knn_predictions_after)

print("\nKNN Accuracy (After Optimization):", accuracy_after)
print("KNN Classification Report (After Optimization):\n", classification_report(y_test, knn_predictions_after))



KNN Accuracy (Before Optimization): 0.9487179487179487
KNN Classification Report (Before Optimization):
               precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.94      1.00      0.97        32

    accuracy                           0.95        39
   macro avg       0.97      0.86      0.90        39
weighted avg       0.95      0.95      0.95        39


KNN Accuracy (After Optimization): 0.9743589743589743
KNN Classification Report (After Optimization):
               precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.97      1.00      0.98        32

    accuracy                           0.97        39
   macro avg       0.98      0.93      0.95        39
weighted avg       0.98      0.97      0.97        39



In [23]:
logistic_classifier_before = LogisticRegression()

logistic_classifier_before.fit(X_train, y_train)

logistic_predictions_before = logistic_classifier_before.predict(X_test)

accuracy_before = accuracy_score(y_test, logistic_predictions_before)

print("Logistic Regression Accuracy (Before Optimization):", accuracy_before)
print("Logistic Regression Classification Report (Before Optimization):\n", classification_report(y_test, logistic_predictions_before))

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2'],
}

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_logistic_classifier = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'])

best_logistic_classifier.fit(X_train, y_train)

logistic_predictions_after = best_logistic_classifier.predict(X_test)

accuracy_after = accuracy_score(y_test, logistic_predictions_after)

print("\nLogistic Regression Accuracy (After Optimization):", accuracy_after)
print("Logistic Regression Classification Report (After Optimization):\n", classification_report(y_test, logistic_predictions_after))



Logistic Regression Accuracy (Before Optimization): 0.8974358974358975
Logistic Regression Classification Report (Before Optimization):
               precision    recall  f1-score   support

           0       1.00      0.43      0.60         7
           1       0.89      1.00      0.94        32

    accuracy                           0.90        39
   macro avg       0.94      0.71      0.77        39
weighted avg       0.91      0.90      0.88        39


Logistic Regression Accuracy (After Optimization): 0.8974358974358975
Logistic Regression Classification Report (After Optimization):
               precision    recall  f1-score   support

           0       1.00      0.43      0.60         7
           1       0.89      1.00      0.94        32

    accuracy                           0.90        39
   macro avg       0.94      0.71      0.77        39
weighted avg       0.91      0.90      0.88        39



In [20]:
knn_predictions = best_knn_classifier.predict(X_test)

knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions)
knn_recall = recall_score(y_test, knn_predictions)
knn_f1 = f1_score(y_test, knn_predictions)
knn_confusion = confusion_matrix(y_test, knn_predictions)

logistic_predictions = best_logistic_classifier.predict(X_test)

logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_precision = precision_score(y_test, logistic_predictions)
logistic_recall = recall_score(y_test, logistic_predictions)
logistic_f1 = f1_score(y_test, logistic_predictions)
logistic_confusion = confusion_matrix(y_test, logistic_predictions)

print("KNN Metrics:")
print("Accuracy:", knn_accuracy)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1-Score:", knn_f1)
print("Confusion Matrix:\n", knn_confusion)

print("\nLogistic Regression Metrics:")
print("Accuracy:", logistic_accuracy)
print("Precision:", logistic_precision)
print("Recall:", logistic_recall)
print("F1-Score:", logistic_f1)
print("Confusion Matrix:\n", logistic_confusion)


KNN Metrics:
Accuracy: 0.9743589743589743
Precision: 0.9696969696969697
Recall: 1.0
F1-Score: 0.9846153846153847
Confusion Matrix:
 [[ 6  1]
 [ 0 32]]

Logistic Regression Metrics:
Accuracy: 0.8974358974358975
Precision: 0.8888888888888888
Recall: 1.0
F1-Score: 0.9411764705882353
Confusion Matrix:
 [[ 3  4]
 [ 0 32]]


In [25]:
knn_classification_report = classification_report(y_test, knn_predictions, target_names=['Healthy', 'PD'], output_dict=True)

# Calculate classification report for Logistic Regression
logistic_classification_report = classification_report(y_test, logistic_predictions, target_names=['Healthy', 'PD'], output_dict=True)

# Function to compare metrics
def compare_metrics(metric, label):
    knn_metric = knn_classification_report[label][metric]
    logistic_metric = logistic_classification_report[label][metric]
    
    if knn_metric > logistic_metric:
        print(f"KNN has a higher {metric} for class {label}.")
    elif knn_metric < logistic_metric:
        print(f"Logistic Regression has a higher {metric} for class {label}.")
    else:
        print(f"Both KNN and Logistic Regression have the same {metric} for class {label}.")

# Compare metrics for class label 'Healthy'
compare_metrics("precision", 'Healthy')
compare_metrics("recall", 'Healthy')
compare_metrics("f1-score", 'Healthy')

print()

# Compare metrics for class label 'PD'
compare_metrics("precision", 'PD')
compare_metrics("recall", 'PD')
compare_metrics("f1-score", 'PD')


Both KNN and Logistic Regression have the same precision for class Healthy.
KNN has a higher recall for class Healthy.
KNN has a higher f1-score for class Healthy.

KNN has a higher precision for class PD.
Both KNN and Logistic Regression have the same recall for class PD.
KNN has a higher f1-score for class PD.


#  Discuss the results obtained (different metrics)

# Accuracy

# Both models have high accuracy, with KNN achieving approximately 97% and Logistic Regression achieving approximately 90%. 

# Precision

# KNN has a precision of approximately 97%, while Logistic Regression has a precision of approximately 89%. Precision measures the proportion of true positive predictions among all positive predictions. KNN has slightly better precision, indicating that it makes fewer false-positive predictions compared to Logistic Regression. 


# Recall

#  Both models have perfect recall (100%). Recall measures the proportion of true positive predictions among all actual positives. In this context, high recall means that both models correctly identify all individuals with PD. 


# F1-Score

# KNN has an F1-score of approximately 98%, while Logistic Regression has an F1-score of approximately 94%. The F1-score is the harmonic mean of precision and recall and provides a balanced measure of a model's performance. KNN has a higher F1-score, indicating a better overall balance between precision and recall. 

# Confusion Matrix

# The confusion matrix provides a more detailed view of the model's performance. For KNN, there are 6 true negatives, which correctly identified healthy individuals. 1 false positive shows a healthy individual misclassified as having PD. 32 true positives which correctly identified individuals with PD. For Logistic Regression, there are 3 true negatives, 4 false positives, and 32 true positives.



# KNN shows to have a slightly better precision and shows it may be more reliable in not misclassifying healthy individuals as having PD. Logistic Regression has a lower FPR which is importaint in clinical practices to avoid wrong and unnecessary concerns for healthy individuals. 

# For clinical Implications, KNN methods has a higher precision, and Logistic Regression have higher recall.Logistic Regression in terms of precision, F1-score will have better performance, while KNN is better at identifying PD with more accurate positive true and false rates than negatives.
