### No SMOTE, Random State = 42, All ML Algo

In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

# Load the CSV file
df = pd.read_csv("C:\\Users\\Lenovo\\Downloads\\archive (2)\\pd_speech_features.csv")

# Extract features and labels
features = df.iloc[:, 1:-1].values
labels = df.iloc[:, -1].values

print("Parkinson Disease:", labels[labels == 1].shape[0], "Healthy:", labels[labels == 0].shape[0])

# Scale the features
scaler = MinMaxScaler((-1, 1))
x = scaler.fit_transform(features)
y = labels

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create the XGBoost classifier
xgb_model = XGBClassifier(eval_metric='mlogloss')

# Train the XGBoost model
xgb_model.fit(x_train, y_train)

# Make predictions on the test set using XGBoost
xgb_y_pred = xgb_model.predict(x_test)

# Calculate the accuracy score for XGBoost
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print("XGBoost Accuracy:", xgb_accuracy * 100)

# Create a confusion matrix for XGBoost
xgb_confusion = confusion_matrix(y_test, xgb_y_pred)

# Calculate F1 score for XGBoost
xgb_f1 = f1_score(y_test, xgb_y_pred)
print("XGBoost F1 Score:", xgb_f1)

# Calculate True Positive Rate (TPR) and False Positive Rate (FPR) for XGBoost
xgb_TP = xgb_confusion[1, 1]
xgb_FN = xgb_confusion[1, 0]
xgb_FP = xgb_confusion[0, 1]
xgb_TN = xgb_confusion[0, 0]

xgb_TPR = xgb_TP / (xgb_TP + xgb_FN)
xgb_FPR = xgb_FP / (xgb_FP + xgb_TN)

print("XGBoost True Positive Rate (TPR):", xgb_TPR)
print("XGBoost False Positive Rate (FPR):", xgb_FPR)

# Calculate ROC-AUC Score for XGBoost
xgb_roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(x_test)[:, 1])
print("XGBoost ROC-AUC Score:", xgb_roc_auc)

# Create a DataFrame for the confusion matrix of XGBoost
xgb_confusion_df = pd.DataFrame(xgb_confusion, columns=['Predicted Healthy', 'Predicted Parkinsons'], index=['True Healthy', 'True Parkinsons'])
print(xgb_confusion_df)
print("---------------------------------------")

# Split the data into training and testing sets for other models
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Models
models = {
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=30, max_depth=10, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate models
for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    tpr = confusion[1, 1] / (confusion[1, 1] + confusion[1, 0])
    fpr = confusion[0, 1] / (confusion[0, 1] + confusion[0, 0])
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print(f"True Positive Rate (TPR): {tpr:.2f}")
    print(f"False Positive Rate (FPR): {fpr:.2f}")
    print("Confusion Matrix:")
    print(confusion)
    print("---------------------------------------")


Parkinson Disease: 564 Healthy: 192
XGBoost Accuracy: 88.1578947368421
XGBoost F1 Score: 0.9243697478991597
XGBoost True Positive Rate (TPR): 0.9649122807017544
XGBoost False Positive Rate (FPR): 0.3684210526315789
XGBoost ROC-AUC Score: 0.9162049861495845
                 Predicted Healthy  Predicted Parkinsons
True Healthy                    24                    14
True Parkinsons                  4                   110
---------------------------------------
Model: SVM
Accuracy: 0.83
F1 Score: 0.89
ROC-AUC Score: 0.69
True Positive Rate (TPR): 0.99
False Positive Rate (FPR): 0.62
Confusion Matrix:
[[ 23  38]
 [  1 165]]
---------------------------------------
Model: KNN
Accuracy: 0.86
F1 Score: 0.91
ROC-AUC Score: 0.79
True Positive Rate (TPR): 0.94
False Positive Rate (FPR): 0.36
Confusion Matrix:
[[ 39  22]
 [ 10 156]]
---------------------------------------
Model: Random Forest
Accuracy: 0.85
F1 Score: 0.91
ROC-AUC Score: 0.74
True Positive Rate (TPR): 0.98
False Positive Rate 

###  SMOTE, Random State = 42, All ML Algo

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import pandas as pd
from xgboost import XGBClassifier

# Load the CSV file
df = pd.read_csv("C:\\Users\\Lenovo\\Downloads\\archive (2)\\pd_speech_features.csv")

# Extract features and labels
features = df.iloc[:, 1:-1].values
labels = df.iloc[:, -1].values

print("Parkinson Disease:", labels[labels == 1].shape[0], "Healthy:", labels[labels == 0].shape[0])

# Scale the features to a similar range (e.g., [0, 1]) using MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
features_resampled, labels_resampled = smote.fit_resample(features_scaled, labels)

# Split the data into training and testing sets with random split
x_train, x_test, y_train, y_test = train_test_split(features_resampled, labels_resampled, test_size=0.3, random_state=42)

# Models
models = {
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=30, max_depth=10, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# Train and evaluate models
for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    tpr = confusion[1, 1] / (confusion[1, 1] + confusion[1, 0])
    fpr = confusion[0, 1] / (confusion[0, 1] + confusion[0, 0])
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print(f"True Positive Rate (TPR): {tpr:.2f}")
    print(f"False Positive Rate (FPR): {fpr:.2f}")
    print("Confusion Matrix:")
    print(confusion)
    print("---------------------------------------")


Parkinson Disease: 564 Healthy: 192
Model: SVM
Accuracy: 0.81
F1 Score: 0.82
ROC-AUC Score: 0.81
True Positive Rate (TPR): 0.81
False Positive Rate (FPR): 0.18
Confusion Matrix:
[[135  30]
 [ 33 141]]
---------------------------------------
Model: KNN
Accuracy: 0.89
F1 Score: 0.88
ROC-AUC Score: 0.89
True Positive Rate (TPR): 0.79
False Positive Rate (FPR): 0.00
Confusion Matrix:
[[165   0]
 [ 37 137]]
---------------------------------------
Model: Random Forest
Accuracy: 0.96
F1 Score: 0.96
ROC-AUC Score: 0.96
True Positive Rate (TPR): 0.93
False Positive Rate (FPR): 0.02
Confusion Matrix:
[[162   3]
 [ 12 162]]
---------------------------------------
Model: Logistic Regression
Accuracy: 0.88
F1 Score: 0.88
ROC-AUC Score: 0.88
True Positive Rate (TPR): 0.83
False Positive Rate (FPR): 0.07
Confusion Matrix:
[[154  11]
 [ 30 144]]
---------------------------------------
Model: Naive Bayes
Accuracy: 0.79
F1 Score: 0.79
ROC-AUC Score: 0.79
True Positive Rate (TPR): 0.80
False Positive Rat

###  SMOTE, PCA, Random State = 42, All ML Algo

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import pandas as pd
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

# Load the CSV file
df = pd.read_csv("C:\\Users\\Lenovo\\Downloads\\archive (2)\\pd_speech_features.csv")

# Extract features and labels
features = df.iloc[:, 1:-1].values
labels = df.iloc[:, -1].values

print("Total Number of Features:", features.shape[1])

# Scale the features to a similar range (e.g., [0, 1]) using MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of the variance
features_pca = pca.fit_transform(features_scaled)

print("Number of Features after PCA:", features_pca.shape[1])
print("Number of Features Removed:", features.shape[1] - features_pca.shape[1])

# Apply SMOTE to balance the data using PCA-transformed features
smote = SMOTE(random_state=42)
features_resampled_pca, labels_resampled = smote.fit_resample(features_pca, labels)

# Split the data into training and testing sets with random split using PCA-transformed features
x_train_pca, x_test_pca, y_train, y_test = train_test_split(features_resampled_pca, labels_resampled, test_size=0.3, random_state=42)

# Models
models = {
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=30, max_depth=10, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# Train and evaluate models using PCA-transformed features
for model_name, model in models.items():
    model.fit(x_train_pca, y_train)
    y_pred = model.predict(x_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    tpr = confusion[1, 1] / (confusion[1, 1] + confusion[1, 0])
    fpr = confusion[0, 1] / (confusion[0, 1] + confusion[0, 0])

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print(f"True Positive Rate (TPR): {tpr:.2f}")
    print(f"False Positive Rate (FPR): {fpr:.2f}")
    print("Confusion Matrix:")
    print(confusion)
    print("---------------------------------------")


Total Number of Features: 753
Number of Features after PCA: 116
Number of Features Removed: 637
Model: SVM
Accuracy: 0.95
F1 Score: 0.95
ROC-AUC Score: 0.95
True Positive Rate (TPR): 0.92
False Positive Rate (FPR): 0.02
Confusion Matrix:
[[161   4]
 [ 14 160]]
---------------------------------------
Model: KNN
Accuracy: 0.89
F1 Score: 0.88
ROC-AUC Score: 0.90
True Positive Rate (TPR): 0.79
False Positive Rate (FPR): 0.00
Confusion Matrix:
[[165   0]
 [ 36 138]]
---------------------------------------
Model: Random Forest
Accuracy: 0.92
F1 Score: 0.92
ROC-AUC Score: 0.92
True Positive Rate (TPR): 0.92
False Positive Rate (FPR): 0.07
Confusion Matrix:
[[153  12]
 [ 14 160]]
---------------------------------------
Model: Logistic Regression
Accuracy: 0.86
F1 Score: 0.86
ROC-AUC Score: 0.87
True Positive Rate (TPR): 0.80
False Positive Rate (FPR): 0.07
Confusion Matrix:
[[153  12]
 [ 34 140]]
---------------------------------------
Model: Naive Bayes
Accuracy: 0.79
F1 Score: 0.78
ROC-AUC S

### SMOTE, Stratified, All ML 

In [15]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import pandas as pd
from xgboost import XGBClassifier

# Load the CSV file
df = pd.read_csv("C:\\Users\\Lenovo\\Downloads\\archive (2)\\pd_speech_features.csv")

# Extract features and labels
features = df.iloc[:, 1:-1].values
labels = df.iloc[:, -1].values

print("Parkinson Disease:", labels[labels == 1].shape[0], "Healthy:", labels[labels == 0].shape[0])

# Scale the features to a similar range (e.g., [0, 1]) using MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
features_resampled, labels_resampled = smote.fit_resample(features_scaled, labels)

# Split the data into training and testing sets with stratified split
x_train, x_test, y_train, y_test = train_test_split(features_resampled, labels_resampled, test_size=0.3, stratify=labels_resampled)

# Models
models = {
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=30, max_depth=10),
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear'),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# Train and evaluate models
for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    tpr = confusion[1, 1] / (confusion[1, 1] + confusion[1, 0])
    fpr = confusion[0, 1] / (confusion[0, 1] + confusion[0, 0])
    
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print(f"True Positive Rate (TPR): {tpr:.2f}")
    print(f"False Positive Rate (FPR): {fpr:.2f}")
    print("Confusion Matrix:")
    print(confusion)
    print("---------------------------------------")


Parkinson Disease: 564 Healthy: 192
Model: SVM
Accuracy: 0.82
F1 Score: 0.81
ROC-AUC Score: 0.82
True Positive Rate (TPR): 0.78
False Positive Rate (FPR): 0.14
Confusion Matrix:
[[147  23]
 [ 38 131]]
---------------------------------------
Model: KNN
Accuracy: 0.81
F1 Score: 0.77
ROC-AUC Score: 0.81
True Positive Rate (TPR): 0.64
False Positive Rate (FPR): 0.02
Confusion Matrix:
[[167   3]
 [ 61 108]]
---------------------------------------
Model: Random Forest
Accuracy: 0.93
F1 Score: 0.93
ROC-AUC Score: 0.93
True Positive Rate (TPR): 0.93
False Positive Rate (FPR): 0.06
Confusion Matrix:
[[159  11]
 [ 12 157]]
---------------------------------------
Model: Logistic Regression
Accuracy: 0.87
F1 Score: 0.87
ROC-AUC Score: 0.87
True Positive Rate (TPR): 0.86
False Positive Rate (FPR): 0.12
Confusion Matrix:
[[149  21]
 [ 24 145]]
---------------------------------------
Model: Naive Bayes
Accuracy: 0.75
F1 Score: 0.73
ROC-AUC Score: 0.75
True Positive Rate (TPR): 0.69
False Positive Rat