In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

**Data Preprocessing Part 1**

In [3]:
# Open the dataset and preview it.
# The CSV is read via a path relative to the kernel's working directory; the
# recorded output of this cell is a FileNotFoundError, so the file must sit next
# to the notebook (or the path be adjusted) for the rest of the notebook to run.
parkinsons_data = pd.read_csv('parkinsons_disease_data.csv')
print(parkinsons_data.shape)
parkinsons_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'parkinsons_disease_data.csv'

In [None]:
# Remove identifier columns and the clinical scores that already encode a
# Parkinson's assessment (UPDRS, MoCA, FunctionalAssessment).
columns_to_drop = [
    'PatientID',
    'DoctorInCharge',
    'UPDRS',
    'MoCA',
    'FunctionalAssessment',
]
pk_irr = parkinsons_data.drop(columns=columns_to_drop)



In [None]:
##########################Function for Anomaly Removal###########################
def remove_anomalies(data, feature):
    """Return `data` without the rows whose `feature` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of `data[feature]`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    feature : str
        Name of the column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` (original index and columns preserved) whose
        `feature` value is within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    """
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    LB = Q1 - (IQR*1.5)
    UB = Q3 + (IQR*1.5)
    is_outlier = (data[feature] < LB) | (data[feature] > UB)
    # Select the non-outlier rows. The previous `data - data[outliers]` performed
    # element-wise subtraction (yielding NaN/0 values) rather than removing rows.
    return data.loc[~is_outlier]
###################################################################################

In [None]:
# Separate the label column from the predictors
pk_target = pk_irr['Diagnosis']
pk_features = pk_irr.drop(columns=['Diagnosis'])

In [None]:
#Apply SMOTE for class imbalance correction
# Seed the sampler so the synthetic rows (and every metric computed downstream)
# are reproducible, consistent with the random_state=42 used for the split and
# the models in this notebook.
# NOTE(review): the resampling is applied before the train/test split, so
# synthetic rows interpolated from samples that later land in the test set can
# end up in the training set. Consider resampling the training portion only.
smote = SMOTE(random_state=42)
pk_x_smote, pk_y_smote = smote.fit_resample(pk_features, pk_target)

**Data Visualization**

In [None]:
# Custom palette for visualizations: one colour per Diagnosis value, passed as
# `palette=` wherever a plot is coloured with hue='Diagnosis'.
# Presumably 0 = no Parkinson's and 1 = Parkinson's - confirm against the dataset docs.
palette = {0: 'green', 1: 'red'}

In [None]:
# Pairwise feature correlations, plotted twice: the full matrix, then only the
# entries whose absolute value reaches the 0.5 cutoff (everything else is NaN).
data_correlation = pk_features.corr()
meets_cutoff = data_correlation.abs() >= 0.5
data_high_correlation = data_correlation[meets_cutoff]
plt.matshow(data_correlation)
plt.matshow(data_high_correlation)

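The heatmap above shows that no pair of distinct features has an absolute correlation at or above the cutoff we chose (0.5); the only entries that survive the cutoff lie on the diagonal, where each feature is compared with itself. Therefore, there was no need to remove any features on account of correlation.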

In [None]:
# Class balance of the original (pre-SMOTE) data: counts first, then a bar chart
diagnosis_counts = pk_irr['Diagnosis'].value_counts()
print(diagnosis_counts)
plt.figure(figsize=(6, 6))
sns.countplot(data=pk_irr, x='Diagnosis', hue='Diagnosis', palette=palette)
plt.show()

In [None]:
# Scatter plots of selected feature pairs, coloured by Diagnosis.
# The twelve plots differ only in their columns and labels, so they are drawn
# by one helper driven by a table instead of twelve copy-pasted blocks.
def plot_feature_scatter(data, x_col, y_col, x_label, y_label):
    """Draw one scatter plot of `y_col` against `x_col`, coloured by Diagnosis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `x_col`, `y_col` and a 'Diagnosis' column.
    x_col, y_col : str
        Column names plotted on the x and y axes.
    x_label, y_label : str
        Human-readable axis labels; also used to build the plot title.
    """
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=x_col, y=y_col, data=data, hue='Diagnosis', palette=palette)
    plt.title(f'Scatter Plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


# (x column, y column, x-axis label, y-axis label), in display order
SCATTER_PAIRS = [
    ('Age', 'BMI', 'Age', 'BMI'),
    ('Smoking', 'BMI', 'Smoking', 'BMI'),
    ('PhysicalActivity', 'BMI', 'Physical Activity', 'BMI'),
    ('AlcoholConsumption', 'Smoking', 'Alcohol Consumption', 'Smoking'),
    ('DietQuality', 'AlcoholConsumption', 'Diet Quality', 'Alcohol Consumption'),
    ('Age', 'Smoking', 'Age', 'Smoking'),
    ('DietQuality', 'PhysicalActivity', 'Diet Quality', 'Physical Activity'),
    ('BMI', 'PhysicalActivity', 'BMI', 'Physical Activity'),
    ('Age', 'Hypertension', 'Age', 'Hypertension'),
    ('BMI', 'SystolicBP', 'BMI', 'Systolic BP'),
    ('CholesterolTotal', 'CholesterolLDL', 'Cholesterol Total', 'Cholesterol LDL'),
    ('SystolicBP', 'DiastolicBP', 'Systolic BP', 'Diastolic BP'),
]

for x_col, y_col, x_label, y_label in SCATTER_PAIRS:
    plot_feature_scatter(pk_irr, x_col, y_col, x_label, y_label)


There was a considerable class imbalance: most of the data points in the dataset belong to the positive class (patients with Parkinson's). This was corrected by applying SMOTE.

**Data Preprocessing Part 2**

In [None]:
# Standardise every feature of the resampled data (fit and transform in two explicit steps)
standard_scale = StandardScaler()
standard_scale.fit(pk_x_smote)
st_pk_x = standard_scale.transform(pk_x_smote)

In [None]:
#Split Training and Testing Set
# Split the *unscaled* resampled data first, then fit the scaler on the training
# rows only. Fitting StandardScaler on the full set before splitting lets the
# test rows' mean and variance leak into the features the models are trained on.
# train_size and random_state are unchanged.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    pk_x_smote, pk_y_smote, train_size=.8, random_state=42
)
standard_scale = StandardScaler()
x_train = standard_scale.fit_transform(x_train_raw)  # statistics learned from training rows only
x_test = standard_scale.transform(x_test_raw)        # test rows reuse the training statistics

**Feature Selection**

In [None]:


#Recursive Feature Elimination
# A logistic regression drives the elimination because it is simple to train.
model = LogisticRegression(max_iter=1200, random_state=42)
elim_model = model  # both names refer to the same estimator object

feature_elim = RFE(estimator=elim_model, n_features_to_select=25)
feature_elim.fit(x_train, y_train)

# Boolean mask of the retained columns; the same subset is used by all later models
top_features = feature_elim.support_
x_train_s, x_test_s = x_train[:, top_features], x_test[:, top_features]

# Column names of the retained features, for plotting
feature_names = pk_features.columns.tolist()
selected_feature_names = [
    column
    for column, is_kept in zip(feature_names, feature_elim.support_)
    if is_kept
]

In [None]:
# Display the names of the features retained by RFE
selected_feature_names

These are the 25 features (`n_features_to_select=25`) that the feature elimination process above selected as the most important.

**First Model Implementation - Simple Logistic Regression**

In [None]:
# Two identically configured logistic regressions: `logreg` is trained on the
# RFE-selected columns, `logreg_b` on every column as a baseline for comparison.
logreg = LogisticRegression(random_state=42)
logreg_b = LogisticRegression(random_state=42)

logreg.fit(x_train_s, y_train)
logreg_b.fit(x_train, y_train)

In [None]:
# Predict on the held-out test set; each model gets the feature matrix
# matching the one it was trained on.
logreg_b_pred = logreg_b.predict(x_test)
logreg_pred = logreg.predict(x_test_s)

**First Model Evaluation - Simple Logistic Regression**

In [None]:
# Metrics computed for both logistic regression variants, in this order
lr_scorers = (accuracy_score, precision_score, recall_score, f1_score)

# --- Model trained on the RFE-selected features ---
lr_accuracy, lr_precision, lr_recall, lr_f1 = [
    scorer(y_test, logreg_pred) for scorer in lr_scorers
]
lr_confusion_matrix = confusion_matrix(y_test, logreg_pred)
print("With Feature Selection:")
print(f"Accuracy: {lr_accuracy}")
print(f"Precision: {lr_precision}")
print(f"Recall: {lr_recall}")
print(f"f1: {lr_f1}")
plt.figure(figsize=(8, 6))
sns.heatmap(
    lr_confusion_matrix,
    annot=True,
    fmt='g',
    cmap='Purples',
    cbar=False,
)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

# --- Baseline model trained on all features ---
lr_b_accuracy, lr_b_precision, lr_b_recall, lr_b_f1 = [
    scorer(y_test, logreg_b_pred) for scorer in lr_scorers
]
lr_b_confusion_matrix = confusion_matrix(y_test, logreg_b_pred)
print("\nWithout Feature Selection:")
print(f"Accuracy: {lr_b_accuracy}")
print(f"Precision: {lr_b_precision}")
print(f"Recall: {lr_b_recall}")
print(f"f1: {lr_b_f1}")
plt.figure(figsize=(8, 6))
sns.heatmap(
    lr_b_confusion_matrix,
    annot=True,
    fmt='g',
    cmap='Purples',
    cbar=False,
)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Logistic Regression Confusion Matrix (No Feature Selection)')
plt.show()

**First Model Visualization - Model Weights**

In [None]:
# Coefficient tables for both logistic regressions, sorted from the largest to
# the smallest weight.
logreg_thetas = (
    pd.DataFrame({"Feature": selected_feature_names, "Thetas": logreg.coef_[0]})
    .sort_values(by="Thetas", ascending=False)
)
logreg_b_thetas = (
    pd.DataFrame({"Feature": feature_names, "Thetas": logreg_b.coef_[0]})
    .sort_values(by="Thetas", ascending=False)
)

# Positive weights are drawn in green, zero or negative weights in red
logreg_colors = ["green" if coef > 0 else "red" for coef in logreg_thetas["Thetas"]]
logreg_b_colors = ["green" if coef > 0 else "red" for coef in logreg_b_thetas["Thetas"]]

# One bar chart per model: feature-selected first, then the all-features baseline
weight_charts = (
    (logreg_thetas, logreg_colors, "Logistic Regression Feature Importance (Feature Selected)"),
    (logreg_b_thetas, logreg_b_colors, "Logistic Regression Feature Importance (No Feature Selection)"),
)
for weight_table, bar_colors, chart_title in weight_charts:
    plt.figure(figsize=(8, 6))
    plt.bar(weight_table["Feature"], weight_table["Thetas"], color=bar_colors)
    plt.axhline(0, color='black', linestyle='--')
    plt.title(chart_title)
    plt.xlabel("Features")
    plt.ylabel("Theta Value")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

These visualizations reveal a large influence from the symptom features — notably tremor, rigidity, bradykinesia, and depression. In the model that is *not* feature selected (the second graph above), all other features appear to be reasonably important for determining the diagnosis in our linear models, except for Ethnicity and LDL Cholesterol.

**Second Model Implementation - SVM With Gaussian Kernel**

In [None]:
# Grid-search an RBF-kernel SVM over C and gamma, scored by accuracy.
# `svm` is tuned on the RFE-selected columns, `svm_b` on every column for comparison.
svm_params = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1, 10],
}

svm = GridSearchCV(
    estimator=SVC(kernel='rbf', random_state=42),
    param_grid=svm_params,
    scoring='accuracy',
)
svm_b = GridSearchCV(
    estimator=SVC(kernel='rbf', random_state=42),
    param_grid=svm_params,
    scoring='accuracy',
)

svm.fit(x_train_s, y_train)
svm_b.fit(x_train, y_train)

In [None]:
# Predict on the held-out test set with each fitted grid search, using the
# feature matrix matching the one it was trained on.
svm_b_pred = svm_b.predict(x_test)
svm_pred = svm.predict(x_test_s)

**Second Model Evaluation - SVM With Gaussian Kernel**

In [None]:
# With Feature Selection
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
svm_confusion_matrix = confusion_matrix(y_test, svm_pred)
print("With Feature Selection:")
print("Accuracy: " + str(svm_accuracy))
print("Precision: " + str(svm_precision))
print("Recall: " + str(svm_recall))
print("F1 Score: " + str(svm_f1))
plt.figure(figsize=(8, 6))
sns.heatmap(svm_confusion_matrix, annot=True, fmt='g', cmap='Purples', cbar=False)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Support Vector Machine Confusion Matrix (With Feature Selection)')
plt.show()

# Without Feature Selection
svm_b_accuracy = accuracy_score(y_test, svm_b_pred)
svm_b_precision = precision_score(y_test, svm_b_pred)
svm_b_recall = recall_score(y_test, svm_b_pred)
svm_b_f1 = f1_score(y_test, svm_b_pred)
svm_b_confusion_matrix = confusion_matrix(y_test, svm_b_pred) 
print("\nWithout Feature Selection:")
print("Accuracy: " + str(svm_b_accuracy))
print("Precision: " + str(svm_b_precision))
print("Recall: " + str(svm_b_recall))
print("F1 Score: " + str(svm_b_f1))
plt.figure(figsize=(8, 6))
sns.heatmap(svm_b_confusion_matrix, annot=True, fmt='g', cmap='Purples', cbar=False)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Support Vector Machine Confusion Matrix (Without Feature Selection)')
plt.show()

It appears that this model does not benefit from the feature selection technique that was utilized in logistic regression; in fact, removing features seems to make the model worse. Since Gaussian Kernel SVMs leverage dimensionality, this makes sense from a theoretical standpoint.

**Second Model Visualization - TO DO**

**TODO:** Insert a visualization for the SVM model here.

**Third Model Implementation - Decision Tree**