In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Load the speech-feature table from CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks like a Colab upload
# location - confirm, and consider making it configurable so the notebook
# also runs outside that environment.
df = pd.read_csv("/content/pd_speech_features.csv")
# Preview the first rows.
df.head()

In [None]:
# Number of rows in the table.
len(df)

In [None]:
# Per-column summary statistics (pandas `describe` defaults).
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Column labels.
df.columns

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Same preview as right after loading; no cell in between modifies df.
df.head()

In [None]:
sns.distplot(df["gender"], color = "Green");

In [None]:
pd.crosstab(df["class"], df["gender"])

In [None]:
condition = ["Does not have Parkinson's","Does have Parkinson's"]

have_or_not = df["class"].value_counts().tolist()
values = [have_or_not[0], have_or_not[1]]

fig = px.pie(values = df["class"].value_counts(), names = condition, width = 800, height = 400, color_discrete_sequence = ["skyblue", "violet"], title = "Percentage whether patient has Parkinson's or not")
fig.show()

In [None]:
df["class"].value_counts().plot(kind = "bar", color = ["skyblue", "lightgreen"]);

In [None]:
sns.distplot(df["numPulses"], color = "maroon");

In [None]:
sns.histplot(df["numPeriodsPulses"], color = "skyblue");

In [None]:
sns.distplot(df["meanPeriodPulses"], color = "indigo");

In [None]:
def evaluate(models, X_train, X_test, y_train, y_test):
    """Fit every estimator on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. Estimators are fitted in place.
    X_train, y_train
        Data passed to each estimator's ``fit``.
    X_test, y_test
        Data passed to each estimator's ``score``.

    Returns
    -------
    dict
        Maps each name in ``models`` to the value returned by ``score``,
        in the iteration order of ``models``.

    Notes
    -----
    Reseeds NumPy's global random generator with 42 before any fitting.
    """
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Fit first, then score; the return value of fit is deliberately ignored.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

In [None]:
# Features are every column except the label; the label is the "class" column.
# NOTE(review): if df carries a subject identifier column, confirm whether it
# should really be used as a feature - TODO verify against df.columns.
X = df.drop("class", axis = 1)
y = df["class"]

# Hold out 20% of the rows for testing.
# random_state pins the shuffle: no seed is set before this cell, so without
# it every Restart & Run All produced a different split and different scores.
# stratify keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
models = {
            "Linear SVM": LinearSVC(),
            "Random Forest": RandomForestClassifier(),
            "KNeighborsClassifier": KNeighborsClassifier(),
            "Bagging Classifier": BaggingClassifier()
         }

In [None]:
model_scores = evaluate(models = models,
                        X_train = X_train,
                        X_test = X_test,
                        y_train = y_train,
                        y_test = y_test)

model_scores

In [None]:
model_scores = model_scores.values()
model_scores = list(model_scores)

svc = model_scores[0]
random_forest = model_scores[1]
neighbors = model_scores[2]
bagging = model_scores[3]

In [None]:
model_scores

In [None]:
print(f"Linear SVM: {svc * 100:.2f}%")
print(f"Random Forest Classifier: {random_forest * 100:.2f}%")
print(f"KNeighbors Classifier: {neighbors * 100:.2f}%")
print(f"Bagging Classifier: {bagging * 100:.2f}%")

In [None]:
np.random.seed(42)

model = RandomForestClassifier(n_estimators = 100)
model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
y_preds = model.predict(X_test)

In [None]:
def model_report(y_test, y_preds):
    """Print a fixed set of metrics comparing true labels with predictions.

    Parameters
    ----------
    y_test
        True labels.
    y_preds
        Predicted labels.

    Accuracy is printed as a percentage with two decimals; every other metric
    is printed as the raw value returned by the scikit-learn function, called
    with its default arguments. Returns None.

    NOTE(review): R2, MAE and MSE are regression metrics; confirm they are
    wanted in a classification report.
    """
    print(f"Accuracy: {accuracy_score(y_test, y_preds) * 100:.2f}%")

    # Remaining metrics share one output format, so drive them from a table.
    raw_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("R2 Score", r2_score),
        ("F1 Score", f1_score),
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
    )
    for label, metric in raw_metrics:
        print(f"{label}: {metric(y_test, y_preds)}")
model_report(y_test, y_preds)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true = y_test, 
                                        y_pred = y_preds);

In [None]:
np.random.seed(42)

bagging_model = BaggingClassifier()
bagging_model.fit(X_train, y_train)

bagging_model.score(X_test, y_test)

In [None]:
bagging_preds = bagging_model.predict(X_test)

In [None]:
model_report(y_test, bagging_preds)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true = y_test, 
                                        y_pred = bagging_preds);

In [None]:
# Class-probability estimates from the random forest for the held-out rows.
y_probs = model.predict_proba(X_test)
# Keep the second column only.
# NOTE(review): presumably this is the probability of the positive
# (Parkinson's) class - confirm against model.classes_.
y_probs_positive = y_probs[:, 1]
# Show the first ten probabilities.
y_probs_positive[:10]

In [None]:
# Area under the ROC curve computed from those probabilities.
roc_auc_score(y_test, y_probs_positive)

In [None]:
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr
        False positive rates, plotted on the x axis.
    tpr
        True positive rates, plotted on the y axis.

    The curve is drawn in green with the legend entry "ROC"; axis labels,
    a title and the legend are added. Returns None.
    """
    # Same axes the pyplot state machine would draw on.
    ax = plt.gca()
    ax.plot(fpr, tpr, color = "green", label = "ROC")

    ax.set_xlabel("False positive rate (fpr)")
    ax.set_ylabel("True positive rate (tpr)")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend()

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)
plot_roc_curve(fpr, tpr)

In [None]:
ConfusionMatrixDisplay.from_estimator(estimator = model, X = X, y = y);

In [None]:
# Confusion matrix for the random-forest predictions on the held-out split.
# This is the same call that follows the first random-forest report earlier
# in the notebook.
ConfusionMatrixDisplay.from_predictions(y_true = y_test, 
                                        y_pred = y_preds);

In [None]:
# Metric report for the random-forest predictions; repeats the call made in
# the cell that defines model_report.
model_report(y_test, y_preds)