In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# from sklearn.metrics import PrecisionRecallDisplay

In [17]:
# Column labels are copied verbatim from the sheet header (the run of spaces is part of the label).
hcg_col = 'II    beta-HCG(mIU/mL)'
amh_col = 'AMH(ng/mL)'

# Load the "Full_new" sheet and clean it in a single non-mutating chain, so
# re-running this cell always starts again from the file on disk.
dataset2 = (
    pd.read_excel('./data/PCOS_data_without_infertility.xlsx', sheet_name="Full_new")
    # NOTE(review): 'Unnamed: 44' is presumably a stray header-less column — confirm against the sheet.
    .drop(columns='Unnamed: 44')
    .assign(**{
        # The cell holding the text '1.99.' (trailing period) is replaced by the number 1.99.
        hcg_col: lambda frame: frame[hcg_col]
        .map(lambda cell: 1.99 if cell == '1.99.' else cell)
        .astype(float),
        # The cell holding the text 'a' becomes NaN, so it is removed by dropna() below.
        amh_col: lambda frame: frame[amh_col]
        .map(lambda cell: np.nan if cell == 'a' else cell)
        .astype(float),
    })
    # Keep only rows with no missing value in any column.
    .dropna()
)

# Target vector.
y = dataset2['PCOS (Y/N)']

# Every remaining column is a feature except the two identifier columns and the target.
features = dataset2.columns.tolist()
for excluded in ('Sl. No', 'Patient File No.', 'PCOS (Y/N)'):
    features.remove(excluded)

# Feature matrix as a plain NumPy array, columns in the order given by `features`.
X = dataset2[features].to_numpy()

In [12]:
# Each entry pairs a display label with an unfitted estimator. Keeping the label
# next to its estimator means the two lists derived below cannot fall out of step.
model_specs = [
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
    (
        "Gaussian Process",
        GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=42),
    ),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    (
        "Random Forest",
        RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=42
        ),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(algorithm="SAMME", random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

# Parallel lists, index-aligned: names[i] labels classifiers[i].
names = [label for label, _ in model_specs]
classifiers = [estimator for _, estimator in model_specs]

# Without hyperparameter optimization, the linear SVM has the highest accuracy and F1-score of the models above.

In [41]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Fit every candidate model on the training split and report its F1-score and
# accuracy on the held-out split, one line per model: "<name> <f1> <accuracy>".
for name, clf in zip(names, classifiers):
    # Standardization is part of the pipeline, so the scaler is fit on the
    # training split only and then applied to the test split.
    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)
    # Predict once and derive both metrics from the same predictions.
    # (clf.score() would run predict() on the test set a second time.)
    y_pred = clf.predict(X_test)
    # accuracy: fraction of test rows whose predicted label equals the true label
    score = float(np.mean(y_pred == np.asarray(y_test)))
    # f1 score
    f1 = f1_score(y_test, y_pred)
    print(name, f1, score)



# display = PrecisionRecallDisplay.from_estimator(
#     classifier, X_test, y_test, name="LinearSVC", plot_chance_level=True
# )
# _ = display.ax_.set_title("2-class Precision-Recall curve")


Nearest Neighbors 0.7480916030534351 0.8472222222222222
Linear SVM 0.8175182481751825 0.8842592592592593
RBF SVM 0.0 0.6759259259259259
Gaussian Process 0.8029197080291971 0.875
Decision Tree 0.7083333333333334 0.8055555555555556
Random Forest 0.5544554455445545 0.7916666666666666
Neural Net 0.8027210884353742 0.8657407407407407
AdaBoost 0.7794117647058824 0.8611111111111112
Naive Bayes 0.6598984771573604 0.6898148148148148
QDA 0.7307692307692307 0.8055555555555556
