In [61]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# from matplotlib.colors import ListedColormap

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix

# from sklearn.metrics import PrecisionRecallDisplay

In [17]:
dataset2 = pd.read_excel('./data/PCOS_data_without_infertility.xlsx', sheet_name="Full_new")
dataset2.drop(columns = 'Unnamed: 44', inplace = True)
dataset2.loc[dataset2['II    beta-HCG(mIU/mL)'] == '1.99.', 'II    beta-HCG(mIU/mL)'] = 1.99
dataset2['II    beta-HCG(mIU/mL)'] = dataset2['II    beta-HCG(mIU/mL)'].astype(float)
dataset2.loc[dataset2['AMH(ng/mL)'] == 'a', 'AMH(ng/mL)'] = np.nan
dataset2['AMH(ng/mL)'] = dataset2['AMH(ng/mL)'].astype(float)
dataset2 = dataset2.dropna()

y = dataset2['PCOS (Y/N)']
features = list(dataset2.columns)
features.remove('Sl. No')
features.remove('Patient File No.')
features.remove('PCOS (Y/N)')
X = dataset2[features].values

In [12]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# Without hyperparameter optimization, linear SVM, Gaussian process and Neural Net have the highest accuracy and F1-score

## Scikit-learn's f1-score calculates something else

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

In [64]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

for name, clf in zip(names, classifiers):
    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)
    # accuracy
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    # f1 score
    f1 = f1_score(y_test, y_pred)
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    # true positive
    tp = cm[0,0]
    tn = cm[1,1]
    # false positive
    fp = cm[0,1]
    # false negative
    fn = cm[1,0]
    # Precision is the fraction of the correctly classified instances from the total classified instances
    precision = tp / (tp + fp)
    # Recall is the fraction of the correctly classified instances from the total classified instances
    recall = tp / (tp + fn)
    # F1-score
    f1s = 2 * precision * recall / (precision + recall)

    print(name, f1, f1s, 2*tp / (2*tp+fp+fn))




Nearest Neighbors 0.7480916030534351 0.8903654485049834 0.8903654485049833
Linear SVM 0.8175182481751825 0.9152542372881356 0.9152542372881356
RBF SVM 0.0 0.8066298342541436 0.8066298342541437
Gaussian Process 0.8029197080291971 0.9084745762711866 0.9084745762711864
Decision Tree 0.7083333333333334 0.8541666666666666 0.8541666666666666
Random Forest 0.5544554455445545 0.86404833836858 0.8640483383685801
Neural Net 0.8027210884353742 0.8982456140350877 0.8982456140350877
AdaBoost 0.7794117647058824 0.8986486486486487 0.8986486486486487
Naive Bayes 0.6598984771573604 0.7148936170212765 0.7148936170212766
QDA 0.7307692307692307 0.8478260869565217 0.8478260869565217


In [69]:
accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

df = pd.DataFrame(data = {'F1score': f1s}, index = [0])
df.index.max()

0

In [101]:
# from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate


# Define a function that compares the CV perfromance of a set of predetrmined models 
def cv_comparison(models, X, y, cv):
    # Initiate a DataFrame for the averages and a list for all measures
    cv_metrics = pd.DataFrame(columns = ['Model','Accuracy', 'Precision', 'Recall', 'F1_score', 'ROC_AUC', 'F1', 'DOF', 'Coefficients'])
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    rocaucs = []
    
    # Loop through the models, run a CV, add the average scores to the DataFrame and the scores of 
    # all CVs to the list
    for model in models:
        # calculate model complexity
        number_of_model_coeffs = 0
        try:
            for i in model.coef_:
                if i != 0:
                    number_of_model_coeffs +=1
            coefficients = model.coef_
        except:
            number_of_model_coeffs = np.nan
            coefficients = np.nan
        
        # get cross-validation metrics
        cv_results = cross_validate(model, X, y, 
                                    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], 
                                    cv=cv,n_jobs=4)
        
        # save metrics
        accuracy = np.round(cv_results['test_accuracy'].mean(),4)
        accuracies.append(cv_results['test_accuracy'])
        precision = np.round(cv_results['test_precision'].mean(),4)
        precisions.append(cv_results['test_precision'])
        recall = np.round(cv_results['test_recall'].mean(),4)
        recalls.append(cv_results['test_recall'])
        f1 = np.round(cv_results['test_f1'].mean(),4)
        f1_scores.append(cv_results['test_f1'])
        rocauc = np.round(cv_results['test_roc_auc'].mean(),4)
        rocaucs.append(cv_results['test_roc_auc'])
        # calculate F1 score again, as scikit learn function had weird results above 
        # checked and doesn't make a difference
        # f1s = 2 * precision * recall / (precision + recall)
        # summary dataframe
        df = pd.DataFrame(data = {'Model': str(model),
                                  'Accuracy': accuracy,
                                  'Precision': precision,
                                  'Recall': recall,
                                  'F1_score': f1,
                                  'ROC_AUC': rocauc,
                                #   'F1': f1s,
                                  'DOF': number_of_model_coeffs,
                                  'Coefficients': coefficients}, 
                                  index = [cv_metrics.index.max()+1])
        cv_metrics = pd.concat([cv_metrics, df], axis=0, ignore_index = True)

    return cv_metrics, accuracies, precisions, recalls, f1_scores, rocaucs

In [106]:
# Create the models to be tested
# mlr_reg = LinearRegression()
# ard_reg=ARDRegression(compute_score=True)
# #rf_reg = RandomForestRegressor(random_state=42)
# #xgb_reg = XGBRegressor(random_state=42)
# Put the models in a list to be used for Cross-Validation
models = classifiers

# def ridge_model(alpha):
#     ridgereg = Ridge(alpha=alpha, max_iter=int(1e6))
#     return ridgereg

# def lasso_model(alpha):
#     #Fit the model
#     lassoreg = Lasso(alpha=alpha, max_iter=int(1e6))
#     return lassoreg

#Define the alpha values to test
# alpha_lasso = [0.0001, 0.001, 0.01, 0.02, 0.03, 0.04, 0.05,0.075, 0.1,0.15, 0.2, 0.25, 0.5, 0.75, 1] 
# alpha_ridge = [1, 3, 4, 5, 6, 7, 10, 15, 20, 30, 50]

# for a in alpha_lasso:
#     models.append(lasso_model(a))
# for a in alpha_ridge:    
#     models.append(ridge_model(a))

# Run the Cross-Validation comparison with the models used in this analysis
# cv = n - leave one out cross validation
comp, accuracies, precisions, recalls, f1_scores, rocaucs = cv_comparison(models, StandardScaler().fit_transform(X_train), y_train, 5)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  f1s = 2 * precision * recall / (precision + recall)
  _warn_prf(average, modifier, msg_start, len(result))


In [103]:
comp

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_score,ROC_AUC,F1,DOF,Coefficients
0,KNeighborsClassifier(n_neighbors=3),0.8448,0.8063,0.7074,0.7494,0.8826,0.753619,,
1,"SVC(C=0.025, kernel='linear', random_state=42)",0.8758,0.8696,0.7359,0.7934,0.9511,0.797183,,
2,"SVC(C=1, gamma=2, random_state=42)",0.6708,0.0,0.0,0.0,0.6192,,,
3,GaussianProcessClassifier(kernel=1**2 * RBF(le...,0.8444,0.7053,0.6117,0.6544,0.9412,0.655174,,
4,"DecisionTreeClassifier(max_depth=5, random_sta...",0.832,0.7538,0.7346,0.7408,0.7937,0.744076,,
5,"RandomForestClassifier(max_depth=5, max_featur...",0.7952,0.8464,0.4429,0.5751,0.8886,0.58151,,
6,"MLPClassifier(alpha=1, max_iter=1000, random_s...",0.8819,0.8396,0.8117,0.8203,0.9476,0.825414,,
7,"AdaBoostClassifier(algorithm='SAMME', random_s...",0.8789,0.8103,0.8307,0.819,0.9491,0.820373,,
8,GaussianNB(),0.6989,0.5733,0.9056,0.6806,0.9031,0.702117,,
9,QuadraticDiscriminantAnalysis(),0.8106,0.7259,0.6896,0.7042,0.8058,0.707285,,


In [91]:
cv_results

{'fit_time': array([0.00076699, 0.00075388, 0.00118923, 0.00083423, 0.00111485]),
 'score_time': array([0.00794291, 0.02807808, 0.04851103, 0.04176569, 0.03201222]),
 'test_accuracy': array([0.66153846, 0.67692308, 0.640625  , 0.6875    , 0.671875  ]),
 'test_precision': array([0.44444444, 0.52      , 0.45      , 0.53846154, 0.5       ]),
 'test_recall': array([0.19047619, 0.59090909, 0.42857143, 0.33333333, 0.38095238]),
 'test_f1': array([0.26666667, 0.55319149, 0.43902439, 0.41176471, 0.43243243]),
 'test_roc_auc': array([0.55735931, 0.62737844, 0.60354374, 0.62236988, 0.64839424])}

In [84]:
from sklearn.metrics import SCORERS
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [75]:
StandardScaler().fit_transform(X_train)

array([[ 0.30103265, -1.2985318 , -2.42099129, ...,  0.55344535,
         1.36018065,  0.22998522],
       [-0.82827818,  0.51661109,  0.59858974, ..., -1.4909103 ,
        -0.18547918, -0.86199486],
       [-0.64005971, -0.10961321, -0.57569177, ...,  0.55344535,
         0.74191672, -1.76406536],
       ...,
       [-1.01649665, -0.32743036,  0.93409874, ...,  0.84549616,
         0.74191672,  0.08755303],
       [ 0.11281418,  1.06115395,  0.09532624, ...,  0.84549616,
         1.05104869, -0.24478873],
       [ 1.24212501,  0.45308108, -1.24670978, ...,  0.55344535,
        -2.04027098,  1.3219653 ]])