In [65]:
!pip install scikit-optimize



In [66]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix


from google.colab import drive
from google.colab import files

from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from skopt import BayesSearchCV

LINK TO DATASET - https://www.kaggle.com/datasets/prasoonkottarathil/polycystic-ovary-syndrome-pcos

The PCOS_data_without_infertility dataset was cleaned as follows:
- BMI, FSH/LH, and waist/hip ratio were recalculated
- Replaced the malformed value "1.99." with "1.99" in column 'II    beta-HCG(mIU/mL)', row 125
- Extra empty rows and columns were deleted to prevent NaN errors

In [78]:
# Attach Google Drive to the Colab filesystem under /content/drive/.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [79]:
# Path of the dataset CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/ML for Medicine/ML/with_missing_values1.csv"

# Read the CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [69]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
# Useful here for spotting which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

In [70]:
# --- Data preparation ---------------------------------------------------------
# Every data-dependent step below (imputation, resampling, feature selection) is
# fitted on the TRAINING split only. Fitting them on the full dataset before the
# split leaks information from the test rows and puts synthetic / filtered
# samples into the test set, which inflates every reported metric.

# Drop identifier columns. errors='ignore' keeps this cell re-runnable after the
# columns have already been removed from df.
df = df.drop(columns=['Sl. No', 'Patient File No.'], errors='ignore')

# Define predictors and target. The target is deliberately NOT imputed: a missing
# label must never be filled in with a median value.
X = df.drop(columns=['PCOS (Y/N)'], errors='ignore')
y = df['PCOS (Y/N)']

# Split first, stratified so both splits keep the original class ratio.
# The test split is left untouched: real patients only, natural class balance.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fill missing values with the column median. Medians are learned from the
# training rows and then applied unchanged to the test rows.
# (Assumes no feature column is entirely empty in the training split, otherwise
# the imputer would drop it and the column labels below would not line up.)
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Fix class imbalance on the training data only. SMOTEENN uses nearest
# neighbours to create synthetic minority-class samples (PCOS = 1) and then
# removes ambiguous points whose neighbours mostly belong to the other class.
X_train_balanced, y_train = SMOTEENN(random_state=42).fit_resample(
    X_train_imputed, y_train_raw
)

# Select predictors with Recursive Feature Elimination. The base model is a
# random forest; its impurity-based importances rank the features, and RFE
# repeatedly refits and discards the least important one until 30 remain.
base = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(base, n_features_to_select=30)
X_train = rfe.fit_transform(X_train_balanced, y_train)  # learn the 30 features on training data
X_test = rfe.transform(X_test_imputed)                  # keep the same 30 columns in the test data

# --- Stacked ensemble and Bayesian hyperparameter search -----------------------

# Level-0 learners: each one is trained on the features and its predictions
# become the inputs of the meta-learner.
rf_clf = RandomForestClassifier(random_state=42)
nb_clf = GaussianNB()
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
ada_clf = AdaBoostClassifier(random_state=42)
base_models = [('rf', rf_clf), ('nb', nb_clf), ('xgb', xgb_clf), ('ada', ada_clf)]

# Level-1 (meta) learner: learns how to best combine the base models' predictions.
meta_learner = RandomForestClassifier(random_state=42)

# Stacked model. With passthrough=False the meta-learner is fed only the base
# models' (5-fold cross-validated) predictions, never the raw features.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    passthrough=False,
)

# Search space for the Bayesian optimisation: each tuple is an inclusive
# (low, high) integer range for the named hyperparameter.
param_grid = {
    'rf__n_estimators': (50, 200),
    'rf__max_depth': (3, 15),
    'final_estimator__n_estimators': (50, 200)
}

# BayesSearchCV repeatedly proposes a parameter setting, scores the stacked
# model with 3-fold cross-validated accuracy, updates its probabilistic model of
# the score surface, and proposes the next setting; 10 settings are tried.
opt = BayesSearchCV(
    estimator=stacked_model,
    search_spaces=param_grid,
    scoring='accuracy',
    cv=3,
    n_iter=10,
    random_state=42,
    verbose=0,
)

# Run the search; this fits the stacked model (and its inner CV) once per fold per setting.
opt.fit(X_train, y_train)

# Keep the refitted winner and the parameters that produced it.
best_params = opt.best_params_
best_model = opt.best_estimator_
print("Best model:", best_model)
print("Best Parameters:", best_params)

# --- Model evaluation on the held-out test split -------------------------------
y_pred = best_model.predict(X_test)               # hard class labels from the tuned stacked model
y_score = best_model.predict_proba(X_test)[:, 1]  # predicted probability of the positive class (label 1)

# Threshold-dependent metrics are computed from the hard labels.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is a ranking metric and must be computed from continuous scores.
# Passing 0/1 predictions collapses the ROC curve to a single threshold and
# reports balanced accuracy rather than the true area under the curve.
auc = roc_auc_score(y_test, y_score)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nStacked ML (RFE + RF meta-learner) Performance:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print("\nDetailed Report:\n", classification_report(y_test, y_pred))
print(f"Confusion Matrix:\n{conf_matrix}")

Best model: StackingClassifier(cv=5,
                   estimators=[('rf',
                                RandomForestClassifier(max_depth=14,
                                                       n_estimators=96,
                                                       random_state=42)),
                               ('nb', GaussianNB()),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric='logloss