In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import catboost
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
# Train the model
model = catboost.CatBoostClassifier()
model.fit(X, y)

Learning rate set to 0.012392
0:	learn: 0.6772579	total: 61.4ms	remaining: 1m 1s
1:	learn: 0.6606686	total: 63.6ms	remaining: 31.7s
2:	learn: 0.6481672	total: 64.9ms	remaining: 21.6s
3:	learn: 0.6353348	total: 65.9ms	remaining: 16.4s
4:	learn: 0.6223143	total: 67ms	remaining: 13.3s
5:	learn: 0.6102068	total: 68ms	remaining: 11.3s
6:	learn: 0.6005361	total: 69.2ms	remaining: 9.81s
7:	learn: 0.5910754	total: 70.4ms	remaining: 8.73s
8:	learn: 0.5791614	total: 71.8ms	remaining: 7.9s
9:	learn: 0.5688276	total: 72.8ms	remaining: 7.21s
10:	learn: 0.5591023	total: 73.9ms	remaining: 6.64s
11:	learn: 0.5463402	total: 74.9ms	remaining: 6.17s
12:	learn: 0.5381743	total: 76ms	remaining: 5.77s
13:	learn: 0.5285806	total: 77ms	remaining: 5.42s
14:	learn: 0.5166018	total: 78.7ms	remaining: 5.17s
15:	learn: 0.5082589	total: 80.8ms	remaining: 4.97s
16:	learn: 0.5003746	total: 81.8ms	remaining: 4.73s
17:	learn: 0.4929846	total: 82.9ms	remaining: 4.52s
18:	learn: 0.4847644	total: 84.1ms	remaining: 4.34s
1

<catboost.core.CatBoostClassifier at 0x126061190>

In [8]:
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [9]:
shap_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top N features
top_features = shap_df['feature'].head(20).tolist()

In [10]:
top_features

['Menstrual_Irregularity',
 'Testosterone_Levelng_dL',
 'BMI',
 'Follicle_No_R',
 'Antral_Follicle_Count',
 'Weight_gainY_N',
 'hair_growthY_N',
 'Skin_darkening_Y_N',
 'Weight_Kg',
 'Follicle_No_L',
 'CycleR_I',
 'PimplesY_N',
 'Fast_food_Y_N',
 'AMHng_mL',
 'Hipinch',
 'Marraige_Status_Yrs',
 'Hair_lossY_N',
 'Cycle_lengthdays',
 'FSH_LH',
 'Waistinch']

In [11]:
X = X[top_features]
X

Unnamed: 0,Menstrual_Irregularity,Testosterone_Levelng_dL,BMI,Follicle_No_R,Antral_Follicle_Count,Weight_gainY_N,hair_growthY_N,Skin_darkening_Y_N,Weight_Kg,Follicle_No_L,CycleR_I,PimplesY_N,Fast_food_Y_N,AMHng_mL,Hipinch,Marraige_Status_Yrs,Hair_lossY_N,Cycle_lengthdays,FSH_LH,Waistinch
0,0.655728,0.920363,-1.339000,-1.122160,-0.416708,-1.084330,-0.948724,-0.986675,-1.585738,-1.004891,-0.959590,-1.402277,1.308102,-0.909884,-0.859673,-0.239798,-1.389525,0.113991,-0.112477,-1.506928
1,-0.745766,-0.683667,-0.159445,-0.495030,-0.551111,-1.084330,-0.948724,-0.986675,0.187593,-1.004891,-0.959590,-1.402277,-1.444199,-1.037782,-0.259584,0.633121,-1.389525,0.113991,-0.009416,-0.850297
2,1.122893,0.412065,-0.075192,2.640621,0.154507,-1.084330,-0.948724,-0.986675,0.517919,2.294784,-0.959590,1.415810,1.308102,0.170143,0.340505,0.414891,1.611117,0.113991,-0.006075,0.462966
3,0.655728,-0.414807,0.851601,-1.435725,-0.215102,-1.084330,-0.948724,-0.986675,0.187593,-1.334859,-0.959590,-1.402277,-1.444199,-1.111205,0.940594,-0.894487,-1.389525,0.113991,-0.080094,0.462966
4,-0.278601,-0.029272,-1.170492,-0.808595,-0.248703,-1.084330,-0.948724,-0.986675,-0.942471,-1.004891,-0.959590,-1.402277,-1.444199,-0.864883,-0.559629,-1.549176,1.611117,0.113991,-0.054393,-1.506928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.122893,1.824342,-1.528571,-0.369604,0.927328,-1.084330,-0.314182,0.198874,-1.498810,-0.674924,-0.959590,-1.402277,-0.893739,-0.140128,-2.119860,-0.065214,-0.789397,0.113991,-0.079272,-2.360549
1537,1.122893,-1.584603,0.683094,-0.181465,-1.760744,0.455719,-0.314182,-0.393901,0.196286,0.248985,-0.357134,-0.275042,-1.444199,-0.603877,0.460523,1.811562,-0.189268,-0.081187,-0.001192,0.594292
1538,-1.212931,-1.386762,0.556713,-0.369604,1.767350,-0.057631,-0.314182,-0.986675,0.568338,-0.146976,-0.357134,-0.838660,-0.343279,-0.148181,0.520532,0.720413,-0.189268,0.504345,-0.072589,0.462966
1539,-1.212931,1.819269,0.346078,-0.808595,-1.424735,1.482418,-0.314182,0.791648,0.648312,-0.344956,0.847778,-0.275042,1.308102,0.039876,0.820576,1.287811,-0.189268,0.113991,-0.127332,1.185260


In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [13]:
from catboost import CatBoostClassifier, Pool

def objective(trial):
    # Define hyperparameters for CatBoost
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "loss_function": "Logloss",
        "eval_metric": "Accuracy",
        "random_seed": 8,
        "verbose": 0
    }

    # Prepare data
    train_pool = Pool(x_train, y_train)
    valid_pool = Pool(x_test, y_test)

    # Train model
    model = CatBoostClassifier(**param)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

    # Predict
    preds = model.predict(valid_pool)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes, so lower is better


In [14]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 20:43:09,188] A new study created in memory with name: no-name-0c38b5a0-e054-4f13-ad18-35fcf25a02a8
[I 2025-05-31 20:43:09,273] Trial 0 finished with value: 0.02155172413793105 and parameters: {'learning_rate': 0.08168435819172402, 'iterations': 918, 'depth': 3, 'subsample': 0.9013844789476997, 'random_strength': 7.0249517814946385, 'l2_leaf_reg': 0.1360459564385178}. Best is trial 0 with value: 0.02155172413793105.
[I 2025-05-31 20:43:09,321] Trial 1 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.11111585110121822, 'iterations': 842, 'depth': 6, 'subsample': 0.8564463915690739, 'random_strength': 0.030145893568594512, 'l2_leaf_reg': 0.44148161304319045}. Best is trial 1 with value: 0.017241379310344862.
[I 2025-05-31 20:43:09,400] Trial 2 finished with value: 0.03879310344827591 and parameters: {'learning_rate': 0.011379904943425279, 'iterations': 270, 'depth': 5, 'subsample': 0.8224352228408949, 'random_strength': 1.4262493579539761, 'l2_l

Best hyperparameters:
{'learning_rate': 0.043115941730177125, 'iterations': 489, 'depth': 9, 'subsample': 0.7223937446139096, 'random_strength': 0.027179374750263465, 'l2_leaf_reg': 4.505015055425935}
Best accuracy:
0.9913793103448276


In [15]:
# Prepare data
train_pool = Pool(x_train, y_train)
valid_pool = Pool(x_test, y_test)

# Train model
best_model = CatBoostClassifier(**study.best_params)
best_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

# Predict
preds = best_model.predict(x_test)
preds = (preds > 0.5).astype(int)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")


0:	learn: 0.6292065	test: 0.6310801	best: 0.6310801 (0)	total: 2.38ms	remaining: 1.16s
1:	learn: 0.5754128	test: 0.5792030	best: 0.5792030 (1)	total: 4.8ms	remaining: 1.17s
2:	learn: 0.5240516	test: 0.5287177	best: 0.5287177 (2)	total: 7.02ms	remaining: 1.14s
3:	learn: 0.4830112	test: 0.4867675	best: 0.4867675 (3)	total: 9ms	remaining: 1.09s
4:	learn: 0.4443919	test: 0.4495225	best: 0.4495225 (4)	total: 11.4ms	remaining: 1.11s
5:	learn: 0.4120736	test: 0.4185478	best: 0.4185478 (5)	total: 13.6ms	remaining: 1.09s
6:	learn: 0.3832680	test: 0.3905043	best: 0.3905043 (6)	total: 16.4ms	remaining: 1.13s
7:	learn: 0.3583427	test: 0.3668894	best: 0.3668894 (7)	total: 19.1ms	remaining: 1.15s
8:	learn: 0.3334298	test: 0.3414980	best: 0.3414980 (8)	total: 21.4ms	remaining: 1.14s
9:	learn: 0.3140666	test: 0.3229625	best: 0.3229625 (9)	total: 23.6ms	remaining: 1.13s
10:	learn: 0.2950505	test: 0.3031906	best: 0.3031906 (10)	total: 26ms	remaining: 1.13s
11:	learn: 0.2747425	test: 0.2839826	best: 0.28