In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import catboost
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib

In [72]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [73]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [74]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [75]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [76]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [77]:
# Train the model
model = catboost.CatBoostClassifier()
model.fit(X, y)

Learning rate set to 0.012392
0:	learn: 0.6789421	total: 2.26ms	remaining: 2.26s
1:	learn: 0.6644098	total: 3.43ms	remaining: 1.71s
2:	learn: 0.6520238	total: 7.39ms	remaining: 2.46s
3:	learn: 0.6374382	total: 8.42ms	remaining: 2.1s
4:	learn: 0.6238521	total: 9.49ms	remaining: 1.89s
5:	learn: 0.6119689	total: 10.5ms	remaining: 1.74s
6:	learn: 0.6017419	total: 11.5ms	remaining: 1.63s
7:	learn: 0.5910255	total: 12.5ms	remaining: 1.55s
8:	learn: 0.5800846	total: 14.1ms	remaining: 1.55s
9:	learn: 0.5703951	total: 15.6ms	remaining: 1.54s
10:	learn: 0.5602509	total: 16.9ms	remaining: 1.52s
11:	learn: 0.5492662	total: 17.9ms	remaining: 1.48s
12:	learn: 0.5393912	total: 19.3ms	remaining: 1.46s
13:	learn: 0.5305395	total: 20.4ms	remaining: 1.44s
14:	learn: 0.5192170	total: 21.8ms	remaining: 1.43s
15:	learn: 0.5113292	total: 22.9ms	remaining: 1.41s
16:	learn: 0.5055770	total: 24.1ms	remaining: 1.39s
17:	learn: 0.4966883	total: 25.1ms	remaining: 1.37s
18:	learn: 0.4873485	total: 26.2ms	remaining:

<catboost.core.CatBoostClassifier at 0x164271e10>

In [78]:
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [79]:
shap_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top N features
top_features = shap_df['feature'].head(20).tolist()

In [80]:
top_features

['Menstrual_Irregularity',
 'Testosterone_Levelng_dL',
 'BMI',
 'Follicle_No_R',
 'Antral_Follicle_Count',
 'Weight_gainY_N',
 'hair_growthY_N',
 'Skin_darkening_Y_N',
 'Weight_Kg',
 'Follicle_No_L',
 'CycleR_I',
 'PimplesY_N',
 'AMHng_mL',
 'Fast_food_Y_N',
 'Hair_lossY_N',
 'Hipinch',
 'FSH_LH',
 'Waistinch',
 'Marraige_Status_Yrs',
 'Cycle_lengthdays']

In [81]:
X = X[top_features]
X

Unnamed: 0,Menstrual_Irregularity,Testosterone_Levelng_dL,BMI,Follicle_No_R,Antral_Follicle_Count,Weight_gainY_N,hair_growthY_N,Skin_darkening_Y_N,Weight_Kg,Follicle_No_L,CycleR_I,PimplesY_N,AMHng_mL,Fast_food_Y_N,Hair_lossY_N,Hipinch,FSH_LH,Waistinch,Marraige_Status_Yrs,Cycle_lengthdays
0,0.657030,0.920432,-1.339000,-1.120800,-0.417170,-1.086974,-0.944217,-0.985214,-1.586398,-1.004119,-0.958899,-1.401191,-0.906392,1.310436,-1.389174,-0.861830,-0.112685,-1.507372,-0.241374,0.112101
1,-0.743361,-0.683604,-0.159445,-0.494612,-0.551645,-1.086974,-0.944217,-0.985214,0.187281,-1.004119,-0.958899,-1.401191,-1.034239,-1.443767,-1.389174,-0.261568,-0.009625,-0.850751,0.630026,0.112101
2,1.123826,0.412132,-0.075192,2.636330,0.154350,-1.086974,-0.944217,-0.985214,0.517672,2.292880,-0.958899,1.416919,0.173206,1.310436,1.616606,0.338694,-0.006284,0.462489,0.412176,0.112101
3,0.657030,-0.414743,0.851601,-1.433894,-0.215457,-1.086974,-0.944217,-0.985214,0.187281,-1.333819,-0.958899,-1.401191,-1.107633,-1.443767,-1.389174,0.938956,-0.080302,0.462489,-0.894924,0.112101
4,-0.276564,-0.029206,-1.170492,-0.807706,-0.249076,-1.086974,-0.944217,-0.985214,-0.943005,-1.004119,-0.958899,-1.401191,-0.861409,-1.443767,1.616606,-0.561699,-0.054601,-1.507372,-1.548474,0.112101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.123826,1.824414,-1.528571,-0.369374,0.927582,-1.086974,-0.309660,0.200428,-1.499453,-0.674419,-0.958899,-1.401191,-0.136942,-0.892927,-0.788018,-2.122380,-0.079480,-2.360978,-0.067094,0.112101
1537,1.123826,-1.584542,0.683094,-0.181517,-1.761921,0.455408,-0.309660,-0.392393,0.195976,0.248741,-0.357489,-0.273947,-0.600506,-1.443767,-0.186862,0.458747,-0.001401,0.593813,1.806417,-0.083094
1538,-1.210158,-1.386701,0.556713,-0.369374,1.768051,-0.058719,-0.309660,-0.985214,0.568101,-0.146899,-0.357489,-0.837569,-0.144991,-0.342086,-0.186862,0.518773,-0.072797,0.462489,0.717166,0.502493
1539,-1.210158,1.819341,0.346078,-0.807706,-1.425733,1.483663,-0.309660,0.793249,0.648090,-0.344719,0.845330,-0.273947,0.042991,1.310436,-0.186862,0.818904,-0.127540,1.184771,1.283577,0.112101


In [82]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [83]:
from catboost import CatBoostClassifier, Pool

def objective(trial):
    # Define hyperparameters for CatBoost
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "loss_function": "Logloss",
        "eval_metric": "Accuracy",
        "random_seed": 8,
        "verbose": 0
    }

    # Prepare data
    train_pool = Pool(x_train, y_train)
    valid_pool = Pool(x_test, y_test)

    # Train model
    model = CatBoostClassifier(**param)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

    # Predict
    preds = model.predict(valid_pool)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes, so lower is better


In [84]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-17 19:15:43,321] A new study created in memory with name: no-name-1bb2f400-6699-4720-8772-d8b89ec94b52
[I 2025-05-17 19:15:43,786] Trial 0 finished with value: 0.012931034482758674 and parameters: {'learning_rate': 0.024900508722459816, 'iterations': 130, 'depth': 11, 'subsample': 0.7646935797348945, 'random_strength': 0.2730678298429257, 'l2_leaf_reg': 0.04261405376996456}. Best is trial 0 with value: 0.012931034482758674.
[I 2025-05-17 19:15:43,887] Trial 1 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.03628045562279542, 'iterations': 379, 'depth': 8, 'subsample': 0.6511925926157781, 'random_strength': 0.23371248416328713, 'l2_leaf_reg': 0.14982747503249957}. Best is trial 0 with value: 0.012931034482758674.
[I 2025-05-17 19:15:44,082] Trial 2 finished with value: 0.008620689655172376 and parameters: {'learning_rate': 0.022504679228859844, 'iterations': 738, 'depth': 10, 'subsample': 0.9084257711738178, 'random_strength': 0.19028884363650464

Best hyperparameters:
{'learning_rate': 0.03635543652507003, 'iterations': 140, 'depth': 11, 'subsample': 0.978407697098991, 'random_strength': 0.2576912768633996, 'l2_leaf_reg': 0.18123198816079075}
Best accuracy:
0.9956896551724138
