In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import catboost
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
import json

In [21]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [22]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [23]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [24]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [25]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [26]:
# Train the model
model = catboost.CatBoostClassifier()
model.fit(X, y)

Learning rate set to 0.012392
0:	learn: 0.6782997	total: 1.55ms	remaining: 1.55s
1:	learn: 0.6642667	total: 2.72ms	remaining: 1.35s
2:	learn: 0.6555403	total: 4.02ms	remaining: 1.34s
3:	learn: 0.6430619	total: 5.19ms	remaining: 1.29s
4:	learn: 0.6334263	total: 6.52ms	remaining: 1.3s
5:	learn: 0.6200819	total: 7.64ms	remaining: 1.26s
6:	learn: 0.6041721	total: 8.61ms	remaining: 1.22s
7:	learn: 0.5941366	total: 9.57ms	remaining: 1.19s
8:	learn: 0.5836668	total: 10.8ms	remaining: 1.19s
9:	learn: 0.5701192	total: 11.8ms	remaining: 1.17s
10:	learn: 0.5643182	total: 12.9ms	remaining: 1.16s
11:	learn: 0.5534326	total: 14.2ms	remaining: 1.17s
12:	learn: 0.5411247	total: 15.4ms	remaining: 1.17s
13:	learn: 0.5310471	total: 16.4ms	remaining: 1.16s
14:	learn: 0.5199010	total: 17.4ms	remaining: 1.14s
15:	learn: 0.5101823	total: 18.5ms	remaining: 1.14s
16:	learn: 0.5001552	total: 19.9ms	remaining: 1.15s
17:	learn: 0.4913120	total: 21.1ms	remaining: 1.15s
18:	learn: 0.4832253	total: 22.1ms	remaining:

<catboost.core.CatBoostClassifier at 0x12d225750>

In [27]:
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [28]:
shap_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top N features
top_features = shap_df['feature'].head(20).tolist()

In [29]:
top_features

['Menstrual_Irregularity',
 'BMI',
 'Testosterone_Levelng_dL',
 'Follicle_No_R',
 'Antral_Follicle_Count',
 'Weight_gainY_N',
 'hair_growthY_N',
 'Skin_darkening_Y_N',
 'Follicle_No_L',
 'Weight_Kg',
 'CycleR_I',
 'PimplesY_N',
 'AMHng_mL',
 'Fast_food_Y_N',
 'Hair_lossY_N',
 'FSH_LH',
 'Waistinch',
 'Marraige_Status_Yrs',
 'Hipinch',
 'Age']

In [30]:
X = X[top_features]
X

Unnamed: 0,Menstrual_Irregularity,BMI,Testosterone_Levelng_dL,Follicle_No_R,Antral_Follicle_Count,Weight_gainY_N,hair_growthY_N,Skin_darkening_Y_N,Follicle_No_L,Weight_Kg,CycleR_I,PimplesY_N,AMHng_mL,Fast_food_Y_N,Hair_lossY_N,FSH_LH,Waistinch,Marraige_Status_Yrs,Hipinch,Age
0,0.656240,-1.339000,0.921275,-1.122747,-0.416832,-1.087201,-0.947734,-0.984513,-1.006361,-1.586610,-0.959510,-1.407874,-0.907563,1.308170,-1.389520,-0.112483,-1.505357,-0.242600,-0.859414,-0.484837
1,-0.744405,-0.159445,-0.683298,-0.496493,-0.551226,-1.087201,-0.947734,-0.984513,-1.006361,0.187638,-0.959510,-1.407874,-1.035512,-1.443523,-1.389520,-0.009413,-0.849336,0.629853,-0.259587,0.577375
2,1.123121,-0.075192,0.412805,2.634780,0.154344,-1.087201,-0.947734,-0.984513,2.292235,0.518135,-0.959510,1.411899,0.172895,1.308170,1.611953,-0.006072,0.462705,0.411740,0.340239,0.179046
3,0.656240,0.851601,-0.414347,-1.435875,-0.215240,-1.087201,-0.947734,-0.984513,-1.336220,0.187638,-0.959510,-1.407874,-1.108964,-1.443523,-1.389520,-0.080097,0.462705,-0.896939,0.940065,0.710152
4,-0.277523,-1.170492,-0.028681,-0.809620,-0.248839,-1.087201,-0.947734,-0.984513,-1.006361,-0.943010,-0.959510,-1.407874,-0.862544,-1.443523,1.611953,-0.054394,-1.505357,-1.551279,-0.559501,-0.883166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.123121,-1.528571,1.825559,-0.371242,0.927112,-1.087201,-0.313855,0.199823,-0.676501,-1.499637,-0.959510,-1.407874,-0.137500,-0.893184,-0.789226,-0.079275,-2.358184,-0.068109,-2.119049,0.311822
1537,1.123121,0.683094,-1.584537,-0.183365,-1.760775,0.454084,-0.313855,-0.392345,0.247106,0.196335,-0.357595,-0.279964,-0.601433,-1.443523,-0.188931,-0.001188,0.593909,1.807664,0.460204,1.772363
1538,-1.211286,0.556713,-1.386630,-0.371242,1.767076,-0.059678,-0.313855,-0.984513,-0.148726,0.568580,-0.357595,-0.843919,-0.145556,-0.342846,-0.188931,-0.072592,0.462705,0.717098,0.520187,0.710152
1539,-1.211286,0.346078,1.820485,-0.809620,-1.424789,1.481607,-0.313855,0.791991,-0.346641,0.648595,0.846236,-0.279964,0.042576,1.308170,-0.188931,-0.127340,1.184328,1.284193,0.820100,1.241257


In [31]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [32]:
from catboost import CatBoostClassifier, Pool

def objective(trial):
    # Define hyperparameters for CatBoost
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "loss_function": "Logloss",
        "eval_metric": "Accuracy",
        "random_seed": 8,
        "verbose": 0
    }

    # Prepare data
    train_pool = Pool(x_train, y_train)
    valid_pool = Pool(x_test, y_test)

    # Train model
    model = CatBoostClassifier(**param)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

    # Predict
    preds = model.predict(valid_pool)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes, so lower is better


In [33]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-22 00:12:39,700] A new study created in memory with name: no-name-6a4aaf5a-0c12-4a78-a2fa-edd46b3da6f3
[I 2025-05-22 00:12:39,865] Trial 0 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.034663270855011113, 'iterations': 854, 'depth': 8, 'subsample': 0.9698667238726799, 'random_strength': 3.7992134302877854, 'l2_leaf_reg': 0.6635280400150109}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-22 00:12:39,907] Trial 1 finished with value: 0.025862068965517238 and parameters: {'learning_rate': 0.021819252911914964, 'iterations': 508, 'depth': 4, 'subsample': 0.9976410308324944, 'random_strength': 0.1364135622506828, 'l2_leaf_reg': 0.006807522221115256}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-22 00:12:40,156] Trial 2 finished with value: 0.02155172413793105 and parameters: {'learning_rate': 0.018177211312789238, 'iterations': 777, 'depth': 11, 'subsample': 0.7172643578926948, 'random_strength': 0.0013843125680529553

Best hyperparameters:
{'learning_rate': 0.0831973703932451, 'iterations': 156, 'depth': 10, 'subsample': 0.6007574209794214, 'random_strength': 0.034941320713799504, 'l2_leaf_reg': 0.008594256619767439}
Best accuracy:
0.9956896551724138


In [34]:
# Saving and testing the best model
best_model = catboost.CatBoostClassifier(**study.best_params)
best_model.fit(x_train, y_train, verbose=0)

<catboost.core.CatBoostClassifier at 0x11c64d4d0>

In [35]:
# Predict
preds = best_model.predict(x_test)
acc = accuracy_score(y_test, preds)
print(acc)

0.9870689655172413


In [36]:
# Save
best_model.save_model("best_model")

In [44]:
# Loading and testing
new_model = catboost.CatBoostClassifier()
new_model.load_model("best_model")
preds = new_model.predict(x_test)
acc = accuracy_score(y_test, preds)
print(acc)

0.9870689655172413


In [45]:
selected_features = {"selected_features" : top_features}
with open("selected_features.json", "w") as f:
    json.dump(selected_features, f)