In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("../PCOS_data.csv")
new = pd.read_csv("../pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
# Train the model
model = lgb.LGBMClassifier()
model.fit(X, y)

[LightGBM] [Info] Number of positive: 376, number of negative: 1165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4644
[LightGBM] [Info] Number of data points in the train set: 1541, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243997 -> initscore=-1.130887
[LightGBM] [Info] Start training from score -1.130887


In [8]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)



In [9]:
shap_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top N features
top_features = shap_df['feature'].head(20).tolist()

In [10]:
top_features

['Menstrual_Irregularity',
 'Testosterone_Levelng_dL',
 'BMI',
 'Antral_Follicle_Count',
 'Follicle_No_R',
 'Weight_gainY_N',
 'hair_growthY_N',
 'Skin_darkening_Y_N',
 'Follicle_No_L',
 'CycleR_I',
 'AMHng_mL',
 'PimplesY_N',
 'Weight_Kg',
 'Waistinch',
 'Fast_food_Y_N',
 'Avg_F_size_L_mm',
 'FSHmIU_mL',
 'PRLng_mL',
 'RBSmg_dl',
 'Hipinch']

In [11]:
X = X[top_features]
X

Unnamed: 0,Menstrual_Irregularity,Testosterone_Levelng_dL,BMI,Antral_Follicle_Count,Follicle_No_R,Weight_gainY_N,hair_growthY_N,Skin_darkening_Y_N,Follicle_No_L,CycleR_I,AMHng_mL,PimplesY_N,Weight_Kg,Waistinch,Fast_food_Y_N,Avg_F_size_L_mm,FSHmIU_mL,PRLng_mL,RBSmg_dl,Hipinch
0,0.656317,0.921543,-1.339000,-0.416229,-1.125957,-1.086229,-0.946673,-0.985749,-1.007449,-0.958462,-0.907123,-1.401700,-1.587199,-1.506695,1.304546,1.252232,-0.019464,2.150797,-0.557131,-0.860948
1,-0.745139,-0.682661,-0.159445,-0.550597,-0.499205,-1.086229,-0.946673,-0.985749,-1.007449,-0.958462,-1.035045,-1.401700,0.187214,-0.849850,-1.445529,0.057427,-0.028435,-0.316955,-0.557131,-0.260730
2,1.123469,0.413190,-0.075192,0.154838,2.634553,-1.086229,-0.946673,-0.985749,2.290786,-0.958462,0.173112,1.415227,0.517742,0.463841,1.304546,1.252232,-0.037185,-1.258973,-1.058309,0.339487
3,0.656317,-0.413772,0.851601,-0.214676,-1.439332,-1.086229,-0.946673,-0.985749,-1.337273,-0.958462,-1.108482,-1.401700,0.187214,0.463841,-1.445529,0.057427,-0.018656,1.337729,-1.559486,0.939704
4,-0.277987,-0.028194,-1.170492,-0.248268,-0.812581,-1.086229,-0.946673,-0.985749,-1.007449,-0.958462,-0.862113,-1.401700,-0.943539,-1.506695,-1.445529,0.455696,-0.048656,0.667390,-1.058309,-0.560839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.123469,1.825621,-1.528571,0.927457,-0.373855,-1.086229,-0.312126,0.198225,-0.677626,-0.958462,-0.137218,-1.401700,-1.500218,-2.360594,-0.895514,0.814137,-0.034332,0.569743,0.069340,-2.121404
1537,1.123469,-1.583694,0.683094,-1.759915,-0.185829,0.455096,-0.312126,-0.393762,0.245880,-0.356714,-0.601056,-0.274929,0.195912,0.595210,-1.445529,0.216735,-0.047817,-0.987687,-0.068483,0.459530
1538,-1.212292,-1.385832,0.556713,1.767261,-0.373855,-0.058679,-0.312126,-0.985749,-0.149908,-0.356714,-0.145273,-0.838314,0.568191,0.463841,-0.345499,0.455696,-0.046126,-0.542173,0.846165,0.519552
1539,-1.212292,1.820547,0.346078,-1.423993,-0.812581,1.482646,-0.312126,0.790213,-0.347802,0.846781,0.042821,-0.274929,0.648214,1.186371,1.304546,0.137081,-0.038744,-0.175997,-0.632308,0.819661


In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [13]:
def objective(trial):
    # Suggest hyperparameters to try
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "objective": "binary", # or "multiclass" if you're doing that
        "verbose": -1,
    }


    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_test, label=y_test)

    gbm = lgb.train(param,
                    train_data,
                    valid_sets=[valid_data],
                    num_boost_round=1000)

    y_pred = gbm.predict(x_test)
    y_pred_labels = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_labels)

    return 1.0 - acc  # Optuna minimizes, so 1 - accuracy


In [14]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 20:35:13,784] A new study created in memory with name: no-name-45a04dec-9e46-4857-8078-2f0323497a0c
[I 2025-05-31 20:35:14,773] Trial 0 finished with value: 0.030172413793103425 and parameters: {'learning_rate': 0.041552175074827495, 'n_estimators': 866, 'max_depth': 5, 'num_leaves': 145, 'min_child_samples': 48, 'subsample': 0.9274285161900622, 'colsample_bytree': 0.9077213215484643, 'reg_alpha': 0.0067849853837518475, 'reg_lambda': 5.992783238052487e-07}. Best is trial 0 with value: 0.030172413793103425.
[I 2025-05-31 20:35:16,300] Trial 1 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.0323933202509244, 'n_estimators': 484, 'max_depth': 9, 'num_leaves': 58, 'min_child_samples': 17, 'subsample': 0.8420260941226427, 'colsample_bytree': 0.7295044386589632, 'reg_alpha': 2.346905347166588e-08, 'reg_lambda': 1.728237413069774e-05}. Best is trial 1 with value: 0.017241379310344862.
[I 2025-05-31 20:35:16,737] Trial 2 finished with value: 0.021551

Best hyperparameters:
{'learning_rate': 0.022811694837696874, 'n_estimators': 615, 'max_depth': 9, 'num_leaves': 63, 'min_child_samples': 27, 'subsample': 0.6577301748062336, 'colsample_bytree': 0.9365511446437059, 'reg_alpha': 0.04912142817831978, 'reg_lambda': 1.14588594457898e-05}
Best accuracy:
0.9870689655172413


In [20]:
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_test, label=y_test)

best_model = lgb.train(study.best_params,
                train_data,
                valid_sets=[valid_data],
                num_boost_round=1000)

# Predict
preds = best_model.predict(x_test)
preds = (preds > 0.5).astype(int)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")



Final evaluation on test set:
Accuracy: 96.98
Precision: 93.10
Recall: 94.74
F1 Score: 93.91
