In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
from mrmr import mrmr_classif
import lightgbm as lgb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
selected_features = mrmr_classif(X=X, y=y, K=20)
print("Selected features:", selected_features)

100%|███████████████████████████████████████████████| 20/20 [00:00<00:00, 72.41it/s]

Selected features: ['Follicle_No_R', 'Menstrual_Irregularity', 'Weight_gainY_N', 'Antral_Follicle_Count', 'Testosterone_Levelng_dL', 'Follicle_No_L', 'Skin_darkening_Y_N', 'hair_growthY_N', 'CycleR_I', 'BMI', 'Fast_food_Y_N', 'AMHng_mL', 'Weight_Kg', 'PimplesY_N', 'Hipinch', 'Cycle_lengthdays', 'Waistinch', 'Hair_lossY_N', 'Avg_F_size_L_mm', 'Vit_D3_ng_mL']





In [8]:
X = X[selected_features]
X

Unnamed: 0,Follicle_No_R,Menstrual_Irregularity,Weight_gainY_N,Antral_Follicle_Count,Testosterone_Levelng_dL,Follicle_No_L,Skin_darkening_Y_N,hair_growthY_N,CycleR_I,BMI,Fast_food_Y_N,AMHng_mL,Weight_Kg,PimplesY_N,Hipinch,Cycle_lengthdays,Waistinch,Hair_lossY_N,Avg_F_size_L_mm,Vit_D3_ng_mL
0,-1.124608,0.656224,-1.084736,-0.416146,0.921038,-1.005429,-0.983198,-0.946511,-0.956593,-1.339000,1.306424,-0.907314,-1.585562,-1.396937,-0.863180,0.116043,-1.508488,-1.387206,1.250808,-0.103055
1,-0.497701,-0.745034,-1.084736,-0.550565,-0.683854,-1.005429,-0.983198,-0.946511,-0.956593,-0.159445,-1.443849,-1.035030,0.187412,-1.396937,-0.262819,0.116043,-0.851697,-1.387206,0.057800,0.102266
2,2.636831,1.123310,-1.084736,0.155136,0.412467,2.296857,-0.983198,-0.946511,-0.956593,-0.075192,1.306424,0.171173,0.517672,1.417759,0.337543,0.116043,0.461885,1.619383,1.250808,0.048381
3,-1.438061,0.656224,-1.084736,-0.214517,-0.414849,-1.335658,-0.983198,-0.946511,-0.956593,0.851601,-1.443849,-1.108348,0.187412,-1.396937,0.937904,0.116043,0.461885,-1.387206,0.057800,-0.027337
4,-0.811154,-0.277948,-1.084736,-0.248122,-0.029107,-1.005429,-0.983198,-0.946511,-0.956593,-1.170492,-1.443849,-0.862377,-0.942424,-1.396937,-0.563000,0.116043,-1.508488,1.619383,0.455469,0.020974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,-0.372320,1.123310,-1.084736,0.928047,1.825502,-0.675201,0.200018,-0.311797,-0.956593,-1.528571,-0.893794,-0.138656,-1.498652,-1.396937,-2.123940,0.116043,-2.362316,-0.785888,0.813372,-0.056416
1537,-0.184248,1.123310,0.456836,-1.760339,-1.585273,0.249439,-0.391590,-0.311797,-0.355038,0.683094,-1.443849,-0.601743,0.196103,-0.271058,0.457615,-0.078752,0.593243,-0.184571,0.216868,-0.054186
1538,-0.372320,-1.212120,-0.057021,1.768168,-1.387327,-0.146835,-0.983198,-0.311797,-0.355038,0.556713,-0.343740,-0.146697,0.568080,-0.833998,0.517651,0.505634,0.461885,-0.184571,0.455469,-0.036925
1539,-0.811154,-1.212120,1.484552,-1.424291,1.820427,-0.344972,0.791626,-0.311797,0.848071,0.346078,1.306424,0.041092,0.648038,-0.271058,0.817832,0.116043,1.184355,-0.184571,0.137334,-0.061805


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [10]:
def objective(trial):
    # Suggest hyperparameters to try
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "objective": "binary",  # or "multiclass" if you're doing that
        "verbose": -1,
    }


    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_test, label=y_test)

    gbm = lgb.train(param,
                    train_data,
                    valid_sets=[valid_data],
                    num_boost_round=1000)

    y_pred = gbm.predict(x_test)
    y_pred_labels = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_labels)

    return 1.0 - acc  # Optuna minimizes, so 1 - accuracy


In [11]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 21:23:37,376] A new study created in memory with name: no-name-5431824d-9066-448b-bce4-1edb1c6cc319
[I 2025-05-31 21:23:37,927] Trial 0 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.13816464222368208, 'n_estimators': 628, 'max_depth': 4, 'num_leaves': 109, 'min_child_samples': 25, 'subsample': 0.8411138872081563, 'colsample_bytree': 0.6355372090058654, 'reg_alpha': 3.923545240670943e-06, 'reg_lambda': 7.638356847638994e-08}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-31 21:23:38,720] Trial 1 finished with value: 0.02155172413793105 and parameters: {'learning_rate': 0.1352278984770792, 'n_estimators': 801, 'max_depth': 6, 'num_leaves': 35, 'min_child_samples': 15, 'subsample': 0.9596935653248992, 'colsample_bytree': 0.6769661814416909, 'reg_alpha': 1.8045190531118516e-06, 'reg_lambda': 0.00039882924076273273}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-31 21:23:39,921] Trial 2 finished with value: 0.017241

Best hyperparameters:
{'learning_rate': 0.040819576160372684, 'n_estimators': 765, 'max_depth': 7, 'num_leaves': 25, 'min_child_samples': 17, 'subsample': 0.7734902907899357, 'colsample_bytree': 0.960453923566604, 'reg_alpha': 2.3506851711528933e-07, 'reg_lambda': 2.0561415340444973e-08}
Best accuracy:
0.9913793103448276


In [12]:
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_test, label=y_test)

best_model = lgb.train(study.best_params,
                train_data,
                valid_sets=[valid_data],
                num_boost_round=1000)

# Predict
preds = best_model.predict(x_test)
preds = (preds > 0.5).astype(int)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")



Final evaluation on test set:
Accuracy: 96.12
Precision: 91.38
Recall: 92.98
F1 Score: 92.17
