In [137]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib

In [138]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [139]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [140]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [141]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

X.to_pickle('unscaled_X.pkl')

In [149]:
# X[y==1][selected_features["selected_features"]]

Unnamed: 0,PimplesY_N,PRLng_mL,Vit_D3_ng_mL,Cycle_lengthdays,CycleR_I,BMI,RBSmg_dl,Weight_gainY_N,Avg_F_size_L_mm,Weight_Kg,hair_growthY_N,I___beta_HCGmIU_mL,Avg_F_size_R_mm,FSH_LH,Skin_darkening_Y_N,LHmIU_mL,TSH_mIU_L,Antral_Follicle_Count,WaistHip_Ratio,Follicle_No_R,BP__Systolic_mmHg,Marraige_Status_Yrs,FSHmIU_mL,Testosterone_Levelng_dL,AMHng_mL,Menstrual_Irregularity,Follicle_No_L,Endometrium_mm,Hipinch
2,1.0,10.520,49.700,5.0,2.0,25.3,84.0,0.0,18.0,68.80,0.0,494.080,20.00,6.300,0.0,0.8800,2.540,18.4,0.900,15.0,120.0,10.0,5.540,67.86,6.630,1.0,13.0,10.00,40.0
12,1.0,22.430,31.400,2.0,4.0,32.0,125.0,1.0,20.0,74.00,1.0,1214.230,21.00,1.320,1.0,1.5100,6.510,15.2,0.890,8.0,120.0,7.0,2.000,66.10,7.940,0.4,15.0,8.00,45.0
19,1.0,19.130,28.000,7.0,4.0,31.2,100.0,0.0,18.0,85.00,1.0,23.580,17.00,2.330,1.0,0.8100,2.870,15.8,0.950,8.0,120.0,7.0,1.890,64.34,2.070,0.4,16.0,11.00,44.0
24,1.0,11.460,21.500,5.0,2.0,25.2,100.0,0.0,18.0,63.00,1.0,610.630,17.00,6.000,0.0,0.8900,0.650,14.6,0.840,6.0,120.0,12.0,5.340,52.42,1.890,0.8,4.0,7.30,38.0
26,1.0,17.980,25.140,3.0,4.0,29.7,100.0,1.0,11.0,76.00,1.0,1.990,12.00,2.220,1.0,2.7800,4.280,16.4,0.840,20.0,120.0,5.0,6.180,66.24,3.840,0.4,21.0,6.80,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516,0.4,10.944,28.920,4.6,2.4,28.4,101.4,0.6,15.0,67.94,0.0,123.998,13.60,5.182,0.2,3.1300,2.110,16.0,0.894,4.2,118.0,13.2,5.022,68.30,3.022,1.0,4.8,6.64,41.2
1519,0.2,27.706,22.912,4.4,2.4,33.3,171.2,0.6,7.8,82.90,0.4,99.238,13.76,3.426,0.4,2.4120,2.518,20.0,0.888,7.0,114.0,12.4,7.766,96.10,4.064,1.0,5.8,8.62,42.6
1524,0.4,16.532,38.200,6.8,3.6,32.0,101.6,0.4,16.8,81.40,0.6,286.896,17.40,14.930,0.8,1.0044,1.438,18.0,0.872,7.8,118.0,10.0,3.384,71.90,11.354,1.0,9.8,8.14,41.6
1529,0.6,20.580,32.250,5.0,2.0,25.7,97.4,0.6,15.6,63.00,0.4,182.794,14.40,3.152,0.4,2.9440,2.308,28.0,0.874,4.8,118.0,10.8,6.124,74.10,3.356,1.0,3.4,9.06,41.4


In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
joblib.dump(scaler, 'scaler.pkl')

In [None]:
from lightgbm import LGBMClassifier
model = LGBMClassifier()
rfecv = RFECV(estimator=model, step=1, cv=5, scoring='accuracy', n_jobs=-1)
rfecv.fit(X, y)
selected_mask = rfecv.support_
selected_features = X.columns[selected_mask]
X = X[selected_features]
print(selected_features, len(selected_features))

In [129]:
X

Unnamed: 0,PimplesY_N,PRLng_mL,Vit_D3_ng_mL,Cycle_lengthdays,CycleR_I,BMI,RBSmg_dl,Weight_gainY_N,Avg_F_size_L_mm,Weight_Kg,hair_growthY_N,I___beta_HCGmIU_mL,Avg_F_size_R_mm,FSH_LH,Skin_darkening_Y_N,LHmIU_mL,TSH_mIU_L,Antral_Follicle_Count,WaistHip_Ratio,Follicle_No_R,BP__Systolic_mmHg,Marraige_Status_Yrs,FSHmIU_mL,Testosterone_Levelng_dL,AMHng_mL,Menstrual_Irregularity,Follicle_No_L,Endometrium_mm,Hipinch
0,-1.399388,2.155471,-0.110646,0.114163,-0.960521,-1.339000,-0.557356,-1.084586,1.248061,-1.585880,-0.946011,-0.287763,1.180819,-0.112551,-0.986565,-0.021274,-0.913143,-0.416698,-1.864279,-1.122485,-0.942889,-0.240167,-0.019515,0.920724,-0.908231,0.655466,-1.005084,0.027633,-0.859636
1,-1.399388,-0.316869,0.088920,0.114163,-0.960521,-0.159445,-0.557356,-1.084586,0.055459,0.188663,-0.946011,-0.263601,-0.537339,-0.009492,-0.986565,-0.068956,0.033082,-0.551112,-1.556469,-0.495969,0.965828,0.632036,-0.028486,-0.683586,-1.036085,-0.744820,-1.005084,-3.162981,-0.259464
2,1.415830,-1.260638,0.036545,0.114163,-0.960521,-0.075192,-1.060306,-1.084586,1.248061,0.519215,-0.946011,-0.085585,2.039898,-0.006151,-0.986565,-0.072822,-0.203474,0.154564,0.290392,2.636615,0.965828,0.413985,-0.037236,0.412338,0.171418,1.122228,2.296491,1.024700,0.340707
3,-1.399388,1.340891,-0.037051,0.114163,-0.960521,0.851601,-1.563256,-1.084586,0.055459,0.188663,-0.946011,-0.287763,-0.537339,-0.080168,-0.986565,-0.045575,5.088521,-0.215076,-0.940849,-1.435744,0.965828,-0.894320,-0.018706,-0.414679,-1.109482,0.655466,-1.335242,-0.637078,0.940879
4,-1.399388,0.669306,0.009906,0.114163,-0.960521,-1.170492,-1.060306,-1.084586,0.452993,-0.942173,-0.946011,0.040700,-0.537339,-0.054468,-0.986565,-0.072454,0.189515,-0.248680,-2.479900,-0.809227,0.965828,-1.548473,-0.048706,-0.029076,-0.863246,-0.278058,-1.005084,-0.969434,-0.559550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,-1.399388,0.571478,-0.065315,0.114163,-0.960521,-1.528571,0.071332,-1.084586,0.810773,-1.498893,-0.312459,-0.209596,0.321740,-0.079346,0.197006,-0.046533,0.018584,0.927447,-0.756162,-0.370665,-0.561145,-0.065727,-0.034383,1.824861,-0.138744,1.122228,-0.674927,0.147281,-2.119996
1537,-0.273301,-0.988848,-0.063148,-0.080876,-0.358706,0.683094,-0.066979,0.457721,0.214472,0.197361,-0.312459,1.816304,-0.021892,-0.001268,-0.394779,-0.054964,-0.223314,-1.760842,0.413516,-0.182710,-0.179402,1.809511,-0.047868,-1.584678,-0.602331,1.122228,0.249514,-0.902963,0.460742
1538,-0.836344,-0.542505,-0.046370,0.504239,-0.358706,0.556713,0.850904,-0.056381,0.452993,0.569668,-0.312459,-0.257178,0.149924,-0.072664,-0.986565,-0.064059,0.123889,1.767537,-0.140542,-0.370665,0.202341,0.719257,-0.046177,-1.386803,-0.146794,-1.211582,-0.146675,0.107399,0.520759
1539,-0.273301,-0.175648,-0.070553,0.114163,0.844922,0.346078,-0.632798,1.485926,0.134966,0.649696,-0.312459,-0.287589,-0.279616,-0.127406,0.788791,-0.022269,-0.430111,-1.424806,0.782889,-0.809227,1.347571,1.286189,-0.038794,1.819787,0.041197,-1.211582,-0.344769,-0.052132,0.820844


In [116]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [117]:
d_train = lgb.Dataset(x_train, label=y_train)

In [118]:
params = {'learning_rate': 0.3,
              'application': 'binary',
              'num_leaves': 30,
              'verbosity': -1,
              'metric': 'binary_error',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}
# params = {
#     'learning_rate': 0.5,
#     'application': 'binary',
#     'num_boost_round': 100,
#     'nfold': 10,
#     'num_leaves': 31,
#     'verbosity': -1,
#     'metric': 'binary_error',
#     'data_random_seed': 2,
#     'bagging_fraction': 0.8,
#     'feature_fraction': 0.6,
#     'nthread': 4,
#     'lambda_l1': 1,
#     'lambda_l2': 1,
# }

In [119]:
lgbcv = lgb.cv(params, train_set=d_train, nfold=10, stratified=True)


In [120]:
min_error = min(lgbcv['valid binary_error-mean'])

# Accuracy is (1 - error)
accuracy = 1 - min_error
print(f"Cross-validated accuracy: {accuracy:.4f}")


Cross-validated accuracy: 0.9565


In [121]:
# def objective(trial):
#     # Suggest hyperparameters to try
#     param = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 150),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
#         "random_state": 42,
#         "objective": "binary",  # or "multiclass" if you're doing that
#     }

#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
#     acc_scores = []

#     for train_idx, valid_idx in skf.split(X, y):
#         X_train_fold = X.iloc[train_idx]
#         X_valid_fold = X.iloc[valid_idx]
#         y_train_fold = y.iloc[train_idx]
#         y_valid_fold = y.iloc[valid_idx]


#         train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
#         valid_data = lgb.Dataset(X_valid_fold, label=y_valid_fold)

#         gbm = lgb.train(param,
#                         train_data,
#                         valid_sets=[valid_data],
#                         num_boost_round=1000)

#         y_pred = gbm.predict(X_valid_fold)
#         y_pred_labels = (y_pred > 0.5).astype(int)
#         acc = accuracy_score(y_valid_fold, y_pred_labels)
#         acc_scores.append(acc)

#     return 1.0 - np.mean(acc_scores)  # Optuna minimizes, so 1 - accuracy

def objective(trial):
    # Suggest hyperparameters to try
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "objective": "binary",  # or "multiclass" if you're doing that
    }


    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_test, label=y_test)

    gbm = lgb.train(param,
                    train_data,
                    valid_sets=[valid_data],
                    num_boost_round=1000)

    y_pred = gbm.predict(x_test)
    y_pred_labels = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_labels)

    return 1.0 - acc  # Optuna minimizes, so 1 - accuracy


In [None]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

In [123]:
# Making the best model

#Taking the best params for training
best_params = study.best_params

#Training and testing
train_data = lgb.Dataset(x_train, label=y_train)
best_model = lgb.train(best_params,
                train_data,
                num_boost_round=1000)

y_pred = best_model.predict(x_test)
y_pred_labels = (y_pred > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred_labels)
print(acc)

0.978448275862069


In [124]:
# Saving the best model

best_model.save_model('best_model.txt', num_iteration=best_model.best_iteration) 

<lightgbm.basic.Booster at 0x11a17bb90>

In [125]:
# To read the model
new_model = lgb.Booster(model_file='best_model.txt')

# Testing
y_pred = new_model.predict(x_test)
y_pred_labels = (y_pred > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred_labels)
print(acc)

0.978448275862069


In [126]:
# Saving selected features
selected_features = selected_features.tolist()
selected_features = { "selected_features" : selected_features}

In [127]:
import json

with open("selected_features.json", "w") as f:
    json.dump(selected_features, f)

In [128]:
with open("selected_features.json", "r") as f:
    data = json.load(f)
data

{'selected_features': ['PimplesY_N',
  'PRLng_mL',
  'Vit_D3_ng_mL',
  'Cycle_lengthdays',
  'CycleR_I',
  'BMI',
  'RBSmg_dl',
  'Weight_gainY_N',
  'Avg_F_size_L_mm',
  'Weight_Kg',
  'hair_growthY_N',
  'I___beta_HCGmIU_mL',
  'Avg_F_size_R_mm',
  'FSH_LH',
  'Skin_darkening_Y_N',
  'LHmIU_mL',
  'TSH_mIU_L',
  'Antral_Follicle_Count',
  'WaistHip_Ratio',
  'Follicle_No_R',
  'BP__Systolic_mmHg',
  'Marraige_Status_Yrs',
  'FSHmIU_mL',
  'Testosterone_Levelng_dL',
  'AMHng_mL',
  'Menstrual_Irregularity',
  'Follicle_No_L',
  'Endometrium_mm',
  'Hipinch']}


[LightGBM] [Info] Start training from score -1.130223
[LightGBM] [Info] Number of positive: 301, number of negative: 932
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2818
[LightGBM] [Info] Number of data points in the train set: 1233, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244120 -> initscore=-1.130223
[LightGBM] [Info] Start training from score -1.130223
[LightGBM] [Info] Number of positive: 301, number of negative: 932
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2563
[LightGBM] [Info] Number of data points in the train set: 1233, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244120 -> initscore=-1.130223
[LightGBM] [Info] Sta