In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV

In [15]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

In [16]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [17]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [18]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
pd.set_option('display.max_columns', None)

In [19]:
from lightgbm import LGBMClassifier
model = LGBMClassifier()
rfecv = RFECV(estimator=model, step=1, cv=5, scoring='accuracy', n_jobs=-1)
rfecv.fit(X, y)
selected_mask = rfecv.support_
selected_features = X.columns[selected_mask]
X = X[selected_features]
print(selected_features, len(selected_features))


# # L1-penalized logistic regression with CV
# clf = LogisticRegressionCV(
#     penalty='l1',
#     solver='saga',  # saga supports L1
#     cv=StratifiedKFold(5),
#     scoring='accuracy',
#     max_iter=1000,
#     random_state=8
# )
# clf.fit(X, y)

# selected_mask = clf.coef_[0] != 0
# selected_features = np.array(X.columns)[selected_mask]
# X = X[selected_features]
# print(f"Selected {len(selected_features)} features", selected_features)


# from boruta import BorutaPy
# from lightgbm import LGBMClassifier

# # LightGBM wrapper (must look like sklearn)
# lgb_clf = LGBMClassifier(n_estimators=200,boosting_type='gbdt')

# # Boruta requires numpy arrays
# X_np = X.values if isinstance(X, pd.DataFrame) else X
# y_np = y.values if isinstance(y, pd.Series) else y

# boruta_selector = BorutaPy(
#     estimator=lgb_clf,
#     n_estimators=200,
#     verbose=1,
#     random_state=8
# )

# # Fit Boruta
# boruta_selector.fit(X, y)

# # Get selected features
# selected_mask = boruta_selector.support_
# selected_features = X.columns[selected_mask]
# print(f"Selected {len(selected_features)} features: {list(selected_features)}")

# # Reduced dataset
# X = X[selected_features]


Index(['PRLng_mL', 'Weight_gainY_N', 'Endometrium_mm',
       'Menstrual_Irregularity', 'TSH_mIU_L', 'hair_growthY_N',
       'Testosterone_Levelng_dL', 'AMHng_mL', 'Follicle_No_L', 'FSHmIU_mL',
       'FSH_LH', 'Follicle_No_R', 'BMI', 'Avg_F_size_R_mm',
       'Antral_Follicle_Count', 'LHmIU_mL'],
      dtype='object') 16


In [20]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [21]:
d_train = lgb.Dataset(x_train, label=y_train)

In [22]:
params = {'learning_rate': 0.3,
              'application': 'binary',
              'num_leaves': 30,
              'verbosity': -1,
              'metric': 'binary_error',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}
# params = {
#     'learning_rate': 0.5,
#     'application': 'binary',
#     'num_boost_round': 100,
#     'nfold': 10,
#     'num_leaves': 31,
#     'verbosity': -1,
#     'metric': 'binary_error',
#     'data_random_seed': 2,
#     'bagging_fraction': 0.8,
#     'feature_fraction': 0.6,
#     'nthread': 4,
#     'lambda_l1': 1,
#     'lambda_l2': 1,
# }

In [23]:
lgbcv = lgb.cv(params, train_set=d_train, nfold=10, stratified=True)


In [24]:
min_error = min(lgbcv['valid binary_error-mean'])

# Accuracy is (1 - error)
accuracy = 1 - min_error
print(f"Cross-validated accuracy: {accuracy:.4f}")


Cross-validated accuracy: 0.9542


In [25]:
# def objective(trial):
#     # Suggest hyperparameters to try
#     param = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 150),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
#         "random_state": 42,
#         "objective": "binary",  # or "multiclass" if you're doing that
#     }

#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
#     acc_scores = []

#     for train_idx, valid_idx in skf.split(X, y):
#         X_train_fold = X.iloc[train_idx]
#         X_valid_fold = X.iloc[valid_idx]
#         y_train_fold = y.iloc[train_idx]
#         y_valid_fold = y.iloc[valid_idx]


#         train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
#         valid_data = lgb.Dataset(X_valid_fold, label=y_valid_fold)

#         gbm = lgb.train(param,
#                         train_data,
#                         valid_sets=[valid_data],
#                         num_boost_round=1000)

#         y_pred = gbm.predict(X_valid_fold)
#         y_pred_labels = (y_pred > 0.5).astype(int)
#         acc = accuracy_score(y_valid_fold, y_pred_labels)
#         acc_scores.append(acc)

#     return 1.0 - np.mean(acc_scores)  # Optuna minimizes, so 1 - accuracy

def objective(trial):
    # Suggest hyperparameters to try
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "objective": "binary",  # or "multiclass" if you're doing that
    }


    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_test, label=y_test)

    gbm = lgb.train(param,
                    train_data,
                    valid_sets=[valid_data],
                    num_boost_round=1000)

    y_pred = gbm.predict(x_test)
    y_pred_labels = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_labels)

    return 1.0 - acc  # Optuna minimizes, so 1 - accuracy


In [None]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-04-18 15:22:32,827] A new study created in memory with name: no-name-dec4830f-99ee-4c93-95c3-b097b0802596
[I 2025-04-18 15:22:33,199] Trial 0 finished with value: 0.03448275862068961 and parameters: {'learning_rate': 0.048681113163451424, 'n_estimators': 297, 'max_depth': 10, 'num_leaves': 43, 'min_child_samples': 11, 'subsample': 0.9183367313928923, 'colsample_bytree': 0.8055352745231061, 'reg_alpha': 1.725332833770288e-06, 'reg_lambda': 8.212559134809716e-06}. Best is trial 0 with value: 0.03448275862068961.
[I 2025-04-18 15:22:33,275] Trial 1 finished with value: 0.06034482758620685 and parameters: {'learning_rate': 0.02163519033241913, 'n_estimators': 219, 'max_depth': 5, 'num_leaves': 79, 'min_child_samples': 43, 'subsample': 0.9602123605135215, 'colsample_bytree': 0.6420342834652729, 'reg_alpha': 0.022858665129268214, 'reg_lambda': 5.639842639732859e-08}. Best is trial 0 with value: 0.03448275862068961.
[I 2025-04-18 15:22:33,503] Trial 2 finished with value: 0.0474137931