In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
from sklearn.linear_model import LassoCV
import catboost

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X, y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [8]:
selected_mask = lasso.coef_ != 0
selected_features = list(X.columns[selected_mask])
print("Selected Features:", selected_features)

Selected Features: ['Skin_darkening_Y_N', 'HeightCm', 'AMHng_mL', 'Menstrual_Irregularity', 'Blood_Group', 'hair_growthY_N', 'Follicle_No_L', 'Antral_Follicle_Count', 'CycleR_I', 'LHmIU_mL', 'Fast_food_Y_N', 'Testosterone_Levelng_dL', 'BMI', 'Follicle_No_R', 'Weight_gainY_N']


In [9]:
X = X[selected_features]
X

Unnamed: 0,Skin_darkening_Y_N,HeightCm,AMHng_mL,Menstrual_Irregularity,Blood_Group,hair_growthY_N,Follicle_No_L,Antral_Follicle_Count,CycleR_I,LHmIU_mL,Fast_food_Y_N,Testosterone_Levelng_dL,BMI,Follicle_No_R,Weight_gainY_N
0,-0.983789,-1.088671,-0.905660,0.657215,0.836915,-0.948485,-1.005567,-0.416342,-0.959723,-0.021320,1.308685,0.921116,-1.339000,-1.123420,-1.087649
1,-0.983789,1.084828,-1.032985,-0.743572,0.836915,-0.948485,-1.005567,-0.550789,-0.959723,-0.069002,-1.444091,-0.683313,-0.159445,-0.497153,-1.087649
2,-0.983789,1.885590,0.169533,1.124144,-2.354192,-0.948485,2.290989,0.155059,-0.959723,-0.072868,1.308685,0.412692,-0.075192,2.634183,-1.087649
3,-0.983789,-2.003828,-1.106080,0.657215,-0.758639,-0.948485,-1.335223,-0.214671,-0.959723,-0.045621,-1.444091,-0.414386,0.851601,-1.436554,-1.087649
4,-0.983789,0.970433,-0.860860,-0.276643,-2.354192,-0.948485,-1.005567,-0.248282,-0.959723,-0.072500,-1.444091,-0.028755,-1.170492,-0.810287,-1.087649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,0.198753,-0.128122,-0.139349,1.124144,1.156026,-0.313276,-0.675912,0.928131,-0.959723,-0.046579,-0.893536,1.825321,-1.528571,-0.371899,-1.087649
1537,-0.392518,-1.134429,-0.601022,1.124144,0.358249,-0.313276,0.247124,-1.760814,-0.358286,-0.055011,-1.444091,-1.584473,0.683094,-0.184019,0.453798
1538,-0.983789,-0.310787,-0.147366,-1.210500,-0.120417,-0.313276,-0.148463,1.768427,-0.358286,-0.064105,-0.342980,-1.386583,0.556713,-0.371899,-0.060017
1539,0.790024,0.526765,0.039849,-1.210500,0.517804,-0.313276,-0.346256,-1.424696,0.844587,-0.022315,1.308685,1.820247,0.346078,-0.810287,1.481430


In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [11]:
from catboost import CatBoostClassifier, Pool

def objective(trial):
    # Define hyperparameters for CatBoost
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "loss_function": "Logloss",
        "eval_metric": "Accuracy",
        "random_seed": 8,
        "verbose": 0
    }

    # Prepare data
    train_pool = Pool(x_train, y_train)
    valid_pool = Pool(x_test, y_test)

    # Train model
    model = CatBoostClassifier(**param)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

    # Predict
    preds = model.predict(valid_pool)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes, so lower is better


In [12]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 21:11:41,794] A new study created in memory with name: no-name-8d5b31cd-5c92-47e3-be99-c65472d949a1
[I 2025-05-31 21:11:41,914] Trial 0 finished with value: 0.012931034482758674 and parameters: {'learning_rate': 0.07997796614071694, 'iterations': 467, 'depth': 5, 'subsample': 0.820709698569275, 'random_strength': 0.7234710849092388, 'l2_leaf_reg': 1.1135486679348467}. Best is trial 0 with value: 0.012931034482758674.
[I 2025-05-31 21:11:41,976] Trial 1 finished with value: 0.02155172413793105 and parameters: {'learning_rate': 0.04140372511770608, 'iterations': 130, 'depth': 6, 'subsample': 0.8015959543929914, 'random_strength': 0.03107950160347912, 'l2_leaf_reg': 0.008811419159319126}. Best is trial 0 with value: 0.012931034482758674.
[I 2025-05-31 21:11:42,593] Trial 2 finished with value: 0.025862068965517238 and parameters: {'learning_rate': 0.03591000834640088, 'iterations': 622, 'depth': 12, 'subsample': 0.8281556019622449, 'random_strength': 0.3091844765509227, 'l2_

Best hyperparameters:
{'learning_rate': 0.07997796614071694, 'iterations': 467, 'depth': 5, 'subsample': 0.820709698569275, 'random_strength': 0.7234710849092388, 'l2_leaf_reg': 1.1135486679348467}
Best accuracy:
0.9870689655172413


In [13]:
# Prepare data
train_pool = Pool(x_train, y_train)
valid_pool = Pool(x_test, y_test)

# Train model
best_model = CatBoostClassifier(**study.best_params)
best_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

# Predict
preds = best_model.predict(x_test)
preds = (preds > 0.5).astype(int)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")


0:	learn: 0.6118485	test: 0.6123693	best: 0.6123693 (0)	total: 1.21ms	remaining: 566ms
1:	learn: 0.5261195	test: 0.5230995	best: 0.5230995 (1)	total: 2.27ms	remaining: 527ms
2:	learn: 0.4603864	test: 0.4569208	best: 0.4569208 (2)	total: 2.97ms	remaining: 459ms
3:	learn: 0.4077206	test: 0.4022060	best: 0.4022060 (3)	total: 3.85ms	remaining: 446ms
4:	learn: 0.3761140	test: 0.3732764	best: 0.3732764 (4)	total: 4.62ms	remaining: 427ms
5:	learn: 0.3339026	test: 0.3266014	best: 0.3266014 (5)	total: 5.18ms	remaining: 398ms
6:	learn: 0.3016603	test: 0.2911257	best: 0.2911257 (6)	total: 5.91ms	remaining: 389ms
7:	learn: 0.2791217	test: 0.2687840	best: 0.2687840 (7)	total: 6.47ms	remaining: 371ms
8:	learn: 0.2639035	test: 0.2546934	best: 0.2546934 (8)	total: 7.38ms	remaining: 376ms
9:	learn: 0.2527586	test: 0.2445623	best: 0.2445623 (9)	total: 7.95ms	remaining: 363ms
10:	learn: 0.2365074	test: 0.2280854	best: 0.2280854 (10)	total: 8.58ms	remaining: 356ms
11:	learn: 0.2279202	test: 0.2200909	best