In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
import catboost

In [13]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [14]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [15]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [16]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
model = catboost.CatBoostClassifier()
rfecv = RFECV(estimator=model, step=2, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
rfecv.fit(X, y)
selected_mask = rfecv.support_
selected_features = X.columns[selected_mask]
X = X[selected_features]
print(selected_features, len(selected_features))

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [20]:
from catboost import CatBoostClassifier, Pool

def objective(trial):
    # Define hyperparameters for CatBoost
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "loss_function": "Logloss",
        "eval_metric": "Accuracy",
        "random_seed": 8,
        "verbose": 0
    }

    # Prepare data
    train_pool = Pool(x_train, y_train)
    valid_pool = Pool(x_test, y_test)

    # Train model
    model = CatBoostClassifier(**param)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)

    # Predict
    preds = model.predict(valid_pool)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes, so lower is better


In [21]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-18 14:29:56,118] A new study created in memory with name: no-name-ffdd8d3a-3dea-4477-a680-998617146598
[I 2025-05-18 14:29:56,297] Trial 0 finished with value: 0.017241379310344862 and parameters: {'learning_rate': 0.13022614093861626, 'iterations': 227, 'depth': 8, 'subsample': 0.9464626026078092, 'random_strength': 0.7034003517567503, 'l2_leaf_reg': 0.032808499670678275}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-18 14:29:56,376] Trial 1 finished with value: 0.025862068965517238 and parameters: {'learning_rate': 0.010715502799134283, 'iterations': 110, 'depth': 6, 'subsample': 0.961986877520178, 'random_strength': 0.0016496546659977055, 'l2_leaf_reg': 0.48156329824985744}. Best is trial 0 with value: 0.017241379310344862.
[I 2025-05-18 14:29:56,433] Trial 2 finished with value: 0.025862068965517238 and parameters: {'learning_rate': 0.02262982117098629, 'iterations': 171, 'depth': 5, 'subsample': 0.7082536170295871, 'random_strength': 0.14355801855465597,

Best hyperparameters:
{'learning_rate': 0.07476417500869825, 'iterations': 325, 'depth': 7, 'subsample': 0.8988012655217306, 'random_strength': 0.42702681958190924, 'l2_leaf_reg': 0.40406456603658547}
Best accuracy:
0.9956896551724138
466:	learn: 0.4707525	total: 113ms	remaining: 129ms
467:	learn: 0.4707510	total: 113ms	remaining: 128ms
468:	learn: 0.4707501	total: 113ms	remaining: 128ms
469:	learn: 0.4707493	total: 113ms	remaining: 128ms
470:	learn: 0.4707454	total: 114ms	remaining: 128ms
471:	learn: 0.4707438	total: 114ms	remaining: 127ms
472:	learn: 0.4707429	total: 114ms	remaining: 127ms
473:	learn: 0.4707389	total: 114ms	remaining: 127ms
474:	learn: 0.4707403	total: 115ms	remaining: 127ms
475:	learn: 0.4707410	total: 115ms	remaining: 126ms
476:	learn: 0.4707383	total: 115ms	remaining: 126ms
477:	learn: 0.4707367	total: 115ms	remaining: 126ms
478:	learn: 0.4707341	total: 115ms	remaining: 126ms
479:	learn: 0.4707340	total: 116ms	remaining: 125ms
480:	learn: 0.4707319	total: 116ms	re