In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
from mrmr import mrmr_classif

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
selected_features = mrmr_classif(X=X, y=y, K=20)
print("Selected features:", selected_features)

100%|███████████████████████████████████████████████| 20/20 [00:00<00:00, 75.43it/s]

Selected features: ['Follicle_No_R', 'Menstrual_Irregularity', 'Weight_gainY_N', 'Antral_Follicle_Count', 'Testosterone_Levelng_dL', 'Follicle_No_L', 'Skin_darkening_Y_N', 'hair_growthY_N', 'CycleR_I', 'BMI', 'Fast_food_Y_N', 'AMHng_mL', 'Weight_Kg', 'PimplesY_N', 'Hipinch', 'Cycle_lengthdays', 'Waistinch', 'Hair_lossY_N', 'Avg_F_size_L_mm', 'Vit_D3_ng_mL']





In [8]:
X = X[selected_features]
X

Unnamed: 0,Follicle_No_R,Menstrual_Irregularity,Weight_gainY_N,Antral_Follicle_Count,Testosterone_Levelng_dL,Follicle_No_L,Skin_darkening_Y_N,hair_growthY_N,CycleR_I,BMI,Fast_food_Y_N,AMHng_mL,Weight_Kg,PimplesY_N,Hipinch,Cycle_lengthdays,Waistinch,Hair_lossY_N,Avg_F_size_L_mm,Vit_D3_ng_mL
0,-1.124880,0.656441,-1.085968,-0.416566,0.920950,-1.006065,-0.982161,-0.946682,-0.960042,-1.339000,1.308753,-0.906930,-1.586238,-1.401196,-0.863611,0.115673,-1.508890,-1.392127,1.253177,-0.103104
1,-0.497595,-0.743987,-1.085968,-0.550959,-0.683028,-1.006065,-0.982161,-0.946682,-0.960042,-0.159445,-1.443415,-1.034293,0.187693,-1.401196,-0.262570,0.115673,-0.852258,-1.392127,0.059097,0.102218
2,2.638831,1.123250,-1.085968,0.154604,0.412668,2.298308,-0.982161,-0.946682,-0.960042,-0.075192,1.308753,0.168583,0.518131,1.413984,0.338470,0.115673,0.461006,1.614977,1.253177,0.048332
3,-1.438522,0.656441,-1.085968,-0.214977,-0.414177,-1.336502,-0.982161,-0.946682,-0.960042,0.851601,-1.443415,-1.107409,0.187693,-1.401196,0.939511,0.115673,0.461006,-1.392127,0.059097,-0.027386
4,-0.811237,-0.277177,-1.085968,-0.248575,-0.028654,-1.006065,-0.982161,-0.946682,-0.960042,-1.170492,-1.443415,-0.862117,-0.942754,-1.401196,-0.563090,0.115673,-1.508890,1.614977,0.457124,0.020925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,-0.372138,1.123250,-1.085968,0.927365,1.824900,-0.675627,0.204906,-0.311300,-0.960042,-1.528571,-0.892981,-0.140391,-1.499281,-1.401196,-2.125796,0.115673,-2.362512,-0.790706,0.815348,-0.056465
1537,-0.183952,1.123250,0.455460,-1.760497,-1.583934,0.249597,-0.388628,-0.311300,-0.357302,0.683094,-1.443415,-0.602201,0.196388,-0.275124,0.458679,-0.079351,0.592332,-0.189286,0.218308,-0.054235
1538,-0.372138,-1.210796,-0.058350,1.767322,-1.386100,-0.146928,-0.982161,-0.311300,-0.357302,0.556713,-0.342548,-0.148411,0.568566,-0.838160,0.518783,0.505722,0.461006,-0.189286,0.457124,-0.036974
1539,-0.811237,-1.210796,1.483078,-1.424514,1.819827,-0.345190,0.798439,-0.311300,0.848177,0.346078,1.308753,0.038861,0.648567,-0.275124,0.819303,0.115673,1.183301,-0.189286,0.138702,-0.061854


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [10]:
# Define the objective function for Random Forest
def objective(trial):
    # Define hyperparameters for Random Forest
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 12, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42  # Set a fixed random seed for reproducibility
    }

    # Train the Random Forest model with the suggested hyperparameters
    model = RandomForestClassifier(**param)
    model.fit(x_train, y_train)

    # Predict on the validation set
    preds = model.predict(x_test)

    # Calculate accuracy
    acc = accuracy_score(y_test, preds)

    # Return the negative accuracy because Optuna minimizes the objective
    return 1.0 - acc  # Minimize the error (lower is better)

In [11]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 21:18:10,086] A new study created in memory with name: no-name-3e183b4b-a297-488b-b69c-0b3653416805
[I 2025-05-31 21:18:10,348] Trial 0 finished with value: 0.03448275862068961 and parameters: {'n_estimators': 94, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 0.03448275862068961.
[I 2025-05-31 21:18:10,573] Trial 1 finished with value: 0.025862068965517238 and parameters: {'n_estimators': 199, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 0.025862068965517238.
[I 2025-05-31 21:18:10,643] Trial 2 finished with value: 0.03448275862068961 and parameters: {'n_estimators': 69, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.025862068965517238.
[I 2025-05-31 21:18:11,077] Trial 3 finished with value: 0.06034482758620685 and parameters: {'n_estimators': 198, 'max_depth': 4

Best hyperparameters:
{'n_estimators': 106, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}
Best accuracy:
0.9870689655172413


In [12]:
# Get the data
best_model = RandomForestClassifier(**study.best_params)
best_model.fit(x_train, y_train)

# Predict
preds = best_model.predict(x_test)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")



Final evaluation on test set:
Accuracy: 97.84
Precision: 96.43
Recall: 94.74
F1 Score: 95.58
