In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
model = RandomForestClassifier(n_estimators=100, random_state=8)
rfecv = RFECV(estimator=model, step=2, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
rfecv.fit(X, y)
selected_mask = rfecv.support_
selected_features = X.columns[selected_mask]
X = X[selected_features]
print(selected_features, len(selected_features))

Fitting estimator with 44 features.
Fitting estimator with 42 features.
Fitting estimator with 40 features.
Fitting estimator with 38 features.
Fitting estimator with 36 features.
Fitting estimator with 34 features.
Fitting estimator with 32 features.
Fitting estimator with 30 features.
Fitting estimator with 28 features.
Fitting estimator with 26 features.
Fitting estimator with 24 features.
Index(['Hipinch', 'Age', 'hair_growthY_N', 'Fast_food_Y_N',
       'Testosterone_Levelng_dL', 'Follicle_No_R', 'Follicle_No_L', 'PRLng_mL',
       'TSH_mIU_L', 'CycleR_I', 'Menstrual_Irregularity', 'Cycle_lengthdays',
       'FSH_LH', 'Weight_Kg', 'BMI', 'Skin_darkening_Y_N', 'Weight_gainY_N',
       'Antral_Follicle_Count', 'AMHng_mL', 'Waistinch', 'LHmIU_mL',
       'FSHmIU_mL'],
      dtype='object') 22


In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [9]:
# Define the objective function for Random Forest
def objective(trial):
    # Define hyperparameters for Random Forest
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 12, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42  # Set a fixed random seed for reproducibility
    }

    # Train the Random Forest model with the suggested hyperparameters
    model = RandomForestClassifier(**param)
    model.fit(x_train, y_train)

    # Predict on the validation set
    preds = model.predict(x_test)

    # Calculate accuracy
    acc = accuracy_score(y_test, preds)

    # Return the negative accuracy because Optuna minimizes the objective
    return 1.0 - acc  # Minimize the error (lower is better)

In [10]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 20:57:59,835] A new study created in memory with name: no-name-a486dcfc-3e94-4421-acef-174121669c7c
[I 2025-05-31 20:57:59,992] Trial 0 finished with value: 0.1637931034482759 and parameters: {'n_estimators': 189, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.1637931034482759.
[I 2025-05-31 20:58:00,272] Trial 1 finished with value: 0.06034482758620685 and parameters: {'n_estimators': 86, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 1 with value: 0.06034482758620685.
[I 2025-05-31 20:58:00,988] Trial 2 finished with value: 0.030172413793103425 and parameters: {'n_estimators': 177, 'max_depth': 11, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 2 with value: 0.030172413793103425.
[I 2025-05-31 20:58:01,136] Trial 3 finished with value: 0.0431034482758621 and parameters: {'n_estimators': 145, 'max_depth': 5, 'min

Best hyperparameters:
{'n_estimators': 64, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 'sqrt'}
Best accuracy:
0.978448275862069


In [11]:
# Get the data
best_model = RandomForestClassifier(**study.best_params)
best_model.fit(x_train, y_train)

# Predict
preds = best_model.predict(x_test)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")



Final evaluation on test set:
Accuracy: 97.84
Precision: 98.15
Recall: 92.98
F1 Score: 95.50
