In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
from mrmr import mrmr_classif

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
selected_features = mrmr_classif(X=X, y=y, K=20)
print("Selected features:", selected_features)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 75.41it/s]

Selected features: ['Follicle_No_R', 'Menstrual_Irregularity', 'Weight_gainY_N', 'Antral_Follicle_Count', 'Testosterone_Levelng_dL', 'Follicle_No_L', 'Skin_darkening_Y_N', 'hair_growthY_N', 'CycleR_I', 'BMI', 'Fast_food_Y_N', 'AMHng_mL', 'Weight_Kg', 'PimplesY_N', 'Hipinch', 'Cycle_lengthdays', 'Waistinch', 'Hair_lossY_N', 'Avg_F_size_L_mm', 'Vit_D3_ng_mL']





In [8]:
X = X[selected_features]
X

Unnamed: 0,Follicle_No_R,Menstrual_Irregularity,Weight_gainY_N,Antral_Follicle_Count,Testosterone_Levelng_dL,Follicle_No_L,Skin_darkening_Y_N,hair_growthY_N,CycleR_I,BMI,Fast_food_Y_N,AMHng_mL,Weight_Kg,PimplesY_N,Hipinch,Cycle_lengthdays,Waistinch,Hair_lossY_N,Avg_F_size_L_mm,Vit_D3_ng_mL
0,-1.122146,0.656038,-1.086525,-0.417101,0.920702,-1.005762,-0.981684,-0.949879,-0.958011,-1.339000,1.309920,-0.905672,-1.586517,-1.403435,-0.862017,0.114388,-1.508471,-1.391141,1.252636,-0.103017
1,-0.494910,-0.744823,-1.086525,-0.551511,-0.683972,-1.005762,-0.981684,-0.949879,-0.958011,-0.159445,-1.443199,-1.033168,0.187034,-1.403435,-0.260918,0.114388,-0.851037,-1.391141,0.058525,0.102297
2,2.641270,1.122992,-1.086525,0.154145,0.412200,2.296913,-0.981684,-0.949879,-0.958011,-0.075192,1.309920,0.170958,0.517401,1.416979,0.340181,0.114388,0.463830,1.615519,1.252636,0.048414
3,-1.435764,0.656038,-1.086525,-0.215485,-0.415004,-1.336030,-0.981684,-0.949879,-0.958011,0.851601,-1.443199,-1.106360,0.187034,-1.403435,0.941279,0.114388,0.463830,-1.391141,0.058525,-0.027302
4,-0.808528,-0.277869,-1.086525,-0.249087,-0.029314,-1.005762,-0.981684,-0.949879,-0.958011,-1.170492,-1.443199,-0.860812,-0.943170,-1.403435,-0.561468,0.114388,-1.508471,1.615519,0.456562,0.021008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,-0.369463,1.122992,-1.086525,0.927007,1.825045,-0.675495,0.204341,-0.314841,-0.958011,-1.528571,-0.892575,-0.138337,-1.499578,-1.403435,-2.124324,0.114388,-2.363134,-0.789809,0.814795,-0.056380
1537,-0.181292,1.122992,0.458065,-1.761208,-1.585270,0.249254,-0.388671,-0.314841,-0.356547,0.683094,-1.443199,-0.600627,0.195728,-0.275269,0.460400,-0.080603,0.595317,-0.188477,0.217740,-0.054151
1538,-0.369463,-1.211777,-0.056799,1.767073,-1.387350,-0.147067,-0.981684,-0.314841,-0.356547,0.556713,-0.341951,-0.146365,0.567826,-0.839352,0.520510,0.504370,0.463830,-0.188477,0.456562,-0.036889
1539,-0.808528,-1.211777,1.487791,-1.425181,1.819970,-0.345227,0.797353,-0.314841,0.846383,0.346078,1.309920,0.041101,0.647810,-0.275269,0.821060,0.114388,1.187007,-0.188477,0.138133,-0.061769


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [10]:
# Define the objective function for Random Forest
def objective(trial):
    # Define hyperparameters for Random Forest
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 12, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42  # Set a fixed random seed for reproducibility
    }

    # Train the Random Forest model with the suggested hyperparameters
    model = RandomForestClassifier(**param)
    model.fit(x_train, y_train)

    # Predict on the validation set
    preds = model.predict(x_test)

    # Calculate accuracy
    acc = accuracy_score(y_test, preds)

    # Return the negative accuracy because Optuna minimizes the objective
    return 1.0 - acc  # Minimize the error (lower is better)

In [11]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-18 22:35:36,230] A new study created in memory with name: no-name-9f9d3521-df3d-4e8a-b9dd-3bd69dcfd6d7
[I 2025-05-18 22:35:36,382] Trial 0 finished with value: 0.025862068965517238 and parameters: {'n_estimators': 125, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.025862068965517238.
[I 2025-05-18 22:35:36,686] Trial 1 finished with value: 0.030172413793103425 and parameters: {'n_estimators': 122, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 0 with value: 0.025862068965517238.
[I 2025-05-18 22:35:36,925] Trial 2 finished with value: 0.012931034482758674 and parameters: {'n_estimators': 199, 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.012931034482758674.
[I 2025-05-18 22:35:36,977] Trial 3 finished with value: 0.07327586206896552 and parameters: {'n_estimators': 52, 'max_depth

Best hyperparameters:
{'n_estimators': 199, 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}
Best accuracy:
0.9870689655172413
