In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both datasets
original = pd.read_csv("PCOS_data.csv")
new = pd.read_csv("pcos_dataset.csv")

# Set max columns to show is unlimited
pd.set_option('display.max_columns', None)

In [3]:
def preprocess(df):
    df = df.copy()  # avoid SettingWithCopyWarning

    # 1. Clean column names
    df.columns = df.columns.str.strip() \
                           .str.replace(' ', '_') \
                           .str.replace('(', '') \
                           .str.replace(')', '') \
                           .str.replace('.', '') \
                           .str.replace('-', '_') \
                           .str.replace('/', '_')
    df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)

    # 2. Drop irrelevant columns
    df.drop(columns=['Sl_No', 'Patient_File_No'], inplace=True, errors='ignore')
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # 3. Merge Age columns
    if 'Age' not in df.columns and 'Age_yrs' in df.columns:
        df.rename(columns={'Age_yrs': 'Age'}, inplace=True)
    elif 'Age' in df.columns and 'Age_yrs' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age_yrs'])
        df.drop(columns=['Age_yrs'], inplace=True)

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'}, inplace=True)

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        df.loc[:, 'Marraige_Status_Yrs'] = df['Marraige_Status_Yrs'].fillna(df['Marraige_Status_Yrs'].median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'}, inplace=True)
    if 'Fast_food_Y_N' in df.columns:
        df.loc[:, 'Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(df['Fast_food_Y_N'].mode()[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'}, inplace=True)
    if 'II_beta_HCG' in df.columns:
        df.loc[:, 'II_beta_HCG'] = pd.to_numeric(df['II_beta_HCG'], errors='coerce')
        df['II_beta_HCG'] = df['II_beta_HCG'].astype(float)
        df.loc[:, 'II_beta_HCG'] = df['II_beta_HCG'].fillna(df['II_beta_HCG'].median())

    if 'AMHng_mL' in df.columns:
        df.loc[:, 'AMHng_mL'] = pd.to_numeric(df['AMHng_mL'], errors='coerce')
        df['AMHng_mL'] = df['AMHng_mL'].astype(float)
        df.loc[:, 'AMHng_mL'] = df['AMHng_mL'].fillna(df['AMHng_mL'].median())

    return df

In [4]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Ensure consistent columns across both
all_columns = list(set(original_clean.columns).union(set(new_clean.columns)))

# Align both dataframes to same columns, fill missing with NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Concatenate datasets
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [5]:
# Separate features and target
# X = combined_df.drop(columns=['PCOS_Y_N'])
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# Initialize KNNImputer (e.g., with 5 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns)

# Sanitize feature names
X.columns = [str(col).replace(' ', '_')
                        .replace('"', '')
                        .replace("'", '')
                        .replace('[', '')
                        .replace(']', '')
                        .replace('{', '')
                        .replace('}', '')
                        .replace(':', '')
                        .replace(',', '')
                        for col in X.columns]

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
# Train the model
model = xgb.XGBClassifier()
model.fit(X, y)

In [8]:
# Use SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [9]:
shap_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top N features
top_features = shap_df['feature'].head(20).tolist()

In [10]:
top_features

['Menstrual_Irregularity',
 'BMI',
 'Follicle_No_R',
 'Testosterone_Levelng_dL',
 'Antral_Follicle_Count',
 'hair_growthY_N',
 'Weight_gainY_N',
 'CycleR_I',
 'Skin_darkening_Y_N',
 'PimplesY_N',
 'Follicle_No_L',
 'FSH_LH',
 'AMHng_mL',
 'Avg_F_size_R_mm',
 'Weight_Kg',
 'RBSmg_dl',
 'LHmIU_mL',
 'PRLng_mL',
 'Age',
 'Waistinch']

In [11]:
X = X[top_features]
X

Unnamed: 0,Menstrual_Irregularity,BMI,Follicle_No_R,Testosterone_Levelng_dL,Antral_Follicle_Count,hair_growthY_N,Weight_gainY_N,CycleR_I,Skin_darkening_Y_N,PimplesY_N,Follicle_No_L,FSH_LH,AMHng_mL,Avg_F_size_R_mm,Weight_Kg,RBSmg_dl,LHmIU_mL,PRLng_mL,Age,Waistinch
0,0.656038,-1.339000,-1.122836,0.920872,-0.416975,-0.946420,-1.086381,-0.959723,-0.988198,-1.399962,-1.006224,-0.112472,-0.911381,1.182416,-1.586049,-0.558343,-0.021249,2.151234,-0.484837,-1.508897
1,-0.744823,-0.159445,-0.496464,-0.683807,-0.551388,-0.946420,-1.086381,-0.959723,-0.988198,-1.399962,-1.006224,-0.009411,-1.039082,-0.535430,0.188001,-0.558343,-0.068930,-0.317319,0.577375,-0.851891
2,1.122992,-0.075192,2.635397,0.412368,0.154278,-0.946420,-1.086381,-0.959723,-0.988198,1.419355,2.291223,-0.006070,0.166979,2.041339,0.518461,-1.061131,-0.072796,-1.259643,0.179046,0.462121
3,0.656038,0.851601,-1.436022,-0.414839,-0.215357,-0.946420,-1.086381,-0.959723,-0.988198,-1.399962,-1.335969,-0.080089,-1.112391,-0.535430,0.188001,-1.563919,-0.045550,1.337902,0.710152,0.462121
4,-0.277869,-1.170492,-0.809650,-0.029147,-0.248960,-0.946420,-1.086381,-0.959723,-0.988198,-1.399962,-1.006224,-0.054388,-0.866449,-0.535430,-0.942521,-1.061131,-0.072428,0.667345,-0.883166,-1.508897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.122992,-1.528571,-0.371189,1.825216,0.927150,-0.312870,-1.086381,-0.959723,0.194570,-1.399962,-0.676480,-0.079266,-0.142813,0.323493,-1.499086,0.070142,-0.046507,0.569667,0.311822,-2.363005
1537,1.122992,0.683094,-0.183278,-1.585107,-1.761101,-0.312870,0.454214,-0.358286,-0.396814,-0.272235,0.246806,-0.001187,-0.605846,-0.020076,0.196697,-0.068124,-0.054939,-0.988269,1.772363,0.593523
1538,-1.211777,0.556713,-0.371189,-1.387187,1.767228,-0.312870,-0.059318,-0.358286,-0.988198,-0.836099,-0.148888,-0.072584,-0.150854,0.151708,0.568900,0.849464,-0.064033,-0.542610,0.710152,0.462121
1539,-1.211777,0.346078,-0.809650,1.820141,-1.425069,-0.312870,1.481277,0.844587,0.785954,-0.272235,-0.346735,-0.127327,0.036914,-0.277753,0.648906,-0.633761,-0.022243,-0.176315,1.241257,1.184828


In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, shuffle=True, random_state=8)

In [13]:
def objective(trial):
    
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 8,
        "use_label_encoder": False,
    }

    train_pool = xgb.DMatrix(x_train, y_train)
    valid_pool = xgb.DMatrix(x_test, y_test)
    
    # Train model
    model = xgb.XGBClassifier(**param)
    model.fit(x_train, y_train,
              eval_set=[(x_test, y_test)],
              verbose=False)

    # Predict
    preds = model.predict(x_test)
    acc = accuracy_score(y_test, preds)

    return 1.0 - acc  # Optuna minimizes

In [14]:
study = optuna.create_study(direction='minimize')  # Because we return 1 - accuracy
study.optimize(objective, n_trials=50)  # Try 50 combinations (or more!)

print("Best hyperparameters:")
print(study.best_params)

print("Best accuracy:")
print(1 - study.best_value)

[I 2025-05-31 20:07:50,664] A new study created in memory with name: no-name-4d7021d7-f5f9-40cd-825e-fdb0d838a93c
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-31 20:07:51,043] Trial 0 finished with value: 0.025862068965517238 and parameters: {'learning_rate': 0.22362521127598226, 'n_estimators': 650, 'max_depth': 7, 'subsample': 0.85498117630046, 'colsample_bytree': 0.8491602164265448, 'reg_alpha': 0.8413912529457798, 'reg_lambda': 0.0013793988161473257}. Best is trial 0 with value: 0.025862068965517238.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-31 20:07:51,311] Trial 1 finished with value: 0.0431034482758621 and parameters: {'learning_rate': 0.01669729585203327, 'n_estimators': 151, 'max_depth': 10, 'subsample': 0.9020933175549183, 'colsample_bytree': 0.8895556722369756, 'reg_alpha': 2.3267126691477764, 'reg_lambda': 1.025888551919279}. Best is trial 0 with valu

Best hyperparameters:
{'learning_rate': 0.01625669577357544, 'n_estimators': 479, 'max_depth': 9, 'subsample': 0.7720251785180374, 'colsample_bytree': 0.9012143267773818, 'reg_alpha': 2.2774305893018276, 'reg_lambda': 2.6271042155977593}
Best accuracy:
0.9827586206896551


In [22]:
# Get the data
best_model = xgb.XGBClassifier(**study.best_params)
best_model.fit(x_train, y_train,
          eval_set=[(x_test, y_test)],
          verbose=False)

# Predict
preds = best_model.predict(x_test)

# Compute final scores
acc = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='binary')
recall = recall_score(y_test, preds, average='binary')
f1 = f1_score(y_test, preds, average='binary')

print("\nFinal evaluation on test set:")
print(f"Accuracy: {acc * 100:.2f}")
print(f"Precision: {precision * 100:.2f}")
print(f"Recall: {recall * 100:.2f}")
print(f"F1 Score: {f1 * 100:.2f}")



Final evaluation on test set:
Accuracy: 97.41
Precision: 96.36
Recall: 92.98
F1 Score: 94.64
