In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os


In [32]:
df = pd.read_csv('../data/PCOS_data.csv')
df.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),Unnamed: 44
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,1.0,0,110,80,3,3,18.0,18.0,8.5,
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0.0,0,120,70,3,5,15.0,14.0,3.7,
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1.0,0,120,80,13,15,18.0,20.0,10.0,
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0.0,0,120,70,2,2,15.0,14.0,7.5,
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0.0,0,120,80,3,4,16.0,14.0,7.0,


In [33]:
df.columns

Index(['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)',
       'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)',
       'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of abortions',
       '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)',
       'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio',
       'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)',
       'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)',
       'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)',
       'Unnamed: 44'],
      dtype='object')

In [34]:
df.columns = df.columns.str.strip()

In [35]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

df.drop(columns=["Sl. No", "Patient File No."], errors='ignore', inplace=True)

target_col = "PCOS (Y/N)"

selected_features = [
    "Age (yrs)",
    "BMI",
    "Cycle(R/I)",
    "Cycle length(days)",
    "Weight gain(Y/N)",
    "hair growth(Y/N)",
    "Skin darkening (Y/N)",
    "Hair loss(Y/N)",
    "Pimples(Y/N)",
    "Fast food (Y/N)",
    "Reg.Exercise(Y/N)",
    "Marraige Status (Yrs)",     
    "Pregnant(Y/N)",
    "No. of abortions",
    "Hip(inch)",
    "Waist(inch)",
    "Waist:Hip Ratio",
    "Blood Group"
]

df = df[selected_features + [target_col]]

num_cols = [
    "Age (yrs)", "BMI", "Cycle length(days)", "Marraige Status (Yrs)",
    "No. of abortions", "Hip(inch)", "Waist(inch)", "Waist:Hip Ratio"
]
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

bin_cols = [
    "Cycle(R/I)", "Weight gain(Y/N)", "hair growth(Y/N)", "Skin darkening (Y/N)",
    "Hair loss(Y/N)", "Pimples(Y/N)", "Fast food (Y/N)", "Reg.Exercise(Y/N)",
    "Pregnant(Y/N)"
]
for col in bin_cols:
    df[col] = df[col].fillna("N")  

df["Blood Group"] = df["Blood Group"].fillna(df["Blood Group"].mode()[0])

binary_map = {"Y": 1, "N": 0, "R": 1, "I": 0}
for col in bin_cols + ["Cycle(R/I)"]:
    df[col] = df[col].map(binary_map)

blood_group_map = {
    "A+": 0, "A-": 1, "B+": 2, "B-": 3,
    "AB+": 4, "AB-": 5, "O+": 6, "O-": 7
}
df["Blood Group"] = df["Blood Group"].map(blood_group_map).fillna(-1).astype(int)


In [36]:
X = df[selected_features]
y = df[target_col]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

imputer = SimpleImputer(strategy="mean")
X_scaled_imputed = imputer.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_imputed, y, test_size=0.2, random_state=42
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [37]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

best_model = None
best_auc = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} Evaluation:")
    print(classification_report(y_test, y_pred))
    auc = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC: {auc:.4f}")

    if auc > best_auc:
        best_model = model
        best_auc = auc
        best_model_name = name



Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83        77
           1       0.60      0.38      0.46        32

    accuracy                           0.74       109
   macro avg       0.69      0.64      0.65       109
weighted avg       0.72      0.74      0.72       109

ROC-AUC: 0.6575

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        77
           1       0.60      0.56      0.58        32

    accuracy                           0.76       109
   macro avg       0.71      0.70      0.71       109
weighted avg       0.76      0.76      0.76       109

ROC-AUC: 0.7770

XGBoost Evaluation:
              precision    recall  f1-score   support

           0       0.79      0.75      0.77        77
           1       0.47      0.53      0.50        32

    accuracy                           0.69       109
   macro avg   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [38]:
MODEL_DIR = "../models"

os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_model, f"{MODEL_DIR}/pcos_model.pkl")
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")

print(f"\nBest model: {best_model_name} saved to model/pcos_model.pkl")


Best model: Random Forest saved to model/pcos_model.pkl
