In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os


In [6]:
df = pd.read_csv('../data/PCOS_data.csv')
df.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),Unnamed: 44
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,1.0,0,110,80,3,3,18.0,18.0,8.5,
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0.0,0,120,70,3,5,15.0,14.0,3.7,
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1.0,0,120,80,13,15,18.0,20.0,10.0,
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0.0,0,120,70,2,2,15.0,14.0,7.5,
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0.0,0,120,80,3,4,16.0,14.0,7.0,


In [7]:
df.columns

Index(['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)',
       'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)',
       'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of abortions',
       '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)',
       'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio',
       'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)',
       'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)',
       'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)',
       'Unnamed: 44'],
      dtype='object')

In [8]:
df.columns = df.columns.str.strip()

In [9]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Drop ID columns
df.drop(columns=["Sl. No", "Patient File No."], errors='ignore', inplace=True)

# Target column
target_col = "PCOS (Y/N)"

# Selected features (user input-friendly)
selected_features = [
    "Age (yrs)",
    "BMI",
    "Cycle(R/I)",
    "Cycle length(days)",
    "Weight gain(Y/N)",
    "hair growth(Y/N)",
    "Skin darkening (Y/N)",
    "Hair loss(Y/N)",
    "Pimples(Y/N)",
    "Fast food (Y/N)",
    "Reg.Exercise(Y/N)",
]

# Subset dataframe
df = df[selected_features + [target_col]]

# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode categorical features
binary_map = {"Y": 1, "N": 0, "R": 1, "I": 0}
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].map(binary_map)

In [10]:
X = df[selected_features]
y = df[target_col]

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [11]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

best_model = None
best_auc = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} Evaluation:")
    print(classification_report(y_test, y_pred))
    auc = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC: {auc:.4f}")

    if auc > best_auc:
        best_model = model
        best_auc = auc
        best_model_name = name



Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92        77
           1       0.83      0.75      0.79        32

    accuracy                           0.88       109
   macro avg       0.86      0.84      0.85       109
weighted avg       0.88      0.88      0.88       109

ROC-AUC: 0.9119

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.84      0.94      0.88        77
           1       0.78      0.56      0.65        32

    accuracy                           0.83       109
   macro avg       0.81      0.75      0.77       109
weighted avg       0.82      0.83      0.82       109

ROC-AUC: 0.8977

XGBoost Evaluation:
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        77
           1       0.77      0.62      0.69        32

    accuracy                           0.83       109
   macro avg   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
MODEL_DIR = "../models"

os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_model, f"{MODEL_DIR}/pcos_model.pkl")
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")

print(f"\nBest model: {best_model_name} saved to model/pcos_model.pkl")


Best model: Logistic Regression saved to model/pcos_model.pkl
