# PCOS Prediction
## Imports

In [None]:
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os

import joblib
import numpy as np

## Preprocessing

In [6]:
df = pd.read_csv('../data/PCOS_data.csv')


In [7]:
df.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),Unnamed: 44
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,1.0,0,110,80,3,3,18.0,18.0,8.5,
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0.0,0,120,70,3,5,15.0,14.0,3.7,
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1.0,0,120,80,13,15,18.0,20.0,10.0,
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0.0,0,120,70,2,2,15.0,14.0,7.5,
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0.0,0,120,80,3,4,16.0,14.0,7.0,


In [9]:
df.drop(columns=["Sl. No", "Patient File No.", "Unnamed: 44"], errors="ignore", inplace=True)

# Convert all columns except the target to numeric, coerce errors to NaN
for col in df.columns:
	if col != "PCOS (Y/N)":
		df[col] = pd.to_numeric(df[col], errors="coerce")

# Handle missing values
imputer = SimpleImputer(strategy="mean")
df[df.columns] = imputer.fit_transform(df)

y = df["PCOS (Y/N)"]
X = df.drop(columns=["PCOS (Y/N)"])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Training

In [None]:
MODEL_DIR = "../models"

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("🔢 AUC Score:", roc_auc_score(y_test, y_proba))

os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(model, f"{MODEL_DIR}/pcos_model.pkl")
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")
print("✅ Model and scaler saved.")


📊 Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88        77
         1.0       0.72      0.72      0.72        32

    accuracy                           0.83       109
   macro avg       0.80      0.80      0.80       109
weighted avg       0.83      0.83      0.83       109

🔢 AUC Score: 0.924512987012987
✅ Model and scaler saved.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:


def load_model_and_scaler():
    model = joblib.load("models/pcos_model.pkl")
    scaler = joblib.load("models/scaler.pkl")
    return model, scaler

def make_prediction(model, scaler, raw_input_list):
    input_scaled = scaler.transform([raw_input_list])
    prediction = model.predict(input_scaled)[0]
    probability = model.predict_proba(input_scaled)[0][1]
    return int(prediction), round(float(probability), 2)
