In [1]:

import pandas as pd
import numpy as np


# Load the raw PCOS dataset; the CSV must be in the notebook's working directory.
df = pd.read_csv("PCOS_data.csv")

# Normalise the headers: trim surrounding whitespace, turn spaces into
# underscores, then drop every remaining non-word character.
df.columns = df.columns.str.strip().str.replace(' ', '_').str.replace(r'[^\w]', '', regex=True)

# Force these two columns to numeric. With errors='coerce', any entry that
# cannot be parsed becomes NaN, so they must be imputed below as well.
coerced_cols = ['II____betaHCGmIUmL', 'AMHngmL']
for col in coerced_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Median-impute every column that may contain NaN at this point: the two
# columns that were already being filled plus the two coerced columns, whose
# NaNs were previously left in place and leaked into the feature matrix.
for col in ['Marraige_Status_Yrs', 'Fast_food_YN'] + coerced_cols:
    df[col] = df[col].fillna(df[col].median())

# Drop the two identifier columns so they are not part of the frame used downstream.
df = df.drop(columns=['Sl_No', 'Patient_File_No'])

print("Data cleaned successfully!\n")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print(df.head())


✅ Data cleaned successfully!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 42 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PCOS_YN              541 non-null    int64  
 1   Age_yrs              541 non-null    int64  
 2   Weight_Kg            541 non-null    float64
 3   HeightCm             541 non-null    float64
 4   BMI                  541 non-null    float64
 5   Blood_Group          541 non-null    int64  
 6   Pulse_ratebpm        541 non-null    int64  
 7   RR_breathsmin        541 non-null    int64  
 8   Hbgdl                541 non-null    float64
 9   CycleRI              541 non-null    int64  
 10  Cycle_lengthdays     541 non-null    int64  
 11  Marraige_Status_Yrs  541 non-null    float64
 12  PregnantYN           541 non-null    int64  
 13  No_of_abortions      541 non-null    int64  
 14  I___betaHCGmIUmL     541 non-null    float64
 15  II____beta

In [2]:
from sklearn.model_selection import train_test_split


# Target vector: the PCOS_YN column; feature matrix: every other column of df.
y = df['PCOS_YN']
X = df.drop(columns=['PCOS_YN'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit() returns the estimator itself, so the calls can be chained).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out split.
train_accuracy = rf.score(X_train, y_train)
test_accuracy = rf.score(X_test, y_test)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 1.0
Test Accuracy: 0.8715596330275229


In [4]:

# Per-class probabilities for every held-out row; keep column 1 only.
# Column 1 is the second entry of rf.classes_ — presumably PCOS_YN == 1; confirm.
class_probabilities = rf.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Show the first five predictions as percentages.
preview = y_proba[:5]
for i, prob in enumerate(preview):
    print(f"Sample {i+1}: {prob:.2%} risk of PCOS")


Sample 1: 48.00% risk of PCOS
Sample 2: 18.00% risk of PCOS
Sample 3: 20.00% risk of PCOS
Sample 4: 29.00% risk of PCOS
Sample 5: 69.00% risk of PCOS


In [5]:
import joblib
# Persist the fitted model together with the list of feature column names of X,
# so the column order used for training is saved alongside the model.
feature_names = X.columns.tolist()
joblib.dump(rf, 'pcos_rf_model.pkl')
joblib.dump(feature_names, 'pcos_features.pkl')

['pcos_features.pkl']