In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegressionCV
import joblib
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the two raw CSV files into separate DataFrames
original = pd.read_csv(filepath_or_buffer="PCOS_data.csv")
new = pd.read_csv(filepath_or_buffer="pcos_dataset.csv")

# Lift pandas' cap on displayed columns so wide frames render in full
pd.options.display.max_columns = None

In [4]:
def preprocess(df):
    """Return a cleaned copy of a raw PCOS dataframe; the input is not modified.

    Steps, in order (each one is skipped when the column it targets is absent):

    1. Normalise column names: strip surrounding whitespace, turn spaces,
       '-' and '/' into '_', and delete '(', ')' and '.'.
       'II____beta_HCGmIU_mL' is then shortened to 'II_beta_HCG'.
    2. Drop 'Sl_No', 'Patient_File_No' and every column whose name starts
       with 'Unnamed'.
    3. Unify age under 'Age': rename 'Age_yrs' when 'Age' is missing,
       otherwise fill gaps in 'Age' from 'Age_yrs' and drop the latter.
    4. Rename 'PCOS_Diagnosis' to 'PCOS_Y_N'.
    5. Fill gaps in 'Marraige_Status_Yrs' with its median and gaps in
       'Fast_food_Y_N' with its most frequent value.
    6. Coerce 'II_beta_HCG' and 'AMHng_mL' to float (unparseable entries
       become NaN) and fill the gaps with the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataframe whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe.
    """
    df = df.copy()  # never touch the caller's frame

    # 1. Clean column names
    # A single literal translation table replaces the chain of str.replace
    # calls, so the result does not depend on the pandas-version-specific
    # default of the `regex` argument ('(', ')' and '.' are regex
    # metacharacters).
    name_table = str.maketrans({' ': '_', '-': '_', '/': '_',
                                '(': None, ')': None, '.': None})
    df.columns = df.columns.str.strip().str.translate(name_table)
    df = df.rename(columns={'II____beta_HCGmIU_mL': 'II_beta_HCG'})

    # 2. Drop irrelevant columns
    df = df.drop(columns=['Sl_No', 'Patient_File_No'], errors='ignore')
    # .copy() gives an independent frame, so the assignments below cannot
    # trigger SettingWithCopyWarning.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')].copy()

    # 3. Merge Age columns
    if 'Age_yrs' in df.columns:
        if 'Age' in df.columns:
            df['Age'] = df['Age'].fillna(df['Age_yrs'])
            df = df.drop(columns=['Age_yrs'])
        else:
            df = df.rename(columns={'Age_yrs': 'Age'})

    # 4. Merge PCOS diagnosis columns
    if 'PCOS_Diagnosis' in df.columns:
        df = df.rename(columns={'PCOS_Diagnosis': 'PCOS_Y_N'})

    # 5. Handle missing values
    if 'Marraige_Status_Yrs' in df.columns:
        years = df['Marraige_Status_Yrs']
        df['Marraige_Status_Yrs'] = years.fillna(years.median())

    if 'Fast_food_Y_N' not in df.columns and 'Fast_food_YN' in df.columns:
        df = df.rename(columns={'Fast_food_YN': 'Fast_food_Y_N'})
    if 'Fast_food_Y_N' in df.columns:
        modes = df['Fast_food_Y_N'].mode()
        # mode() is empty when the column has no observed value; in that case
        # there is nothing to fill with, so leave the NaNs in place instead of
        # failing on modes[0].
        if not modes.empty:
            df['Fast_food_Y_N'] = df['Fast_food_Y_N'].fillna(modes.iloc[0])

    # 6. Convert to numeric and fill missing values
    if 'II_beta_HCG' not in df.columns and 'II_beta_HCGmIU_mL' in df.columns:
        df = df.rename(columns={'II_beta_HCGmIU_mL': 'II_beta_HCG'})
    for col in ('II_beta_HCG', 'AMHng_mL'):
        if col in df.columns:
            # errors='coerce' maps unparseable entries to NaN so that they
            # are filled by the median together with the genuine gaps.
            numeric = pd.to_numeric(df[col], errors='coerce').astype(float)
            df[col] = numeric.fillna(numeric.median())

    return df

In [5]:
# Apply preprocessing
original_clean = preprocess(original)
new_clean = preprocess(new)

# Union of both schemas in a deterministic order: the columns of the first
# dataset as they appear, followed by the columns only the second one has.
# (Iterating a set of strings can yield a different order on every kernel
# start because of hash randomisation, which made the column layout of
# combined_df vary from run to run.)
all_columns = list(dict.fromkeys([*original_clean.columns, *new_clean.columns]))

# Align both dataframes to the same columns; columns a dataset lacks become NaN
original_aligned = original_clean.reindex(columns=all_columns)
new_aligned = new_clean.reindex(columns=all_columns)

# Stack the rows of both datasets under a fresh 0..n-1 index
combined_df = pd.concat([original_aligned, new_aligned], ignore_index=True)

In [6]:
# Separate features and target
X = combined_df.drop(columns=['PCOS_Y_N'])
y = combined_df['PCOS_Y_N']

# With default settings KNNImputer leaves columns that have no observed value
# out of its output, which would give fewer data columns than names when the
# DataFrame is rebuilt below. Drop such columns up front so that data and
# labels stay aligned.
X = X.dropna(axis=1, how='all')

# Initialize KNNImputer with 5 nearest neighbours
# NOTE(review): the imputer is fit on every row and on unscaled features, and
# neighbour averaging produces fractional values in 0/1 columns. If a
# train/test split happens later, consider fitting on the training part only.
knn_imputer = KNNImputer(n_neighbors=5)

# Rebuild a DataFrame around the imputed array, keeping row and column labels
X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns, index=X.index)

# Sanitize feature names: spaces become underscores; quotes, square brackets,
# braces, colons and commas are removed
name_cleanup = str.maketrans(' ', '_', '"\'[]{}:,')
X.columns = [str(col).translate(name_cleanup) for col in X.columns]

In [7]:
# Read the feature-name list stored under the "selected_features" key
with open("selected_features.json", "r") as features_file:
    feature_config = json.load(features_file)
selected_features = feature_config["selected_features"]

In [8]:
# Keep only the columns named in selected_features, then display the result
X = X.loc[:, selected_features]
X

Unnamed: 0,Menstrual_Irregularity,BMI,Testosterone_Levelng_dL,Follicle_No_R,Antral_Follicle_Count,Weight_gainY_N,hair_growthY_N,Skin_darkening_Y_N,Follicle_No_L,Weight_Kg,CycleR_I,PimplesY_N,AMHng_mL,Fast_food_Y_N,Hair_lossY_N,FSH_LH,Waistinch,Marraige_Status_Yrs,Hipinch,Age
0,0.8,19.3,77.88,3.0,15.0,0.0,0.0,0.0,3.0,44.60,2.0,0.0,2.070,1.0,0.0,2.160,30.0,7.0,36.0,28.0
1,0.2,24.9,46.26,5.0,14.2,0.0,0.0,0.0,3.0,65.00,2.0,0.0,1.530,0.0,0.0,6.170,32.0,11.0,38.0,36.0
2,1.0,25.3,67.86,15.0,18.4,0.0,0.0,0.0,13.0,68.80,2.0,1.0,6.630,1.0,1.0,6.300,36.0,10.0,40.0,33.0
3,0.8,29.7,51.56,2.0,16.2,0.0,0.0,0.0,2.0,65.00,2.0,0.0,1.220,0.0,0.0,3.420,36.0,4.0,42.0,37.0
4,0.4,20.1,59.16,4.0,16.0,0.0,0.0,0.0,3.0,52.00,2.0,0.0,2.260,0.0,1.0,4.420,30.0,1.0,37.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,1.0,18.4,95.70,5.4,23.0,0.0,0.2,0.4,4.0,45.60,2.0,0.0,5.320,0.2,0.2,3.452,27.4,7.8,31.8,34.0
1537,1.0,28.9,28.50,6.0,7.0,0.6,0.2,0.2,6.8,65.10,2.4,0.4,3.362,0.0,0.4,6.490,36.4,16.4,40.4,45.0
1538,0.0,28.3,32.40,5.4,28.0,0.4,0.2,0.0,5.6,69.38,2.4,0.2,5.286,0.4,0.4,3.712,36.0,11.4,40.6,37.0
1539,0.0,27.3,95.60,4.0,9.0,1.0,0.2,0.6,5.0,70.30,3.2,0.4,6.080,1.0,0.4,1.582,38.2,14.0,41.6,41.0
