In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib

In [17]:
file_path = r'C:/Users/soura/Desktop/Minor Project/Gestational Doiabetes/Dataset/GS.xlsx'
try:
    data = pd.read_excel(file_path)
    print("Dataset loaded successfully!")
    # Standardize column names to lowercase with underscores.
    data.columns = data.columns.str.lower().str.replace(" ", "_")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!


In [21]:
try:
    data.columns = data.columns.str.lower().str.strip().str.replace(" ", "_")
    print("Columns after standardization:", data.columns.tolist())
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")


Columns after standardization: ['case_number', 'age', 'noofpregnancy', 'gestation_in_previous_pregnancy', 'bmi', 'hdl', 'family_history', 'unexplained_prenetal_loss', 'large_child_or_birth_default', 'pcos', 'sys_bp', 'dia_bp', 'ogtt', 'hemoglobin', 'sedentary_lifestyle', 'prediabetes', 'class_label(gdm_/non_gdm)']


In [22]:
feature_cols = ['age', 'noofpregnancy', 'bmi', 'sys_bp', 'dia_bp', 'ogtt']
target_col = 'class_label(gdm_/non_gdm)'  # Adjust this if your target column name differs

In [23]:
missing_cols = [col for col in feature_cols if col not in data.columns]
if missing_cols:
    raise KeyError(f"The following columns are missing from the dataset: {missing_cols}")

In [24]:
X = data[feature_cols]
y = data[target_col]

In [25]:
X = X.fillna(X.mean())

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

In [28]:
rf_params = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 5, 10],
    'randomforestclassifier__min_samples_split': [2, 5, 10]
}

In [29]:
grid = GridSearchCV(pipeline, rf_params, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)
print("Best Parameters for Random Forest:", grid.best_params_)
print("Best ROC-AUC Score for Random Forest:", grid.best_score_)

Best Parameters for Random Forest: {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 50}
Best ROC-AUC Score for Random Forest: 0.9981839222545139


In [30]:
y_pred = grid.predict(X_test)
print("Classification Report on Test Data:\n", classification_report(y_test, y_pred))

Classification Report on Test Data:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       448
           1       0.94      0.96      0.95       257

    accuracy                           0.96       705
   macro avg       0.96      0.96      0.96       705
weighted avg       0.96      0.96      0.96       705



In [31]:
joblib.dump(grid.best_estimator_, "best_model.pkl")
print("Best Random Forest model saved successfully at best_model.pkl")

Best Random Forest model saved successfully at best_model.pkl
