In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
file_path = r'C:/Users/soura/Desktop/Minor Project/Gestational Doiabetes/Dataset/GS.xlsx'

In [3]:
try:
    data = pd.read_excel(file_path)
    print("Dataset loaded successfully!")
    data.head()
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!


In [4]:
data.head()

Unnamed: 0,Case Number,Age,No of Pregnancy,Gestation in previous Pregnancy,BMI,HDL,Family History,unexplained prenetal loss,Large Child or Birth Default,PCOS,Sys BP,Dia BP,OGTT,Hemoglobin,Sedentary Lifestyle,Prediabetes,Class Label(GDM /Non GDM)
0,1,22,2,1,,55.0,0,0,0,0,102.0,69,,12.0,0,0,0
1,2,26,2,1,,53.0,0,0,0,0,101.0,63,,12.4,0,0,0
2,3,29,1,0,,50.0,0,0,0,0,118.0,79,,14.3,0,0,0
3,4,28,2,1,,51.0,0,0,0,0,99.0,70,,15.0,0,0,0
4,5,21,2,1,,52.0,0,0,0,0,116.0,65,,15.0,0,0,0


In [5]:
missing_values = data.isnull().sum()
missing_values[missing_values > 0]

BMI       1081
HDL       1001
Sys BP    1705
OGTT       513
dtype: int64

In [6]:
numeric_columns = ['BMI', 'HDL', 'Sys BP', 'OGTT']
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Verify if missing values are handled
data.isnull().sum()


Case Number                        0
Age                                0
No of Pregnancy                    0
Gestation in previous Pregnancy    0
BMI                                0
HDL                                0
Family History                     0
unexplained prenetal loss          0
Large Child or Birth Default       0
PCOS                               0
Sys BP                             0
Dia BP                             0
OGTT                               0
Hemoglobin                         0
Sedentary Lifestyle                0
Prediabetes                        0
Class Label(GDM /Non GDM)          0
dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['Case Number', 'Class Label(GDM /Non GDM)'])  # Drop ID and target columns
y = data['Class Label(GDM /Non GDM)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((2820, 15), (705, 15), (2820,), (705,))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_auc_score = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

xgb_report = classification_report(y_test, xgb_pred)
print("XGBoost:\nAUC Score:",xgb_auc_score, "\nClassification Report:",xgb_report)

Parameters: { "use_label_encoder" } are not used.



XGBoost:
AUC Score: 0.9964911061700945 
Classification Report:               precision    recall  f1-score   support

           0       0.97      0.98      0.98       448
           1       0.96      0.95      0.96       257

    accuracy                           0.97       705
   macro avg       0.97      0.97      0.97       705
weighted avg       0.97      0.97      0.97       705



In [9]:
from sklearn.model_selection import GridSearchCV

In [11]:
from xgboost import XGBClassifier

xgb_params = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 6, 10],  # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'subsample': [0.8, 1.0],  # Fraction of samples used per tree
    'colsample_bytree': [0.8, 1.0]  # Fraction of features used per split
}

xgb_grid = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),  # Removed use_label_encoder
    xgb_params,
    cv=5,  # 5-fold cross-validation
    scoring='roc_auc'
)

xgb_grid.fit(X_train, y_train)

print("Best Parameters for XGBoost:", xgb_grid.best_params_)
print("Best ROC-AUC Score for XGBoost:", xgb_grid.best_score_)

Best Parameters for XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best ROC-AUC Score for XGBoost: 0.9984009047512592


In [12]:
# xgb_importance = xgb_grid.best_estimator_.feature_importances_
# features = X_train.columns

# plt.figure(figsize=(10, 6))
# plt.barh(features, xgb_importance)
# plt.xlabel("Feature Importance")
# plt.title("XGBoost Feature Importance")
# plt.show()

In [13]:
import joblib
from xgboost import XGBClassifier  # Import XGBoost

# Assuming you have trained XGBoostClassifier
best_model = xgb_grid.best_estimator_  # Use the best model after tuning

# Save the trained XGBoost model
joblib.dump(best_model, "best_model.pkl")

print("Best XGBoost model saved successfully at best_model.pkl")

Best XGBoost model saved successfully at best_model.pkl
