In [5]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.pipeline import Pipeline          # <-- imbalanced‐learn pipeline
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform

# 1) Load and split
df = pd.read_csv("diabetes_dataset.csv")
y = df["Previous_Gestational_Diabetes"]
X = df.drop(["Unnamed: 0", "Previous_Gestational_Diabetes"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2) Preprocessing: dense one‐hot encode categorical only
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
preprocessor = ColumnTransformer([
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
], remainder="passthrough")

# 3) Build imblearn pipeline
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", XGBClassifier(
        random_state=42,
        eval_metric="logloss",
        # no use_label_encoder flag needed
        scale_pos_weight=(y_train==0).sum()/(y_train==1).sum()
    )),
])

# 4) Hyperparameter distributions
param_dist = {
    "clf__n_estimators": randint(50, 300),
    "clf__max_depth":     randint(3, 12),
    "clf__learning_rate": uniform(0.01, 0.4),
    "clf__subsample":     uniform(0.6, 0.4),
    "clf__colsample_bytree": uniform(0.6, 0.4),
    "clf__gamma":         uniform(0, 5),
    "clf__min_child_weight": randint(1, 10),
}

# 5) Randomized search
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
search.fit(X_train, y_train)

# 6) Evaluate
best = search.best_estimator_
print("Best params:", search.best_params_)
print("Test accuracy:", best.score(X_test, y_test))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'clf__colsample_bytree': 0.6185801650879991, 'clf__gamma': 3.0377242595071916, 'clf__learning_rate': 0.07820964947491661, 'clf__max_depth': 9, 'clf__min_child_weight': 2, 'clf__n_estimators': 181, 'clf__subsample': 0.9768807022739411}
Test accuracy: 0.479


In [3]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# 1) Load
df = pd.read_csv("diabetes_dataset.csv")

# 2) Define target & features
y = df["Previous_Gestational_Diabetes"]
X = df.drop(["Unnamed: 0", "Previous_Gestational_Diabetes"], axis=1)

# 3) Identify categorical vs. numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# 4) Build a preprocessing + modeling pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="passthrough"  # leave numeric columns untouched
)

model = Pipeline([
    ("preproc", preprocessor),
    ("clf", XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])

# 5) Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 6) Train
model.fit(X_train, y_train)

# 7) Evaluate
preds = model.predict(X_test)
accuracy = (preds == y_test).mean()
print(f"Test Accuracy: {accuracy:.3f}")


Test Accuracy: 0.504
