In [31]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

# File paths
DATA_PATH = "pima_indians.csv"
MODEL_PATH = "pima_xgb_model.pkl"
FEATURES_PATH = "pima_feature_names.pkl"

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Define features and target variable
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Hyperparameter grid for tuning
param_grid = {
    'n_estimators': [48],  # Number of boosting rounds
    'max_depth': [3],          # Maximum depth of a tree
    'learning_rate': [0.1],  # Learning rate
    'subsample': [0.8],         # Subsample ratio of the training instances
    'colsample_bytree': [1.0],  # Subsample ratio of columns when constructing each tree
    'gamma': [0],          # Minimum loss reduction required to make a split
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',  # Metric to optimize
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    verbose=1  # Print progress
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Save the best model and feature names
joblib.dump(best_model, MODEL_PATH)
joblib.dump(X.columns.tolist(), FEATURES_PATH)
print("Model and feature names saved successfully.")

# Evaluate the best model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 48, 'subsample': 0.8}
Model and feature names saved successfully.
Accuracy: 0.7604
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       123
           1       0.66      0.70      0.68        69

    accuracy                           0.76       192
   macro avg       0.74      0.75      0.74       192
weighted avg       0.76      0.76      0.76       192



Parameters: { "use_label_encoder" } are not used.

