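# XGBoost

Grid search over XGBoost hyperparameters for predicting bug-introducing commits from commit metrics and embedding features, with a chronological train/test split and every run tracked in MLflow.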


In [2]:
import json
import os
import tempfile
import time

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

print("Libraries imported successfully.")

Libraries imported successfully.


## Configuration


In [4]:
# Search space for the grid search: every combination of the values below is
# trained once.
# NOTE(review): max_depth=None leaves the depth unset, so XGBoost presumably
# falls back to its own default rather than growing unbounded trees — confirm
# that this is the intended baseline.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20],
    learning_rate=[0.05, 0.1],
    subsample=[0.7, 1.0],
)

# Materialise the Cartesian product so it can be counted and iterated with a
# progress bar.
grid = [combination for combination in ParameterGrid(param_grid)]

print(f"Created a grid with {len(grid)} hyperparameter combinations to test.")

Created a grid with 48 hyperparameter combinations to test.


## Load and Split Data


In [9]:
FINAL_DATASET_PATH = "data/final_embedding_dataset.csv"
TARGET_COLUMN = "is_bug_introducing"
# Identifier / metadata columns plus the target: none of these are model inputs.
NON_FEATURE_COLUMNS = [
    "commit_hash",
    "author_email",
    "commit_date",
    TARGET_COLUMN,
    "diff",
]
# Rows missing any of these commit metadata fields are discarded before splitting.
REQUIRED_COLUMNS = ["diff", "commit_hash", "author_email", "commit_date"]
# Fraction of the (chronologically ordered) commits used for training.
TRAIN_FRACTION = 0.80

df = pd.read_csv(FINAL_DATASET_PATH)

# Commit timestamps can carry differing UTC offsets; utc=True converts them all
# to a single tz-aware datetime dtype so the chronological sort is well-defined.
df["commit_date"] = pd.to_datetime(df["commit_date"], utc=True)

# Filter incomplete rows *before* deriving X / y so the filter actually reaches
# the train/test split, then order commits oldest-first (no in-place mutation).
df = df.dropna(subset=REQUIRED_COLUMNS).sort_values(by="commit_date")

X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df[TARGET_COLUMN]

# Chronological split: the model is trained on the oldest commits and evaluated
# on the most recent ones, so no future information leaks into training.
split_point = int(len(df) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

assert len(X_train) + len(X_test) == len(df), "split must cover every row exactly once"
assert len(X_train) > 0 and len(X_test) > 0, "both splits must be non-empty"

  df["commit_date"] = pd.to_datetime(df["commit_date"])


In [8]:
# Sanity check: (rows, columns) of the loaded dataset.
df.shape

(25038, 787)

In [None]:
# Per-column count of missing values, restricted to columns that actually have
# some. The frame has hundreds of columns, so displaying the full Series gets
# truncated and could hide the very columns this check is meant to reveal.
missing_per_column = df.isna().sum()
missing_per_column[missing_per_column > 0]

In [None]:
# Per-column count of missing values (isnull is the pandas alias of isna).
df.isnull().sum()

In [7]:
# Drop rows lacking any of the commit metadata fields, then re-check the shape.
# NOTE(review): this only reassigns `df`; in top-to-bottom order X, y and the
# train/test splits were already derived from the earlier frame, so they are
# not re-filtered by this cell.
df = df.dropna(subset=["diff", "commit_hash", "author_email", "commit_date"])
df.shape

(25038, 787)

In [5]:
# Re-count missing values per column after the dropna step
# (the display is abbreviated for frames this wide).
df.isnull().sum()

commit_hash      0
author_email     0
commit_date      0
lines_added      0
lines_deleted    0
                ..
emb_763          0
emb_764          0
emb_765          0
emb_766          0
emb_767          0
Length: 787, dtype: int64

## Helper Functions


In [6]:
import xgboost as xgb


def train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier and score it on the held-out split.

    Args:
        params: Mapping of XGBClassifier keyword arguments for this run; it is
            unpacked directly into the constructor.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Evaluation feature matrix.
        y_test: Evaluation labels.

    Returns:
        tuple: ``(model, metrics)`` where ``model`` is the fitted classifier and
        ``metrics`` is a dict with the keys ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``roc_auc`` and ``training_duration`` (seconds
        spent inside ``fit``).
    """
    # Fixed seed for reproducibility; n_jobs=-1 asks for all available cores.
    xgb_model = xgb.XGBClassifier(
        random_state=42, eval_metric="logloss", n_jobs=-1, **params
    )

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during fits that take minutes.
    start_time = time.perf_counter()
    xgb_model.fit(X_train, y_train)
    training_duration = time.perf_counter() - start_time

    y_pred = xgb_model.predict(X_test)
    # Second predict_proba column is used as the ROC-AUC score
    # (assumed to be the positive class of a binary target).
    y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

    # zero_division=0 reports 0 instead of warning when a run predicts no
    # positives at all.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
        "training_duration": training_duration,
    }
    return xgb_model, metrics


def plot_feature_importance(model, feature_names):
    """Build a horizontal bar chart of a fitted model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Feature labels, in the same order as the importances.

    Returns:
        The matplotlib Figure containing the chart. The caller owns it and is
        responsible for closing it.
    """
    # Ascending order places the most important feature at the top of the
    # horizontal bar chart.
    ranked = pd.DataFrame(
        {"feature": feature_names, "importance": model.feature_importances_}
    ).sort_values(by="importance", ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(ranked["feature"], ranked["importance"], color="skyblue")
    ax.set_xlabel("Importance")
    ax.set_title("Feature Importance", fontsize=16)
    plt.tight_layout()
    return fig


def log_to_mlflow(run_name, params, metrics, model, feature_names):
    """Log one experiment run (params, metrics, model, importances) to MLflow.

    Args:
        run_name: Display name for the MLflow run.
        params: Mapping of hyperparameter name to value.
        metrics: Mapping of metric name to numeric value.
        model: Fitted XGBoost model; logged under the artifact path ``model``.
        feature_names: Feature labels aligned with ``model.feature_importances_``.

    Returns:
        str: ID of the MLflow run that was created, so callers can reference
        the run afterwards (e.g. ``runs:/<run_id>/model``).
    """
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Tags make the runs easy to filter in the MLflow UI.
        mlflow.set_tags({"model_type": "XGBoost", "data_type": "embeddings"})

        mlflow.xgboost.log_model(model, "model")

        # Feature-importance plot; always close the figure, even if logging
        # fails, so figures do not accumulate across runs.
        fig = plot_feature_importance(model, feature_names)
        try:
            mlflow.log_figure(fig, "feature_importance.png")
        finally:
            plt.close(fig)

        # Feature-importance table, most important feature first.
        feature_importance_df = pd.DataFrame(
            {"feature": feature_names, "importance": model.feature_importances_}
        ).sort_values(by="importance", ascending=False)

        # Stage the CSV in a throwaway directory: nothing in the working
        # directory can be overwritten, and the file is removed even when
        # log_artifact raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            importance_path = os.path.join(tmp_dir, "feature_importance.csv")
            feature_importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        return run.info.run_id


print("Helper functions defined successfully.")

Helper functions defined successfully.


## Model training


In [7]:
# MLflow setup: runs are stored in a local ./mlruns directory.
mlflow.set_tracking_uri("file:./mlruns")
experiment_name = "XGBoost with embeddings"

# set_experiment activates the named experiment and creates it first when it
# does not exist yet, so no create/except fallback is needed (the previous
# broad except also hid unrelated MLflow errors).
mlflow.set_experiment(experiment_name)
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

print(f"MLflow experiment set: {experiment_name}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

MLflow experiment set: XGBoost with embeddings
Tracking URI: file:./mlruns


In [None]:
print("\n--- Starting XGBoost Hyperparameter Search ---")

results_list_xgb = []
best_f1_score = 0
best_model = None
best_params = None
best_run_id = None

# Client used to re-tag an earlier run once a later run beats it.
mlflow_client = mlflow.tracking.MlflowClient()
feature_names = X_train.columns

# NOTE(review): every configuration is scored on X_test / y_test and the winner
# is picked by that same F1, so the reported best score is optimistically
# biased. Consider selecting on a validation slice of the training period.
for i, params in enumerate(tqdm(grid, desc="Training XGBoost Models")):
    run_name = f"xgb_run_{i:03d}"

    # 1. Train and evaluate the XGBoost model
    model, metrics = train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test)

    # 2. Log everything to MLflow for this run
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Tags make the runs easy to filter in the MLflow UI.
        mlflow.set_tags({"model_type": "XGBoost", "data_type": "embeddings"})

        mlflow.xgboost.log_model(model, "model")

        # Feature-importance plot; close the figure even if logging fails so
        # figures do not pile up over the whole grid.
        fig = plot_feature_importance(model, feature_names)
        try:
            mlflow.log_figure(fig, "feature_importance.png")
        finally:
            plt.close(fig)

        # Feature-importance table, most important feature first.
        feature_importance_df = pd.DataFrame(
            {"feature": feature_names, "importance": model.feature_importances_}
        ).sort_values(by="importance", ascending=False)

        # Stage the CSV in a throwaway directory so nothing in the working
        # directory is overwritten and the file is cleaned up on failure too.
        with tempfile.TemporaryDirectory() as tmp_dir:
            importance_path = os.path.join(tmp_dir, "feature_importance.csv")
            feature_importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        # Track the best model by F1, keeping exactly one run tagged as best.
        is_new_best = metrics["f1"] > best_f1_score
        mlflow.set_tag("best_model", str(is_new_best))
        if is_new_best:
            # Demote the previous incumbent; otherwise every run that was ever
            # the best would keep best_model=True.
            if best_run_id is not None:
                mlflow_client.set_tag(best_run_id, "best_model", "False")
            best_f1_score = metrics["f1"]
            best_model = model
            best_params = params
            best_run_id = run.info.run_id
            print(f"New best model found! F1 Score: {best_f1_score:.4f}")

    # 3. Store results for the final summary table
    results_list_xgb.append({"run_name": run_name, **params, **metrics})

print("\n--- XGBoost Hyperparameter Search Complete ---")
print(f"Best F1 Score: {best_f1_score:.4f}")
print(f"Best Parameters: {best_params}")
print(f"Best Model Run ID: {best_run_id}")
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment name: {experiment_name}")
print("To view results, run: mlflow ui")


--- Starting XGBoost Hyperparameter Search ---


Training XGBoost Models:   0%|          | 0/48 [00:00<?, ?it/s]

Training XGBoost Models:   2%|▏         | 1/48 [00:49<38:31, 49.19s/it]

New best model found! F1 Score: 0.3302


Training XGBoost Models:   4%|▍         | 2/48 [01:30<34:14, 44.67s/it]

New best model found! F1 Score: 0.3391


Training XGBoost Models:   6%|▋         | 3/48 [02:18<34:37, 46.17s/it]

New best model found! F1 Score: 0.3496


Training XGBoost Models:  54%|█████▍    | 26/48 [1:55:24<2:13:15, 363.45s/it]

New best model found! F1 Score: 0.3529




In [1]:
# --- Display the final results table ---
print("\n📈 XGBoost Results Summary Table:\n")
results_df_xgb = pd.DataFrame(results_list_xgb)
# An interrupted search can leave the list empty; sorting by a missing "f1"
# column would then raise KeyError.
if not results_df_xgb.empty:
    results_df_xgb = results_df_xgb.sort_values(by="f1", ascending=False)

# Create results directory
results_dir = os.path.join("logs", "xgboost_results")
os.makedirs(results_dir, exist_ok=True)

# Save results summary
results_path = os.path.join(results_dir, "results_summary.csv")
results_df_xgb.to_csv(results_path, index=False)
print(f"Results saved to: {results_path}")

if best_model is not None:
    # Save the best model locally
    best_model_path = os.path.join(results_dir, "best_xgboost_model.pkl")
    joblib.dump(best_model, best_model_path)
    print(f"Best model saved to: {best_model_path}")

    # Register the best model in the MLflow Model Registry. This is
    # best-effort: a failure is reported but does not stop the notebook.
    model_name = "XGBoost_Bug_Prediction_Embeddings"
    try:
        model_uri = f"runs:/{best_run_id}/model"
        mlflow.register_model(model_uri, model_name)
        print(f"Best model registered in MLflow Model Registry as: {model_name}")
    except Exception as e:
        print(f"Warning: Could not register model in MLflow Model Registry: {e}")

    # Save best model info; the score is coerced to a plain float so it is
    # JSON-serialisable regardless of the numeric type the metric returned.
    best_model_info = {
        "best_f1_score": float(best_f1_score),
        "best_params": best_params,
        "best_run_id": best_run_id,
        "model_path": best_model_path,
        "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    best_info_path = os.path.join(results_dir, "best_model_info.json")
    with open(best_info_path, "w", encoding="utf-8") as f:
        json.dump(best_model_info, f, indent=2)
    print(f"Best model info saved to: {best_info_path}")

    print("\n🎯 Best Model Summary:")
    print(f"F1 Score: {best_f1_score:.4f}")
    print(f"Parameters: {best_params}")
    print(f"Run ID: {best_run_id}")
else:
    # Nothing beat the initial threshold (or the search never ran), so there
    # is no model to save, register or summarise.
    print("No best model available. Please run the training cells first.")

results_df_xgb


📈 XGBoost Results Summary Table:



NameError: name 'pd' is not defined

## Best Model Usage Example

The following cell demonstrates how to load and use the best model for predictions.


In [None]:
# Example: Load and use the best model for predictions
if best_model is not None:
    # Option 1: Use the model that's already in memory
    sample_predictions = best_model.predict(X_test[:5])
    sample_probabilities = best_model.predict_proba(X_test[:5])

    print("Sample predictions from best model:")
    print(f"Predictions: {sample_predictions}")
    print(f"Probabilities: {sample_probabilities}")

    # Option 2: Load the saved model from file
    import joblib

    loaded_model = joblib.load(os.path.join(results_dir, "best_xgboost_model.pkl"))
    loaded_predictions = loaded_model.predict(X_test[:5])
    print(f"\nVerification - Loaded model predictions: {loaded_predictions}")
    print(f"Predictions match: {all(sample_predictions == loaded_predictions)}")

    # Option 3: Load model from MLflow
    try:
        model_uri = f"runs:/{best_run_id}/model"
        mlflow_model = mlflow.xgboost.load_model(model_uri)
        mlflow_predictions = mlflow_model.predict(X_test[:5])
        print(f"MLflow model predictions: {mlflow_predictions}")
        print(
            f"MLflow predictions match: {all(sample_predictions == mlflow_predictions)}"
        )
    except Exception as e:
        print(f"Could not load from MLflow: {e}")
else:
    print("No best model available. Please run the training cells first.")