In [1]:
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed (presumably to keep library notices out of cell output).
# NOTE(review): this also hides warnings that may point at real problems —
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# XGBoost


In [2]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)
import mlflow
import mlflow.xgboost
import mlflow.sklearn
from tqdm import tqdm

print("Libraries imported successfully.")

Libraries imported successfully.


## Configuration


In [3]:
# Search space for the grid search: every key is forwarded to the classifier
# as a keyword argument and every list holds the candidate values for it.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20],
    learning_rate=[0.05, 0.1],
    subsample=[0.7, 1.0],
)

# Expand the search space into one parameter dict per combination.
grid = list(ParameterGrid(param_grid))

print(f"Created a grid with {len(grid)} hyperparameter combinations to test.")

Created a grid with 48 hyperparameter combinations to test.


## Load and Split Data


In [4]:
FINAL_DATASET_PATH = "data/final_dataset_with_embeddings.csv"

# Load the dataset, keep only rows that have a commit hash, parse the commit
# timestamps and order the rows chronologically (oldest first).
df = pd.read_csv(FINAL_DATASET_PATH)
df = df.loc[df["commit_hash"].notna()]
df = df.assign(commit_date=pd.to_datetime(df["commit_date"])).sort_values(
    by="commit_date"
)

# Features are everything except the identifier/date columns and the label.
X = df.drop(
    columns=["commit_hash", "author_email", "commit_date", "is_bug_introducing"]
)
y = df["is_bug_introducing"]

# Chronological hold-out: the first 80% of rows train, the last 20% test.
split_point = int(len(df) * 0.80)
X_train, y_train = X.iloc[:split_point], y.iloc[:split_point]
X_test, y_test = X.iloc[split_point:], y.iloc[split_point:]

In [5]:
# Show the (rows, columns) shape of the loaded, date-sorted frame.
df.shape

(12595, 786)

In [6]:
# Count missing values per column.
df.isna().sum()

commit_hash      0
author_email     0
commit_date      0
lines_added      0
lines_deleted    0
                ..
emb_763          0
emb_764          0
emb_765          0
emb_766          0
emb_767          0
Length: 786, dtype: int64

In [7]:
# Per-column missing-value counts, this time via isnull().
# NOTE(review): the output is identical to the df.isna().sum() cell just
# before it, so this cell looks redundant.
df.isnull().sum()

commit_hash      0
author_email     0
commit_date      0
lines_added      0
lines_deleted    0
                ..
emb_763          0
emb_764          0
emb_765          0
emb_766          0
emb_767          0
Length: 786, dtype: int64

In [8]:
# Drop rows missing any identifier/date column and show the resulting shape.
# NOTE(review): X, y and the train/test split were already derived from df in
# the loading cell, so rebinding df here does not change them. The shape shown
# is also the same as before, i.e. no rows were actually removed.
df = df.dropna(subset=["commit_hash", "author_email", "commit_date"])
df.shape

(12595, 786)

In [9]:
# Re-check per-column missing-value counts after the dropna cell.
df.isnull().sum()

commit_hash      0
author_email     0
commit_date      0
lines_added      0
lines_deleted    0
                ..
emb_763          0
emb_764          0
emb_765          0
emb_766          0
emb_767          0
Length: 786, dtype: int64

## Helper functions


In [10]:
import xgboost as xgb
from mlflow.models.signature import infer_signature


def train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier and score it on the evaluation split.

    Args:
        params: Keyword arguments for ``xgb.XGBClassifier``. Entries given
            here take precedence over the defaults applied below
            (``random_state=42``, ``eval_metric="logloss"``, ``n_jobs=-1``).
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Features the fitted model predicts on.
        y_test: True labels the predictions are scored against.

    Returns:
        A ``(model, metrics)`` tuple. ``metrics`` is a dict with the keys
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
        ``training_duration`` (seconds spent inside ``fit``).
    """
    # Merge the defaults with the caller's parameters instead of passing both
    # as keywords: a grid that tunes e.g. ``random_state`` would otherwise
    # fail with "got multiple values for keyword argument".
    model_kwargs = {
        "random_state": 42,
        "eval_metric": "logloss",
        "n_jobs": -1,
        **params,
    }
    xgb_model = xgb.XGBClassifier(**model_kwargs)

    # perf_counter is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments during a long fit.
    start_time = time.perf_counter()
    xgb_model.fit(X_train, y_train)
    training_duration = time.perf_counter() - start_time

    y_pred = xgb_model.predict(X_test)
    # Second predict_proba column is used as the score for ROC AUC.
    y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 instead of warning when a denominator is 0.
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
        "training_duration": training_duration,
    }
    return xgb_model, metrics


def plot_feature_importance(model, feature_names, top_n=None):
    """Build a horizontal bar chart of a fitted model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Feature names, aligned with ``feature_importances_``.
        top_n: If given, draw only the ``top_n`` most important features,
            which keeps the chart readable for wide feature sets. ``None``
            (the default) draws every feature.

    Returns:
        The matplotlib figure. It is not shown or closed here; the caller
        owns it.

    Raises:
        ValueError: If ``top_n`` is given but is smaller than 1.
    """
    if top_n is not None and top_n < 1:
        raise ValueError("top_n must be a positive integer or None")

    # Ascending order puts the most important feature at the top of a barh
    # chart, because bars are drawn bottom-up.
    importance_df = pd.DataFrame(
        {"feature": feature_names, "importance": model.feature_importances_}
    ).sort_values(by="importance", ascending=True)
    if top_n is not None:
        importance_df = importance_df.tail(top_n)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(importance_df["feature"], importance_df["importance"], color="skyblue")
    ax.set_title("Feature Importance", fontsize=16)
    ax.set_xlabel("Importance")
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    return fig


def log_to_mlflow(run_name, params, metrics, model, X_train_sample):
    """Log one experiment run (params, metrics, model, importances) to MLflow.

    Args:
        run_name: Name given to the MLflow run.
        params: Hyperparameters, logged with ``mlflow.log_params``.
        metrics: Metric name -> value mapping, logged with
            ``mlflow.log_metrics``.
        model: Fitted XGBoost model; must expose ``predict`` and
            ``feature_importances_``.
        X_train_sample: DataFrame of example training rows. It is used to
            infer the model signature, as the logged input example, and as
            the source of the feature names (its ``columns``).

    Returns:
        The ID of the MLflow run that was created.
    """
    import tempfile  # local import: only needed for the temporary CSV below

    # Take the feature names from the argument rather than from a notebook
    # global, so the function works with whatever frame the caller passes.
    feature_names = X_train_sample.columns

    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # The signature records the input/output schema alongside the model.
        signature = infer_signature(X_train_sample, model.predict(X_train_sample))

        mlflow.xgboost.log_model(
            xgb_model=model,
            artifact_path="model",
            signature=signature,
            input_example=X_train_sample,
            model_format="json",  # Specify the model format to avoid warnings
        )

        # Log the importance chart; always close the figure so a failed
        # upload does not leave it open (and it is never displayed inline).
        fig = plot_feature_importance(model, feature_names)
        try:
            mlflow.log_figure(fig, "feature_importance.png")
        finally:
            plt.close(fig)

        feature_importance_df = pd.DataFrame(
            {"feature": feature_names, "importance": model.feature_importances_}
        ).sort_values(by="importance", ascending=False)

        # Stage the CSV in a temporary directory: the artifact keeps the name
        # "feature_importance.csv", nothing is written to the working
        # directory, and the file is removed even if logging raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            importance_path = os.path.join(tmp_dir, "feature_importance.csv")
            feature_importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        # Tags make the runs easy to filter in the MLflow UI.
        mlflow.set_tag("model_type", "XGBoost")
        mlflow.set_tag("data_type", "embeddings")

        return run.info.run_id


print("Helper functions defined successfully.")


Helper functions defined successfully.


## Model training


In [11]:
# MLflow setup: runs are stored in a local ./mlruns directory.
mlflow.set_tracking_uri("file:./mlruns")
experiment_name = "XGBoost with embeddings 2"

# set_experiment activates the named experiment and creates it first when it
# does not exist yet, so no create/except/lookup dance is needed. Any genuine
# MLflow error (e.g. the experiment was deleted) now surfaces with its own
# message instead of being swallowed and turning into an AttributeError.
experiment_id = mlflow.set_experiment(experiment_name).experiment_id
print(f"MLflow experiment set: {experiment_name}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

MLflow experiment set: XGBoost with embeddings 2
Tracking URI: file:./mlruns


In [12]:
print("\n--- Starting XGBoost Hyperparameter Search ---")

# NOTE(review): the best configuration is selected by F1 on the same
# X_test/y_test split that is reported as the final score, so that score is
# optimistic. Consider carving a validation split out of the training period.

results_list_xgb = []
best_f1_score = 0
best_model = None
best_params = None
best_run_id = None

# Tags are edited through the client, which does not re-open finished runs.
mlflow_client = mlflow.tracking.MlflowClient()

for i, params in enumerate(tqdm(grid, desc="Training XGBoost Models")):
    run_name = f"xgb_run_{i:03d}"

    # 1. Train and evaluate the XGBoost model
    model, metrics = train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test)

    # 2. Log everything to MLflow for this run
    current_run_id = log_to_mlflow(
        run_name=run_name,
        params=params,
        metrics=metrics,
        model=model,
        X_train_sample=X_train.head(),
    )

    # 3. Track the best model based on F1 score
    if metrics["f1"] > best_f1_score:
        # Demote the previous leader first, so that at any moment exactly one
        # run carries best_model="True" (previously every interim best kept
        # the tag, leaving several "best" runs in the experiment).
        if best_run_id is not None:
            mlflow_client.set_tag(best_run_id, "best_model", "False")
        mlflow_client.set_tag(current_run_id, "best_model", "True")

        best_f1_score = metrics["f1"]
        best_model = model
        best_params = params
        best_run_id = current_run_id  # Use the run ID from the logging function
        print(f"New best model found! F1 Score: {best_f1_score:.4f}")

    # 4. Store results for the final summary table
    run_results = {"run_name": run_name, **params, **metrics}
    results_list_xgb.append(run_results)

print("\n--- XGBoost Hyperparameter Search Complete ---")
print(f"Best F1 Score: {best_f1_score:.4f}")
print(f"Best Parameters: {best_params}")
print(f"Best Model Run ID: {best_run_id}")
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment name: {experiment_name}")
print("To view results, run: mlflow ui")



--- Starting XGBoost Hyperparameter Search ---




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/09/18 14:22:46 INFO mlflow.models.model: Found the following environment variables used during model inference: [OPENAI_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Training XGBoost Models:   2%|▏         | 1/48 [00:35<27:30, 35.12s/it]

New best model found! F1 Score: 0.2948




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models:   6%|▋         | 3/48 [01:32<22:45, 30.35s/it]

New best model found! F1 Score: 0.3092




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models:  10%|█         | 5/48 [02:50<26:21, 36.79s/it]

New best model found! F1 Score: 0.3109




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models:  15%|█▍        | 7/48 [05:03<37:31, 54.91s/it]

New best model found! F1 Score: 0.3130




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models:  31%|███▏      | 15/48 [18:39<1:11:00, 129.12s/it]

New best model found! F1 Score: 0.3146




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models:  98%|█████████▊| 47/48 [1:26:59<02:46, 166.88s/it]

New best model found! F1 Score: 0.3169




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Training XGBoost Models: 100%|██████████| 48/48 [1:30:35<00:00, 113.24s/it]


--- XGBoost Hyperparameter Search Complete ---
Best F1 Score: 0.3169
Best Parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 300, 'subsample': 0.7}
Best Model Run ID: 839ba65290a346f692d8ebcf2fc02ec6
MLflow tracking URI: file:./mlruns
Experiment name: XGBoost with embeddings 2
To view results, run: mlflow ui





In [13]:
# --- Display the final results table ---
# Build one row per run from the search results, best F1 first.
print("\n📈 XGBoost Results Summary Table:\n")
results_df_xgb = pd.DataFrame(results_list_xgb).sort_values(by="f1", ascending=False)

# Create results directory (no error if it already exists)
results_dir = os.path.join("logs", "xgboost_results")
os.makedirs(results_dir, exist_ok=True)

# Save results summary
results_path = os.path.join(results_dir, "results_summary.csv")
results_df_xgb.to_csv(results_path, index=False)
print(f"Results saved to: {results_path}")

# Save the best model locally
# best_model stays None when no run achieved an F1 above the initial 0.
if best_model is not None:
    import joblib

    # Pickle the in-memory estimator; a later cell reloads it from this path.
    best_model_path = os.path.join(results_dir, "best_xgboost_model.pkl")
    joblib.dump(best_model, best_model_path)
    print(f"Best model saved to: {best_model_path}")

    # Register the best model in MLflow Model Registry
    model_name = "XGBoost_Bug_Prediction_Embeddings"
    try:
        # Register the model from the best run
        # ("model" is the artifact path the model was logged under).
        model_uri = f"runs:/{best_run_id}/model"
        mlflow.register_model(model_uri, model_name)
        print(f"Best model registered in MLflow Model Registry as: {model_name}")
    except Exception as e:
        # Best effort: a registry failure is reported but does not stop the cell.
        print(f"Warning: Could not register model in MLflow Model Registry: {e}")

    # Save best model info
    best_model_info = {
        "best_f1_score": best_f1_score,
        "best_params": best_params,
        "best_run_id": best_run_id,
        "model_path": best_model_path,
        "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    import json

    # Write the summary of the winning run next to the pickled model.
    best_info_path = os.path.join(results_dir, "best_model_info.json")
    with open(best_info_path, "w") as f:
        json.dump(best_model_info, f, indent=2)
    print(f"Best model info saved to: {best_info_path}")

# Printed unconditionally, so these show the initial values (0 / None) when
# no best model was found.
print(f"\n🎯 Best Model Summary:")
print(f"F1 Score: {best_f1_score:.4f}")
print(f"Parameters: {best_params}")
print(f"Run ID: {best_run_id}")

# Last expression: renders the full results table as the cell output.
# NOTE(review): this displays every run; results_df_xgb.head(10) would keep
# the output short.
results_df_xgb


📈 XGBoost Results Summary Table:

Results saved to: logs\xgboost_results\results_summary.csv
Best model saved to: logs\xgboost_results\best_xgboost_model.pkl


Registered model 'XGBoost_Bug_Prediction_Embeddings' already exists. Creating a new version of this model...
Created version '4' of model 'XGBoost_Bug_Prediction_Embeddings'.


Best model registered in MLflow Model Registry as: XGBoost_Bug_Prediction_Embeddings
Best model info saved to: logs\xgboost_results\best_model_info.json

🎯 Best Model Summary:
F1 Score: 0.3169
Parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 300, 'subsample': 0.7}
Run ID: 839ba65290a346f692d8ebcf2fc02ec6


Unnamed: 0,run_name,learning_rate,max_depth,n_estimators,subsample,accuracy,precision,recall,f1,roc_auc,training_duration
46,xgb_run_046,0.1,20.0,300,0.7,0.500198,0.193122,0.882175,0.316875,0.714174,173.376827
14,xgb_run_014,0.05,10.0,300,0.7,0.491465,0.191157,0.888218,0.314607,0.712441,164.302461
44,xgb_run_044,0.1,20.0,200,0.7,0.498214,0.191264,0.873112,0.313789,0.71336,147.493839
6,xgb_run_006,0.05,,300,0.7,0.512108,0.192044,0.845921,0.313024,0.704672,62.667777
27,xgb_run_027,0.1,,100,1.0,0.511314,0.191358,0.8429,0.311906,0.698265,21.15905
22,xgb_run_022,0.05,20.0,300,0.7,0.498611,0.19016,0.864048,0.311717,0.705392,246.415313
4,xgb_run_004,0.05,,200,0.7,0.516078,0.191238,0.830816,0.31091,0.702655,34.307451
40,xgb_run_040,0.1,20.0,50,0.7,0.490274,0.188766,0.873112,0.310419,0.702668,85.867257
47,xgb_run_047,0.1,20.0,300,1.0,0.510917,0.189952,0.833837,0.309417,0.69541,197.810498
30,xgb_run_030,0.1,,300,0.7,0.503771,0.189317,0.845921,0.309392,0.70146,56.407295


## Best Model Usage Example

The following cell demonstrates how to load and use the best model for predictions.


In [14]:
# Example: obtain predictions from the best model in three equivalent ways
# and confirm that all three agree on the same handful of test rows.
if best_model is None:
    print("No best model available. Please run the training cells first.")
else:
    # The same first five test rows are fed to every variant of the model.
    sample_rows = X_test[:5]

    # Option 1: the estimator that is still in memory
    sample_predictions = best_model.predict(sample_rows)
    sample_probabilities = best_model.predict_proba(sample_rows)

    print("Sample predictions from best model:")
    print(f"Predictions: {sample_predictions}")
    print(f"Probabilities: {sample_probabilities}")

    # Option 2: the pickle written to the results directory
    import joblib

    pickle_path = os.path.join(results_dir, "best_xgboost_model.pkl")
    loaded_model = joblib.load(pickle_path)
    loaded_predictions = loaded_model.predict(sample_rows)
    print(f"\nVerification - Loaded model predictions: {loaded_predictions}")
    print(f"Predictions match: {all(sample_predictions == loaded_predictions)}")

    # Option 3: the model artifact logged to MLflow for the best run
    try:
        model_uri = f"runs:/{best_run_id}/model"
        mlflow_model = mlflow.xgboost.load_model(model_uri)
        mlflow_predictions = mlflow_model.predict(sample_rows)
        print(f"MLflow model predictions: {mlflow_predictions}")
        mlflow_match = all(sample_predictions == mlflow_predictions)
        print(f"MLflow predictions match: {mlflow_match}")
    except Exception as e:
        # Best effort: a failed MLflow load is reported, not raised.
        print(f"Could not load from MLflow: {e}")

Sample predictions from best model:
Predictions: [1 0 1 0 1]
Probabilities: [[0.03010416 0.96989584]
 [0.9971662  0.00283376]
 [0.16670108 0.8332989 ]
 [0.8340855  0.16591449]
 [0.17400992 0.8259901 ]]

Verification - Loaded model predictions: [1 0 1 0 1]
Predictions match: True


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

MLflow model predictions: [1 0 1 0 1]
MLflow predictions match: True
