# XGBoost


In [20]:
import warnings
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
import xgboost as xgb
import mlflow
from mlflow.models.signature import infer_signature
from tqdm import tqdm
import joblib
import json
warnings.filterwarnings('ignore')

In [21]:
# --- File Paths ---
SOURCE_DATASET_PATH = "data/final_dataset_with_embeddings.csv"  # CSV that holds features, embeddings and labels
BASE_LOG_DIR = "logs"  # Parent directory; each experiment writes into its own sub-folder

# --- Feature Configuration ---
METADATA_COLS = ["commit_hash", "author_email", "commit_date"]  # Never used as model features
LABEL_COL = "is_bug_introducing"  # Target column
N_PCA_COMPONENTS = 177  # Number of principal components kept for the embedding columns

# --- Model Configuration ---
# Hyperparameter search space: every combination is trained once per experiment.
PARAM_GRID = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20],
    learning_rate=[0.05, 0.1],
    subsample=[0.7, 1.0],
)

## Generic Data Loading and Preparation Functions

In [22]:
def load_base_data(path):
    """Read the source CSV at ``path`` and return it in chronological order.

    Rows lacking a ``commit_hash`` or a label value are discarded,
    ``commit_date`` is parsed into datetimes, and the frame is sorted by that
    column and given a fresh 0..n-1 index.
    """
    print(f"Loading and preparing base data from '{path}'...")
    commits = (
        pd.read_csv(path)
        .dropna(subset=['commit_hash', LABEL_COL])
        .assign(commit_date=lambda frame: pd.to_datetime(frame["commit_date"]))
        .sort_values(by="commit_date")
        .reset_index(drop=True)
    )
    print("Base data loaded successfully.")
    return commits

def prepare_feature_sets(df):
    """Creates all the different feature combinations for our experiments.

    Columns whose name starts with ``emb_`` are treated as embedding columns.
    Every other column that is neither listed in ``METADATA_COLS`` nor equal
    to ``LABEL_COL`` is treated as a statistical feature. The embeddings are
    L2-normalised row by row and then reduced with PCA
    (``N_PCA_COMPONENTS`` components, ``random_state=42``).

    NOTE(review): the PCA is fitted on every row of ``df``, including rows
    that ``run_experiment`` later holds out as the test split, so the
    PCA-based feature sets have seen test data. Confirm whether that is
    acceptable for the reported scores.

    Args:
        df: DataFrame containing the embedding, statistical, metadata and
            label columns.

    Returns:
        tuple: ``(feature_sets, labels)`` where ``feature_sets`` is a dict
        with the keys ``"stats_only"``, ``"embeddings_only"``, ``"pca_only"``,
        ``"stats_and_embeddings"`` and ``"stats_and_pca"`` mapping to
        DataFrames that share ``df``'s index, and ``labels`` is
        ``df[LABEL_COL]``.
    """
    print("Preparing all feature sets...")

    embedding_cols = [col for col in df.columns if col.startswith('emb_')]
    # Build the exclusion set once instead of re-concatenating lists per column.
    excluded_cols = set(embedding_cols) | set(METADATA_COLS) | {LABEL_COL}
    stats_cols = [col for col in df.columns if col not in excluded_cols]

    # Normalize and apply PCA to embeddings
    X_embed = df[embedding_cols].values
    X_normalized = Normalizer(norm='l2').fit_transform(X_embed)
    pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
    X_pca = pca.fit_transform(X_normalized)

    # Name the columns after the actual output width and reuse df's index:
    # with a default RangeIndex the column-wise concat below would align on
    # labels and silently introduce NaN rows whenever df is not indexed 0..n-1.
    pca_cols = [f'pca_{i+1}' for i in range(X_pca.shape[1])]
    df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df.index)

    feature_sets = {
        "stats_only": df[stats_cols],
        "embeddings_only": df[embedding_cols],
        "pca_only": df_pca,
        "stats_and_embeddings": pd.concat([df[stats_cols], df[embedding_cols]], axis=1),
        "stats_and_pca": pd.concat([df[stats_cols], df_pca], axis=1)
    }

    print("All feature sets are ready.")
    return feature_sets, df[LABEL_COL]

## Generic Modeling and Logging Functions

In [23]:
def train_and_evaluate(params, X_train, y_train, X_test, y_test):
    """Trains an XGBoost model and returns the model and its performance metrics.

    Args:
        params: Keyword arguments forwarded to ``xgb.XGBClassifier`` on top of
            the fixed ``random_state=42``, ``eval_metric="logloss"`` and
            ``n_jobs=-1``.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Features the fitted model is scored on.
        y_test: True labels for ``X_test``.

    Returns:
        tuple: ``(model, metrics)`` where ``model`` is the fitted classifier
        and ``metrics`` is a dict with the keys ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``roc_auc`` and ``training_duration`` (seconds
        spent inside ``fit``).
    """
    model = xgb.XGBClassifier(random_state=42, eval_metric="logloss", n_jobs=-1, **params)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_duration = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    # Second predict_proba column: probability of the second class
    # (presumably the positive label of a 0/1 target — confirm).
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 yields 0 instead of a warning when a denominator is empty.
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_pred_proba),
        "training_duration": training_duration,
    }
    return model, metrics

def run_experiment(experiment_name, X_data, y_data, param_grid):
    """
    Runs a full hyperparameter search and logs metrics for all runs,
    but only logs the model artifact for the single best run.

    The data is split by position: the first 80% of the rows are used for
    training and the remaining 20% for evaluation. Every combination in
    ``param_grid`` is trained once inside its own MLflow run; the run with the
    highest F1 score is tagged ``best_model`` and receives the model artifact.

    NOTE(review): the best configuration is chosen using metrics computed on
    the same held-out rows that are reported, so the "best" score is
    optimistically biased. Consider a separate validation split.

    Args:
        experiment_name: MLflow experiment name; also the name of the
            sub-folder created under ``BASE_LOG_DIR``.
        X_data: Feature DataFrame (sliced with ``.iloc``).
        y_data: Label Series in the same row order as ``X_data``.
        param_grid: Grid specification passed to
            ``sklearn.model_selection.ParameterGrid``.

    Returns:
        None. Side effects: MLflow runs under ``file:./mlruns``, plus
        ``results_summary.csv`` and ``best_model.pkl`` in the experiment's
        results directory.
    """
    print(f"\n--- Running Experiment: {experiment_name} ---")

    # 1. Create dedicated directories for this experiment
    results_dir = os.path.join(BASE_LOG_DIR, experiment_name)
    os.makedirs(results_dir, exist_ok=True)

    # 2. Split data chronologically (rows are assumed to be ordered by date
    #    already; this function only slices by position)
    split_point = int(len(X_data) * 0.80)
    X_train, X_test = X_data.iloc[:split_point], X_data.iloc[split_point:]
    y_train, y_test = y_data.iloc[:split_point], y_data.iloc[split_point:]

    # 3. Set up MLflow experiment
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment(experiment_name)

    # 4. Run hyperparameter search
    best_f1_score = -1
    best_model, best_params, best_run_id = None, None, None
    results_list = []

    grid = list(ParameterGrid(param_grid))
    for i, params in enumerate(tqdm(grid, desc=f"Training models for {experiment_name}")):
        run_name = f"run_{i:03d}"
        with mlflow.start_run(run_name=run_name) as run:
            model, metrics = train_and_evaluate(params, X_train, y_train, X_test, y_test)

            # Log only params and metrics for every run
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
            mlflow.set_tag("model_type", "XGBoost")

            # Track the best model in memory. The "best_model" tag is NOT set
            # here: doing so would leave it on every run that was only
            # temporarily the best.
            if metrics["f1"] > best_f1_score:
                best_f1_score = metrics["f1"]
                best_model, best_params, best_run_id = model, params, run.info.run_id

            results_list.append({"run_name": run_name, **params, **metrics})

    # --- After the search, tag the winner and log its model artifact ---
    if best_run_id:
        print(f"Logging the best model artifact to run ID: {best_run_id}")
        # Re-open the finished run so the tag and artifact attach to it.
        with mlflow.start_run(run_id=best_run_id):
            mlflow.set_tag("best_model", "True")
            signature = infer_signature(X_train.head(), best_model.predict(X_train.head()))
            mlflow.xgboost.log_model(
                xgb_model=best_model,
                artifact_path="model",
                signature=signature,
                input_example=X_train.head()
            )
        print("Best model artifact logged successfully.")

    # 5. Save local artifacts and summary
    results_df = pd.DataFrame(results_list)
    if not results_df.empty:
        # An empty grid produces a frame without an "f1" column to sort on.
        results_df = results_df.sort_values(by="f1", ascending=False)
    results_df.to_csv(os.path.join(results_dir, "results_summary.csv"), index=False)
    if best_model is not None:
        joblib.dump(best_model, os.path.join(results_dir, "best_model.pkl"))

    print(f"--- Experiment '{experiment_name}' Complete ---")
    print(f"Best F1 Score: {best_f1_score:.4f}")
    print(f"Best Parameters: {best_params}")
    print(f"Results and artifacts saved to: '{results_dir}'")
    

In [24]:
# --- Build the base frame and every feature set a single time ---
base_df = load_base_data(SOURCE_DATASET_PATH)
all_feature_sets, y_data = prepare_feature_sets(base_df)

# --- Experiment 1: Statistical Features Only ---
# Grid search over PARAM_GRID using only the non-embedding feature columns.
run_experiment(
    "XGBoost_Stats_Only",
    all_feature_sets["stats_only"],
    y_data,
    param_grid=PARAM_GRID,
)

Loading and preparing base data from 'data/final_dataset_with_embeddings.csv'...
Base data loaded successfully.
Preparing all feature sets...


2025/09/22 14:00:25 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Stats_Only' does not exist. Creating a new experiment.


All feature sets are ready.

--- Running Experiment: XGBoost_Stats_Only ---


Training models for XGBoost_Stats_Only: 100%|██████████| 24/24 [01:52<00:00,  4.67s/it]


Logging the best model artifact to run ID: 897591c954f34a8ea0ee1cf3d76d8ac7


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best model artifact logged successfully.
--- Experiment 'XGBoost_Stats_Only' Complete ---
Best F1 Score: 0.2961
Best Parameters: {'learning_rate': 0.05, 'max_depth': 20, 'n_estimators': 200, 'subsample': 1.0}
Results and artifacts saved to: 'logs\XGBoost_Stats_Only'

--- Running Experiment: XGBoost_Stats_Only ---


In [25]:
# --- Experiment 2: Embeddings Only ---
# Grid search over PARAM_GRID using the raw `emb_*` columns.
run_experiment(
    "XGBoost_Embeddings_Only",
    all_feature_sets["embeddings_only"],
    y_data,
    param_grid=PARAM_GRID,
)

2025/09/22 14:02:25 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Embeddings_Only' does not exist. Creating a new experiment.



--- Running Experiment: XGBoost_Embeddings_Only ---


Training models for XGBoost_Embeddings_Only: 100%|██████████| 24/24 [1:09:18<00:00, 173.28s/it]


Logging the best model artifact to run ID: dac4445ce7f544d59be546f47e8c338a




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best model artifact logged successfully.
--- Experiment 'XGBoost_Embeddings_Only' Complete ---
Best F1 Score: 0.2850
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.7}
Results and artifacts saved to: 'logs\XGBoost_Embeddings_Only'

--- Running Experiment: XGBoost_Embeddings_Only ---


In [26]:
# --- Experiment 3: PCA-Reduced Embeddings Only ---
# Grid search over PARAM_GRID using only the PCA components of the embeddings.
run_experiment(
    "XGBoost_PCA_Only",
    all_feature_sets["pca_only"],
    y_data,
    param_grid=PARAM_GRID,
)

2025/09/22 15:12:01 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_PCA_Only' does not exist. Creating a new experiment.



--- Running Experiment: XGBoost_PCA_Only ---


Training models for XGBoost_PCA_Only: 100%|██████████| 24/24 [18:33<00:00, 46.38s/it]


Logging the best model artifact to run ID: 562ae067496141fb8a4f4feb8782a5dd




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best model artifact logged successfully.
--- Experiment 'XGBoost_PCA_Only' Complete ---
Best F1 Score: 0.2790
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.7}
Results and artifacts saved to: 'logs\XGBoost_PCA_Only'

--- Running Experiment: XGBoost_PCA_Only ---


In [27]:
# --- Experiment 4: Stats + Full Embeddings ---
# Grid search over PARAM_GRID using the statistical columns plus the raw `emb_*` columns.
run_experiment(
    "XGBoost_Stats_and_Embeddings",
    all_feature_sets["stats_and_embeddings"],
    y_data,
    param_grid=PARAM_GRID,
)

2025/09/22 15:30:46 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Stats_and_Embeddings' does not exist. Creating a new experiment.



--- Running Experiment: XGBoost_Stats_and_Embeddings ---


Training models for XGBoost_Stats_and_Embeddings: 100%|██████████| 24/24 [59:44<00:00, 149.36s/it]


Logging the best model artifact to run ID: c5598c13cc0c42d091b774d8c4108bf6




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best model artifact logged successfully.
--- Experiment 'XGBoost_Stats_and_Embeddings' Complete ---
Best F1 Score: 0.3169
Best Parameters: {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 300, 'subsample': 0.7}
Results and artifacts saved to: 'logs\XGBoost_Stats_and_Embeddings'

--- Running Experiment: XGBoost_Stats_and_Embeddings ---


In [28]:
# --- Experiment 5: Stats + PCA-Reduced Embeddings ---
# Grid search over PARAM_GRID using the statistical columns plus the PCA components.
run_experiment(
    "XGBoost_Stats_and_PCA",
    all_feature_sets["stats_and_pca"],
    y_data,
    param_grid=PARAM_GRID,
)



--- Running Experiment: XGBoost_Stats_and_PCA ---


2025/09/22 16:30:45 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Stats_and_PCA' does not exist. Creating a new experiment.


Training models for XGBoost_Stats_and_PCA: 100%|██████████| 24/24 [15:32<00:00, 38.87s/it]


Logging the best model artifact to run ID: 94688ec27f9f4828a4baed6da22fbfb4


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best model artifact logged successfully.
--- Experiment 'XGBoost_Stats_and_PCA' Complete ---
Best F1 Score: 0.3040
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'subsample': 0.7}
Results and artifacts saved to: 'logs\XGBoost_Stats_and_PCA'

--- Running Experiment: XGBoost_Stats_and_PCA ---
