# XGBoost

In [2]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

print("Libraries imported successfully.")

Libraries imported successfully.


configuration

In [2]:
param_grid = {
    "n_estimators": [50 ,100, 200, 300],
    "max_depth": [None,10, 20],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 1.0]
}

# Create a list of all possible combinations
grid = list(ParameterGrid(param_grid))

print(f"Created a grid with {len(grid)} hyperparameter combinations to test.")

Created a grid with 48 hyperparameter combinations to test.


Load and Split Data

In [3]:
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"
df = pd.read_csv(FINAL_DATASET_PATH)
df["commit_date"] = pd.to_datetime(df["commit_date"])
df.sort_values(by="commit_date", inplace=True)

X = df.drop(
    columns=["commit_hash", "author_email", "commit_date", "is_bug_introducing"]
)
y = df["is_bug_introducing"]

split_point = int(len(df) * 0.80)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

  df["commit_date"] = pd.to_datetime(df["commit_date"])


In [4]:
df.shape

(125825, 18)

## functions

In [4]:
import xgboost as xgb

def train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test):
    """Trains an XGBoost model and returns the model and its performance metrics."""
    # Use 'use_label_encoder=False' and 'eval_metric' to avoid common warnings
    xgb_model = xgb.XGBClassifier(
        random_state=42, 
        eval_metric='logloss',
        n_jobs=-1, 
        **params
    )
    
    start_time = time.time()
    xgb_model.fit(X_train, y_train)
    end_time = time.time()
    
    y_pred = xgb_model.predict(X_test)
    y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
    
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'training_duration': end_time - start_time
    }
    return xgb_model, metrics

def plot_feature_importance(model, feature_names):
    """Creates and returns a matplotlib figure of feature importances."""
    importances = model.feature_importances_
    df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    df = df.sort_values(by='importance', ascending=True)
    
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(df['feature'], df['importance'], color='skyblue')
    ax.set_title('Feature Importance', fontsize=16)
    ax.set_xlabel('Importance')
    plt.tight_layout()
    return fig

def log_to_tensorboard(log_dir, params, metrics, model, feature_names):
    """Logs all experiment data for a single run to TensorBoard."""
    writer = SummaryWriter(log_dir=log_dir)
    
    # Log individual scalar metrics
    for key, value in metrics.items():
        writer.add_scalar(f"Metrics/{key}", value, 0)

    # Log feature importance plot
    fig = plot_feature_importance(model, feature_names)
    writer.add_figure("Charts/Feature_Importance", fig, 0)
    plt.close(fig) # Prevent inline display

    # Sanitize hparams for logging (e.g., convert None to string)
    hparams_for_log = {key: str(value) for key, value in params.items()}
    
    # Log to HParams dashboard
    writer.add_hparams(hparam_dict=hparams_for_log, metric_dict=metrics)
    
    writer.close()

print("Helper functions defined successfully.")

Helper functions defined successfully.


## Model training

In [5]:
print("\n--- Starting XGBoost Hyperparameter Search ---")

# We create a new base directory to keep RF and XGBoost results separate
base_log_dir_xgb = os.path.join(os.getcwd(), "logs", "runs", "xgboost")
results_list_xgb = []

for i, params in enumerate(tqdm(grid, desc="Training XGBoost Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir_xgb, run_name)

    # 1. Train and evaluate the XGBoost model
    model, metrics = train_and_evaluate_xgb(params, X_train, y_train, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run (using the generic logging function)
    log_to_tensorboard(log_dir, params, metrics, model, X_train.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list_xgb.append(run_results)

print("\n--- XGBoost Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir_xgb}")
print(f"To view all results (RF and XGBoost), run: tensorboard --logdir runs")


--- Starting XGBoost Hyperparameter Search ---


Training XGBoost Models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48/48 [07:11<00:00,  8.99s/it]


--- XGBoost Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\xgboost
To view all results (RF and XGBoost), run: tensorboard --logdir runs





In [6]:
# --- Display the final results table ---
print("\nðŸ“ˆ XGBoost Results Summary Table:\n")
results_df_xgb = pd.DataFrame(results_list_xgb).sort_values(by='f1', ascending=False)
results_df_xgb.to_csv(os.path.join(base_log_dir_xgb, "results_summary.csv"), index=False)
results_df_xgb


ðŸ“ˆ XGBoost Results Summary Table:



Unnamed: 0,run_name,learning_rate,max_depth,n_estimators,subsample,accuracy,precision,recall,f1,roc_auc,training_duration
6,6,0.05,,300,0.7,0.671846,0.485471,0.601556,0.537315,0.724903,2.014713
10,10,0.05,10.0,100,0.7,0.670376,0.483663,0.601681,0.536255,0.722327,1.889399
26,26,0.1,,100,0.7,0.671687,0.485142,0.596036,0.534902,0.722049,1.328261
4,4,0.05,,200,0.7,0.669183,0.482161,0.600176,0.534734,0.722345,1.764133
7,7,0.05,,300,1.0,0.670733,0.483989,0.59729,0.534704,0.721731,2.191278
11,11,0.05,10.0,100,1.0,0.670972,0.484259,0.596287,0.534465,0.720775,3.220655
28,28,0.1,,200,0.7,0.675382,0.489651,0.58763,0.534185,0.725966,3.458498
12,12,0.05,10.0,200,0.7,0.669978,0.483028,0.596287,0.533715,0.721736,3.498808
34,34,0.1,10.0,100,0.7,0.671011,0.484228,0.59315,0.533183,0.717226,1.832369
33,33,0.1,10.0,50,1.0,0.670058,0.483085,0.594781,0.533146,0.718785,1.249635


### SMOTE

In [7]:
from imblearn.over_sampling import SMOTE
print("Original training set class distribution:\n", y_train.value_counts())

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(
    "\nResampled training set class distribution:\n", y_train_resampled.value_counts()
)

Original training set class distribution:
 is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64





Resampled training set class distribution:
 is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [8]:
print("\n--- Starting XGBoost Hyperparameter Search ---")

# We create a new base directory to keep RF and XGBoost results separate
base_log_dir_xgb = os.path.join(os.getcwd(), "logs", "runs", "xgboost_smote")
results_list_xgb = []

for i, params in enumerate(tqdm(grid, desc="Training XGBoost Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir_xgb, run_name)

    # 1. Train and evaluate the XGBoost model
    model, metrics = train_and_evaluate_xgb(params, X_train_resampled, y_train_resampled, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run (using the generic logging function)
    log_to_tensorboard(log_dir, params, metrics, model, X_train_resampled.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list_xgb.append(run_results)

print("\n--- XGBoost Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir_xgb}")
print(f"To view all results (RF and XGBoost), run: tensorboard --logdir runs")


--- Starting XGBoost Hyperparameter Search ---


Training XGBoost Models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48/48 [08:02<00:00, 10.06s/it]


--- XGBoost Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\xgboost_smote
To view all results (RF and XGBoost), run: tensorboard --logdir runs





In [9]:
# --- Display the final results table ---
print("\nðŸ“ˆ XGBoost Results Summary Table:\n")
results_df_xgb = pd.DataFrame(results_list_xgb).sort_values(by='f1', ascending=False)
results_df_xgb.to_csv(os.path.join(base_log_dir_xgb, "results_summary.csv"), index=False)
results_df_xgb


ðŸ“ˆ XGBoost Results Summary Table:



Unnamed: 0,run_name,learning_rate,max_depth,n_estimators,subsample,accuracy,precision,recall,f1,roc_auc,training_duration
8,8,0.05,10.0,50,0.7,0.632704,0.452784,0.765149,0.56891,0.726856,1.254827
26,26,0.1,,100,0.7,0.646414,0.463246,0.732907,0.56768,0.728652,1.694307
9,9,0.05,10.0,50,1.0,0.632108,0.452096,0.761887,0.567464,0.726856,1.14872
11,11,0.05,10.0,100,1.0,0.643036,0.460401,0.73805,0.567063,0.728511,2.019135
5,5,0.05,,200,1.0,0.644586,0.461605,0.733785,0.566709,0.727295,1.448408
4,4,0.05,,200,0.7,0.646732,0.463354,0.728892,0.566553,0.729417,1.600619
25,25,0.1,,50,1.0,0.628492,0.449256,0.765274,0.566152,0.720891,0.962565
6,6,0.05,,300,0.7,0.653487,0.469108,0.713461,0.56604,0.730624,3.898042
28,28,0.1,,200,0.7,0.654838,0.470285,0.709823,0.565743,0.730739,3.155541
3,3,0.05,,100,1.0,0.629446,0.44983,0.76151,0.565572,0.723113,1.414169


### SMOTE + Tomek

In [10]:
# You may need to have this library installed: pip install imbalanced-learn
from imblearn.combine import SMOTETomek

print("Original training set class distribution:")
print(y_train.value_counts())

# Initialize SMOTETomek instead of SMOTE
# This combines oversampling (SMOTE) and undersampling (Tomek Links)
smt = SMOTETomek(random_state=42)

# Apply the combined resampling to the training data
X_train_t_resampled, y_train_t_resampled = smt.fit_resample(X_train, y_train)

print("\nResampled training set class distribution:")
print(y_train_resampled.value_counts())

Original training set class distribution:
is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64





Resampled training set class distribution:
is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [11]:
print("\n--- Starting XGBoost Hyperparameter Search ---")

# We create a new base directory to keep RF and XGBoost results separate
base_log_dir_xgb = os.path.join(os.getcwd(), "logs", "runs", "xgboost_smote_tomek")
results_list_xgb = []

for i, params in enumerate(tqdm(grid, desc="Training XGBoost Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir_xgb, run_name)

    # 1. Train and evaluate the XGBoost model
    model, metrics = train_and_evaluate_xgb(params, X_train_t_resampled, y_train_t_resampled, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run (using the generic logging function)
    log_to_tensorboard(log_dir, params, metrics, model, X_train.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list_xgb.append(run_results)

print("\n--- XGBoost Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir_xgb}")
print(f"To view all results (RF and XGBoost), run: tensorboard --logdir runs")


--- Starting XGBoost Hyperparameter Search ---


Training XGBoost Models: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48/48 [07:33<00:00,  9.45s/it]


--- XGBoost Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\xgboost_smote_tomek
To view all results (RF and XGBoost), run: tensorboard --logdir runs





In [12]:
# --- Display the final results table ---
print("\nðŸ“ˆ XGBoost Results Summary Table:\n")
results_df_xgb = pd.DataFrame(results_list_xgb).sort_values(by='f1', ascending=False)
results_df_xgb.to_csv(os.path.join(base_log_dir_xgb, "results_summary.csv"), index=False)
results_df_xgb


ðŸ“ˆ XGBoost Results Summary Table:



Unnamed: 0,run_name,learning_rate,max_depth,n_estimators,subsample,accuracy,precision,recall,f1,roc_auc,training_duration
4,4,0.05,,200,0.7,0.643354,0.460695,0.738176,0.567324,0.730769,1.285666
8,8,0.05,10.0,50,0.7,0.628691,0.4496,0.768285,0.567247,0.725796,1.111999
10,10,0.05,10.0,100,0.7,0.639221,0.457306,0.744449,0.566573,0.72821,2.11851
26,26,0.1,,100,0.7,0.642718,0.46005,0.736796,0.566427,0.728116,1.119097
6,6,0.05,,300,0.7,0.648281,0.464619,0.724878,0.566276,0.732688,1.880657
28,28,0.1,,200,0.7,0.652494,0.468247,0.71597,0.566199,0.729814,3.15078
7,7,0.05,,300,1.0,0.651818,0.467601,0.716096,0.565765,0.730059,2.844365
24,24,0.1,,50,0.7,0.62881,0.449304,0.761636,0.565191,0.723321,0.63796
5,5,0.05,,200,1.0,0.643513,0.460493,0.73115,0.565085,0.729448,0.974619
2,2,0.05,,100,0.7,0.626187,0.447314,0.764772,0.564471,0.723585,1.23873
