# Random forest


In [1]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

print("Libraries imported successfully.")

Libraries imported successfully.


Configuration


In [2]:
# Define the grid of hyperparameters to search over
# param_grid = {
#     'n_estimators': [100, 200, 300, 500],            # More trees → better generalization, higher cost
#     'max_depth': [None, 10, 20, 30],                 # Controls overfitting; None lets trees grow fully
#     'min_samples_split': [2, 5, 10],                 # Higher values → more conservative splits
#     'min_samples_leaf': [1, 2, 4],                   # Ensures enough samples at each leaf to reduce noise
#     'max_features': ['sqrt', 'log2', None],          # Controls number of features to consider per split
#     'bootstrap': [True, False],                      # Whether sampling is with replacement
#     'class_weight': [None, 'balanced'],              # Essential for imbalanced datasets
#     'criterion': ['gini', 'entropy'],                # Different impurity measures for split quality
# }

# Reduced search space used for this run: 3 * 3 * 2 * 2 = 36 candidates.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Expand the grid into a concrete list with one parameter dict per combination.
grid = [combination for combination in ParameterGrid(param_grid)]

print(f"Created a grid with {len(grid)} hyperparameter combinations to test.")

Created a grid with 36 hyperparameter combinations to test.


Load and Split Data


In [3]:
# Path of the labeled dataset, relative to the notebook's working directory.
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"

# Read the CSV, parse the commit timestamps and order the rows by commit date
# so that the positional split below separates earlier rows from later ones.
df = (
    pd.read_csv(FINAL_DATASET_PATH)
    .assign(commit_date=lambda frame: pd.to_datetime(frame["commit_date"]))
    .sort_values(by="commit_date")
)

# Columns that are not used as model inputs (this includes the label itself).
non_feature_columns = [
    "commit_hash",
    "author_email",
    "commit_date",
    "is_bug_introducing",
]
X = df.drop(columns=non_feature_columns)
y = df["is_bug_introducing"]

# The first 80% of the sorted rows form the training set, the rest the test set.
split_point = int(len(df) * 0.80)
X_train, y_train = X.iloc[:split_point], y.iloc[:split_point]
X_test, y_test = X.iloc[split_point:], y.iloc[split_point:]

  df["commit_date"] = pd.to_datetime(df["commit_date"])


SMOTE


In [4]:
# Show the class balance of the training split before oversampling.
original_counts = y_train.value_counts()
print("Original training set class distribution:\n", original_counts)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Show the class balance after oversampling.
resampled_counts = y_train_resampled.value_counts()
print("\nResampled training set class distribution:\n", resampled_counts)



Original training set class distribution:
 is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64

Resampled training set class distribution:
 is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [5]:
def prepare_hparams_for_logging(params):
    """Return a copy of ``params`` with values converted for hparam logging.

    Values that are ``int``, ``float`` or ``bool`` are kept unchanged; every
    other value (``None`` included) is replaced by its ``str()`` form.

    Args:
        params: Mapping of hyperparameter name to value.

    Returns:
        A new dict with the same keys, in the same order.
    """

    def _as_loggable(value):
        if isinstance(value, (int, float, bool)):
            return value
        # str(None) is "None", so None needs no dedicated branch.
        return str(value)

    return {name: _as_loggable(value) for name, value in params.items()}

In [None]:
print("\n--- Starting Comprehensive Hyperparameter Search ---")

# Absolute path of the directory that holds one sub-directory per run.
base_log_dir = os.path.join(os.getcwd(), "logs", "runs", "rf_grid_search")
os.makedirs(base_log_dir, exist_ok=True)
print(f"Created log directory at: {base_log_dir}")

for i, params in enumerate(tqdm(grid, desc="Training Models")):
    # Index-based run names: run_0000, run_0001, ...
    run_name = f"run_{i:04d}"
    log_dir = os.path.join(base_log_dir, run_name)
    os.makedirs(log_dir, exist_ok=True)

    writer = SummaryWriter(log_dir=log_dir)
    try:
        # --- Train the model and time it ---
        rf_model = RandomForestClassifier(random_state=42, n_jobs=-1, **params)

        # Fit on the SMOTE-balanced training data produced in the SMOTE cell;
        # fitting on X_train / y_train would leave that resampling unused.
        # perf_counter is monotonic, so the duration cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        rf_model.fit(X_train_resampled, y_train_resampled)
        training_duration = time.perf_counter() - start_time

        # --- Evaluate the model ---
        # NOTE(review): these metrics come from X_test / y_test and are used to
        # compare hyperparameters; consider a separate validation split so the
        # test set stays untouched for the final estimate.
        y_pred = rf_model.predict(X_test)
        y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # --- Log all metrics and artifacts to TensorBoard ---

        # 1. Log individual scalar metrics (a single point at step 1 per run).
        writer.add_scalar("Metrics/Accuracy", accuracy, 1)
        writer.add_scalar("Metrics/Precision", precision, 1)
        writer.add_scalar("Metrics/Recall", recall, 1)
        writer.add_scalar("Metrics/F1_Score", f1, 1)
        writer.add_scalar("Metrics/ROC_AUC", roc_auc, 1)
        writer.add_scalar("Performance/Training_Duration_sec", training_duration, 1)

        # 2. Log to the HParams dashboard for easy comparison.
        # hparam_dict holds only the hyperparameters; the evaluation results
        # belong in metric_dict. Mixing metrics into hparam_dict would list
        # them as hyperparameters in the dashboard.
        hparam_dict = prepare_hparams_for_logging(params)
        metric_dict = {
            "hparam/accuracy": accuracy,
            "hparam/precision": precision,
            "hparam/recall": recall,
            "hparam/f1": f1,
            "hparam/roc_auc": roc_auc,
            "hparam/training_duration": training_duration,
        }
        # run_name is included as part of the logdir used for the hparams
        # summary, so it lands in a sub-directory of this run's log_dir.
        writer.add_hparams(
            hparam_dict=hparam_dict,
            metric_dict=metric_dict,
            run_name=run_name,
        )

        # 3. Log the feature importance plot.
        importances = rf_model.feature_importances_
        feature_importance_df = pd.DataFrame(
            {"feature": X_train.columns, "importance": importances}
        ).sort_values(by="importance", ascending=True)  # Ascending for horizontal plot

        fig, ax = plt.subplots(figsize=(10, 8))
        ax.barh(
            feature_importance_df["feature"],
            feature_importance_df["importance"],
            color="c",
        )
        ax.set_title(f"Feature Importance (Run {i:04d})")
        ax.set_xlabel("Importance")
        writer.add_figure("Charts/Feature_Importance", fig, 1)
        plt.close(fig)  # Close the plot to prevent it from displaying in the notebook
    finally:
        # Always flush and close the event file, even when a run fails or the
        # search is interrupted from the keyboard.
        writer.close()

print("\n--- Hyperparameter Search Complete ---")
print(
    "All results, including feature importance plots, have been logged to TensorBoard."
)
print(f"Log directory: {base_log_dir}")


--- Starting Comprehensive Hyperparameter Search ---
Created log directory at: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\rf_grid_search


Training Models:  14%|█▍        | 5/36 [02:28<15:22, 29.75s/it]


KeyboardInterrupt: 