# Random forest

In [1]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
)
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

print("Libraries imported successfully.")

Libraries imported successfully.


Configuration


In [2]:
# Define the grid of hyperparameters to search over
# param_grid = {
#     'n_estimators': [100, 200, 300, 500],            # More trees → better generalization, higher cost
#     'max_depth': [None, 10, 20, 30],                 # Controls overfitting; None lets trees grow fully
#     'min_samples_split': [2, 5, 10],                 # Higher values → more conservative splits
#     'min_samples_leaf': [1, 2, 4],                   # Ensures enough samples at each leaf to reduce noise
#     'max_features': ['sqrt', 'log2', None],          # Controls number of features to consider per split
#     'bootstrap': [True, False],                      # Whether sampling is with replacement
#     'class_weight': [None, 'balanced'],              # Essential for imbalanced datasets
#     'criterion': ['gini', 'entropy'],                # Different impurity measures for split quality
# }

param_grid = {
    "n_estimators": [50 ,100, 200, 300],
    "max_depth": [None,10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

# Create a list of all possible combinations
grid = list(ParameterGrid(param_grid))

print(f"Created a grid with {len(grid)} hyperparameter combinations to test.")

Created a grid with 48 hyperparameter combinations to test.


Load and Split Data


In [3]:
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"
df = pd.read_csv(FINAL_DATASET_PATH)
df["commit_date"] = pd.to_datetime(df["commit_date"])
df.sort_values(by="commit_date", inplace=True)

X = df.drop(
    columns=["commit_hash", "author_email", "commit_date", "is_bug_introducing"]
)
y = df["is_bug_introducing"]

split_point = int(len(df) * 0.80)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

  df["commit_date"] = pd.to_datetime(df["commit_date"])


## functions

In [4]:
def train_and_evaluate(params, X_train, y_train, X_test, y_test):
    """Trains a model and returns the model and its performance metrics."""
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    
    start_time = time.time()
    rf_model.fit(X_train, y_train)
    end_time = time.time()
    
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'training_duration': end_time - start_time
    }
    return rf_model, metrics

def plot_feature_importance(model, feature_names):
    """Creates and returns a matplotlib figure of feature importances."""
    importances = model.feature_importances_
    df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    df = df.sort_values(by='importance', ascending=True)
    
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(df['feature'], df['importance'], color='skyblue')
    ax.set_title('Feature Importance', fontsize=16)
    ax.set_xlabel('Importance')
    plt.tight_layout()
    return fig

def log_to_tensorboard(log_dir, params, metrics, model, feature_names):
    """Logs all experiment data for a single run to TensorBoard."""
    writer = SummaryWriter(log_dir=log_dir)
    
    # Log individual scalar metrics
    for key, value in metrics.items():
        writer.add_scalar(f"Metrics/{key}", value, 0)

    # Log feature importance plot
    fig = plot_feature_importance(model, feature_names)
    writer.add_figure("Charts/Feature_Importance", fig, 0)
    plt.close(fig) # Prevent inline display

    # Sanitize hparams for logging (e.g., convert None to string)
    hparams_for_log = {key: str(value) for key, value in params.items()}
    
    # Log to HParams dashboard
    writer.add_hparams(hparam_dict=hparams_for_log, metric_dict=metrics)
    
    writer.close()

print("Helper functions defined successfully.")

Helper functions defined successfully.


## Model training

In [5]:
print("\n--- Starting Hyperparameter Search ---")

base_log_dir = os.path.join(os.getcwd(), "logs", "runs", "random_forest")
results_list = []

for i, params in enumerate(tqdm(grid, desc="Training Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir, run_name)

    # 1. Train and evaluate the model
    # Note the correction: X_train_resampled, not X_train_t_resampled
    model, metrics = train_and_evaluate(params, X_train, y_train, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run
    log_to_tensorboard(log_dir, params, metrics, model, X_train.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list.append(run_results)

print("\n--- Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir}")
print(f"To view results, run: tensorboard --logdir \"{base_log_dir}\"")



--- Starting Hyperparameter Search ---


Training Models: 100%|██████████| 48/48 [11:36<00:00, 14.52s/it]


--- Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest
To view results, run: tensorboard --logdir "c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest"





In [6]:
# --- Display the final results table ---
results_df = pd.DataFrame(results_list).sort_values(by='f1', ascending=False)
results_df.to_csv(os.path.join(base_log_dir, "summary_results.csv"), index=False)
results_df

Unnamed: 0,run_name,max_depth,min_samples_leaf,min_samples_split,n_estimators,accuracy,precision,recall,f1,roc_auc,training_duration
34,34,20.0,1,2,200,0.669064,0.481638,0.587379,0.529279,0.715035,19.6643
47,47,20.0,2,5,300,0.671091,0.484069,0.583239,0.529047,0.718242,30.410865
46,46,20.0,2,5,200,0.670534,0.483389,0.584117,0.529001,0.717535,21.634728
11,11,,2,2,300,0.67121,0.484211,0.582863,0.528976,0.716939,29.81385
35,35,20.0,1,2,300,0.669342,0.48194,0.585874,0.528849,0.71559,26.794922
10,10,,2,2,200,0.670137,0.482855,0.582988,0.528218,0.71584,18.399975
45,45,20.0,2,5,100,0.668627,0.481011,0.584745,0.52783,0.714864,11.395471
39,39,20.0,1,5,300,0.670256,0.482946,0.580856,0.527395,0.716636,29.257645
42,42,20.0,2,2,200,0.669382,0.481885,0.582361,0.52738,0.717317,19.48921
43,43,20.0,2,2,300,0.669621,0.482171,0.581859,0.527345,0.717738,28.86826


### SMOTE

In [7]:
from imblearn.over_sampling import SMOTE
print("Original training set class distribution:\n", y_train.value_counts())

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(
    "\nResampled training set class distribution:\n", y_train_resampled.value_counts()
)

Original training set class distribution:
 is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64





Resampled training set class distribution:
 is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [8]:
print("\n--- Starting Hyperparameter Search ---")

base_log_dir = os.path.join(os.getcwd(), "logs", "runs", "random_forest_smote")
results_list = []

for i, params in enumerate(tqdm(grid, desc="Training Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir, run_name)

    # 1. Train and evaluate the model
    # Note the correction: X_train_resampled, not X_train_t_resampled
    model, metrics = train_and_evaluate(params, X_train_resampled, y_train_resampled, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run
    log_to_tensorboard(log_dir, params, metrics, model, X_train_resampled.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list.append(run_results)

print("\n--- Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir}")
print(f"To view results, run: tensorboard --logdir \"{base_log_dir}\"")


--- Starting Hyperparameter Search ---


Training Models: 100%|██████████| 48/48 [17:58<00:00, 22.46s/it]


--- Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest_smote
To view results, run: tensorboard --logdir "c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest_smote"





In [9]:
# --- Display the final results table ---
print("\n📈 Results Summary Table:\n")
results_df = pd.DataFrame(results_list).sort_values(by='f1', ascending=False)
results_df.to_csv(os.path.join(base_log_dir, "summary_results.csv"), index=False)
results_df


📈 Results Summary Table:



Unnamed: 0,run_name,max_depth,min_samples_leaf,min_samples_split,n_estimators,accuracy,precision,recall,f1,roc_auc,training_duration
47,47,20.0,2,5,300,0.639499,0.457362,0.74081,0.565559,0.724977,42.698518
37,37,20.0,1,5,100,0.64089,0.458398,0.736796,0.565173,0.721788,15.830275
46,46,20.0,2,5,200,0.639301,0.457132,0.739807,0.56509,0.724892,27.838318
43,43,20.0,2,2,300,0.63946,0.457247,0.739305,0.565032,0.725336,41.887589
45,45,20.0,2,5,100,0.638228,0.456278,0.741689,0.564985,0.723396,14.376509
42,42,20.0,2,2,200,0.639539,0.457292,0.738803,0.564919,0.725061,28.195496
39,39,20.0,1,5,300,0.639579,0.457307,0.738427,0.564821,0.724683,44.077209
34,34,20.0,1,2,200,0.638188,0.456203,0.740936,0.564708,0.724422,28.750157
38,38,20.0,1,5,200,0.639897,0.457512,0.736921,0.564536,0.723672,29.03483
41,41,20.0,2,2,100,0.639301,0.457005,0.737423,0.564297,0.723413,14.298861


### SMOTE + Tomek

In [10]:
# You may need to have this library installed: pip install imbalanced-learn
from imblearn.combine import SMOTETomek

print("Original training set class distribution:")
print(y_train.value_counts())

# Initialize SMOTETomek instead of SMOTE
# This combines oversampling (SMOTE) and undersampling (Tomek Links)
smt = SMOTETomek(random_state=42)

# Apply the combined resampling to the training data
X_train_t_resampled, y_train_t_resampled = smt.fit_resample(X_train, y_train)

print("\nResampled training set class distribution:")
print(y_train_resampled.value_counts())



Original training set class distribution:
is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64





Resampled training set class distribution:
is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [11]:
print("\n--- Starting Hyperparameter Search ---")

base_log_dir = os.path.join(os.getcwd(), "logs", "runs", "random_forest_smote_tomek")
results_list = []

for i, params in enumerate(tqdm(grid, desc="Training Models")):
    run_name = f"000{i}"
    log_dir = os.path.join(base_log_dir, run_name)

    # 1. Train and evaluate the model
    # Note the correction: X_train_resampled, not X_train_t_resampled
    model, metrics = train_and_evaluate(params, X_train_t_resampled, y_train_t_resampled, X_test, y_test)
    
    # 2. Log everything to TensorBoard for this run
    log_to_tensorboard(log_dir, params, metrics, model, X_train_t_resampled.columns)
    
    # 3. Store results for the final summary table
    run_results = {'run_name': run_name, **params, **metrics}
    results_list.append(run_results)

print("\n--- Hyperparameter Search Complete ---")
print(f"Log directory: {base_log_dir}")
print(f"To view results, run: tensorboard --logdir \"{base_log_dir}\"")


--- Starting Hyperparameter Search ---


Training Models: 100%|██████████| 48/48 [14:12<00:00, 17.77s/it]


--- Hyperparameter Search Complete ---
Log directory: c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest_smote_tomek
To view results, run: tensorboard --logdir "c:\Users\pradishan\code\wso2-AI-Tool\statistical-model\logs\runs\random_forest_smote_tomek"





In [13]:
# --- Display the final results table ---
print("\n📈 Results Summary Table:\n")
results_df = pd.DataFrame(results_list).sort_values(by='f1', ascending=False)
results_df.to_csv(os.path.join(base_log_dir, "summary_results.csv"), index=False)
results_df


📈 Results Summary Table:



Unnamed: 0,run_name,max_depth,min_samples_leaf,min_samples_split,n_estimators,accuracy,precision,recall,f1,roc_auc,training_duration
34,34,20.0,1,2,200,0.636241,0.454782,0.74633,0.565172,0.723631,22.889879
39,39,20.0,1,5,300,0.637592,0.45575,0.742316,0.564761,0.724727,34.784233
35,35,20.0,1,2,300,0.63636,0.454768,0.744198,0.564549,0.724292,33.857251
38,38,20.0,1,5,200,0.636757,0.455069,0.74332,0.564528,0.724187,23.185664
43,43,20.0,2,2,300,0.636201,0.454552,0.742818,0.563985,0.724164,32.734998
47,47,20.0,2,5,300,0.6364,0.454685,0.742065,0.56387,0.725149,34.000894
36,36,20.0,1,5,50,0.637314,0.455305,0.738678,0.563364,0.721813,6.217262
42,42,20.0,2,2,200,0.63481,0.453356,0.743194,0.563171,0.723864,23.281348
37,37,20.0,1,5,100,0.635724,0.454035,0.741061,0.563081,0.723068,11.78696
11,11,,2,2,300,0.643115,0.459863,0.725881,0.563032,0.724266,37.73406
