# Random foreset

In [3]:
# Install the necessary libraries if you haven't already
# !pip install tensorboard torch

import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from torch.utils.tensorboard import SummaryWriter
import time

print("Libraries imported successfully.")

Libraries imported successfully.


In [4]:
# --- Configuration ---
FINAL_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"
N_ESTIMATORS = 200
TEST_SPLIT_RATIO = 0.20

# --- Set up TensorBoard ---
# Create a unique log directory for this run using a timestamp
log_dir = f"runs/szz_experiment_{int(time.time())}"
writer = SummaryWriter(log_dir)
print(f"TensorBoard log directory created at: {log_dir}")


# --- Load and Split Data ---
df = pd.read_csv(FINAL_DATASET_PATH)
df['commit_date'] = pd.to_datetime(df['commit_date'])
df.sort_values(by='commit_date', inplace=True)

X = df.drop(columns=['commit_hash', 'author_email', 'commit_date', 'is_bug_introducing'])
y = df['is_bug_introducing']

split_point = int(len(df) * (1 - TEST_SPLIT_RATIO))
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

TensorBoard log directory created at: runs/szz_experiment_1752039287


  df['commit_date'] = pd.to_datetime(df['commit_date'])


In [5]:
print("Original training set class distribution:\n", y_train.value_counts())

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nResampled training set class distribution:\n", y_train_resampled.value_counts())

Original training set class distribution:
 is_bug_introducing
0    66588
1    34072
Name: count, dtype: int64





Resampled training set class distribution:
 is_bug_introducing
0    66588
1    66588
Name: count, dtype: int64


In [7]:
# --- Train Model ---
rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model...")
rf_model.fit(X_train_resampled, y_train_resampled)
print("Model training complete. ✅")

# --- Evaluate and Log to TensorBoard ---
print("\nEvaluating model and logging to TensorBoard...")
y_pred = rf_model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Log metrics as scalars
writer.add_scalar("Test/Precision", precision, 1)
writer.add_scalar("Test/Recall", recall, 1)
writer.add_scalar("Test/F1_Score", f1, 1)

# Log hyperparameters and final metrics together for easy comparison
hparams = {"n_estimators": N_ESTIMATORS, "test_split_ratio": TEST_SPLIT_RATIO}
metrics = {"hparam/precision": precision, "hparam/recall": recall, "hparam/f1": f1}
writer.add_hparams(hparams, metrics)

# Close the writer to ensure everything is saved
writer.close()

print("\n--- Results ---")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-Score:  {f1:.3f}")
print("\n✅ Results successfully logged to TensorBoard.")


Training the Random Forest model...
Model training complete. ✅

Evaluating model and logging to TensorBoard...

--- Results ---
Precision: 0.459
Recall:    0.702
F1-Score:  0.555

✅ Results successfully logged to TensorBoard.
