In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

import warnings
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np

import keras_tuner as kt
warnings.filterwarnings('ignore')

# --- NN Configuration ---
# We'll use these for the baseline experiments
NN_EPOCHS = 100
NN_BATCH_SIZE = 64

In [2]:
# --- File Paths ---
SOURCE_DATASET_PATH = "data/final_dataset_with_embeddings.csv" # Your final dataset with embeddings
BASE_LOG_DIR = "logs" # A parent directory to store all results

# --- Feature Configuration ---
METADATA_COLS = ["commit_hash", "author_email", "commit_date"]
LABEL_COL = "is_bug_introducing"
N_PCA_COMPONENTS = 177 # The optimal number you found

In [3]:
def load_base_data(path):
    """Loads the source CSV, cleans it, and sorts by date."""
    print(f"Loading and preparing base data from '{path}'...")
    df = pd.read_csv(path)
    df.dropna(subset=['commit_hash', LABEL_COL], inplace=True)
    df["commit_date"] = pd.to_datetime(df["commit_date"])
    df.sort_values(by="commit_date", inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Base data loaded successfully.")
    return df

def prepare_feature_sets(df):
    """Creates all the different feature combinations for our experiments."""
    print("Preparing all feature sets...")
    
    embedding_cols = [col for col in df.columns if col.startswith('emb_')]
    stats_cols = [col for col in df.columns if col not in embedding_cols + METADATA_COLS + [LABEL_COL]]
    
    # Normalize and apply PCA to embeddings
    X_embed = df[embedding_cols].values
    X_normalized = Normalizer(norm='l2').fit_transform(X_embed)
    pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
    X_pca = pca.fit_transform(X_normalized)
    
    pca_cols = [f'pca_{i+1}' for i in range(N_PCA_COMPONENTS)]
    df_pca = pd.DataFrame(X_pca, columns=pca_cols)

    feature_sets = {
        "stats_only": df[stats_cols],
        "embeddings_only": df[embedding_cols],
        "pca_only": df_pca,
        "stats_and_embeddings": pd.concat([df[stats_cols], df[embedding_cols]], axis=1),
        "stats_and_pca": pd.concat([df[stats_cols], df_pca], axis=1)
    }
    
    print("All feature sets are ready.")
    return feature_sets, df[LABEL_COL]

In [4]:
def create_baseline_nn(input_shape):
    """Defines and compiles a standard baseline Neural Network."""
    model = keras.Sequential([
        layers.Input(shape=(input_shape,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid') # Binary classification
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ]
    )
    return model

def run_nn_experiment(experiment_name, X_data, y_data):
    """Runs a baseline NN experiment for a given feature set and logs to MLflow."""
    print(f"\n--- Running Experiment: {experiment_name} ---")
    
    # 1. Create dedicated directories
    results_dir = os.path.join(BASE_LOG_DIR, experiment_name)
    os.makedirs(results_dir, exist_ok=True)

    # 2. Split and scale data
    split_point = int(len(X_data) * 0.80)
    X_train, X_test = X_data.iloc[:split_point], X_data.iloc[split_point:]
    y_train, y_test = y_data.iloc[:split_point], y_data.iloc[split_point:]
    
    # Neural networks benefit from feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 3. Set up MLflow
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment(experiment_name)
    
    with mlflow.start_run() as run:
        # 4. Create and train the model
        model = create_baseline_nn(X_train_scaled.shape[1])
        
        # Handle class imbalance
        neg, pos = np.bincount(y_train)
        class_weight = {0: (1 / neg) * (len(y_train) / 2.0), 1: (1 / pos) * (len(y_train) / 2.0)}
        
        history = model.fit(
            X_train_scaled,
            y_train,
            epochs=NN_EPOCHS,
            batch_size=NN_BATCH_SIZE,
            validation_data=(X_test_scaled, y_test),
            class_weight=class_weight,
            verbose=0 # Suppress output during training
        )
        
        # 5. Evaluate and log metrics
        y_pred_proba = model.predict(X_test_scaled).ravel()
        y_pred = (y_pred_proba > 0.5).astype(int)
        
        metrics = {
            "f1": f1_score(y_test, y_pred, zero_division=0),
            "roc_auc": roc_auc_score(y_test, y_pred_proba),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
        }
        mlflow.log_metrics(metrics)
        mlflow.tensorflow.log_model(model, "model")
        
        print(f"--- Experiment '{experiment_name}' Complete ---")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"ROC AUC: {metrics['roc_auc']:.4f}")

In [5]:
# --- Load data and create all feature sets once ---
base_df = load_base_data(SOURCE_DATASET_PATH)
all_feature_sets, y_data = prepare_feature_sets(base_df)

Loading and preparing base data from 'data/final_dataset_with_embeddings.csv'...
Base data loaded successfully.
Preparing all feature sets...
All feature sets are ready.


In [6]:
# --- Experiment 1: Stats Only ---
run_nn_experiment(
    experiment_name="NN_Stats_Only",
    X_data=all_feature_sets["stats_only"],
    y_data=y_data
)


--- Running Experiment: NN_Stats_Only ---




INFO:tensorflow:Assets written to: C:\Users\PRADIS~1\AppData\Local\Temp\tmpcfcen7e5\model\data\model\assets




--- Experiment 'NN_Stats_Only' Complete ---
F1 Score: 0.2609
ROC AUC: 0.6014


In [7]:
# --- Experiment 2: Embeddings Only ---
run_nn_experiment(
    experiment_name="NN_Embeddings_Only",
    X_data=all_feature_sets["embeddings_only"],
    y_data=y_data
)


--- Running Experiment: NN_Embeddings_Only ---


2025/09/23 08:38:45 INFO mlflow.tracking.fluent: Experiment with name 'NN_Embeddings_Only' does not exist. Creating a new experiment.






INFO:tensorflow:Assets written to: C:\Users\PRADIS~1\AppData\Local\Temp\tmpqbjkk68j\model\data\model\assets




--- Experiment 'NN_Embeddings_Only' Complete ---
F1 Score: 0.2996
ROC AUC: 0.6456


In [8]:
# --- Experiment 3: PCA-Reduced Embeddings Only ---
run_nn_experiment(
    experiment_name="NN_PCA_Only",
    X_data=all_feature_sets["pca_only"],
    y_data=y_data
)


--- Running Experiment: NN_PCA_Only ---


2025/09/23 08:40:22 INFO mlflow.tracking.fluent: Experiment with name 'NN_PCA_Only' does not exist. Creating a new experiment.






INFO:tensorflow:Assets written to: C:\Users\PRADIS~1\AppData\Local\Temp\tmphgx2cn2v\model\data\model\assets




--- Experiment 'NN_PCA_Only' Complete ---
F1 Score: 0.2732
ROC AUC: 0.6308


In [9]:
# --- Experiment 4: Stats + Full Embeddings ---
run_nn_experiment(
    experiment_name="NN_Stats_and_Embeddings",
    X_data=all_feature_sets["stats_and_embeddings"],
    y_data=y_data
)


--- Running Experiment: NN_Stats_and_Embeddings ---


2025/09/23 08:42:08 INFO mlflow.tracking.fluent: Experiment with name 'NN_Stats_and_Embeddings' does not exist. Creating a new experiment.






INFO:tensorflow:Assets written to: C:\Users\PRADIS~1\AppData\Local\Temp\tmp5zk6bpgp\model\data\model\assets




--- Experiment 'NN_Stats_and_Embeddings' Complete ---
F1 Score: 0.3209
ROC AUC: 0.6725


In [10]:
# --- Experiment 5: Stats + PCA-Reduced Embeddings ---
run_nn_experiment(
    experiment_name="NN_Stats_and_PCA",
    X_data=all_feature_sets["stats_and_pca"],
    y_data=y_data
)

2025/09/23 08:43:42 INFO mlflow.tracking.fluent: Experiment with name 'NN_Stats_and_PCA' does not exist. Creating a new experiment.



--- Running Experiment: NN_Stats_and_PCA ---




INFO:tensorflow:Assets written to: C:\Users\PRADIS~1\AppData\Local\Temp\tmpy45bjx19\model\data\model\assets




--- Experiment 'NN_Stats_and_PCA' Complete ---
F1 Score: 0.2910
ROC AUC: 0.6517
