In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

import warnings
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np

import keras_tuner as kt
warnings.filterwarnings('ignore')

# --- NN Configuration ---
# Training defaults intended for the baseline experiments.
# NOTE(review): neither constant is referenced by the cells visible here (the
# tuning cell hard-codes its own epoch counts) — confirm they are still needed.
# Presumably the epoch count and mini-batch size passed to `model.fit`; verify against the baseline cells.
NN_EPOCHS = 100
NN_BATCH_SIZE = 64

In [None]:
# --- File Paths ---
# Both paths are relative, so they resolve against the kernel's working directory.
SOURCE_DATASET_PATH = "data/final_dataset_with_embeddings.csv" # Final dataset CSV, including the embedding columns
BASE_LOG_DIR = "logs" # Parent directory for all results (the KerasTuner project is written beneath it)

# --- Feature Configuration ---
# Columns that are neither features nor the label; they are excluded when the
# "stats" feature list is derived in prepare_feature_sets().
METADATA_COLS = ["commit_hash", "author_email", "commit_date"]
# Target column; rows where it is missing are dropped by load_base_data().
LABEL_COL = "is_bug_introducing"
# Number of principal components kept when the embeddings are reduced with PCA.
# NOTE(review): the experiment that selected this value is not shown in this notebook.
N_PCA_COMPONENTS = 177 # The optimal number found previously

In [None]:
def load_base_data(path):
    """Read the source CSV and return it cleaned and in chronological order.

    Rows missing either ``commit_hash`` or the label column are discarded,
    ``commit_date`` is parsed into datetimes, and the frame is ordered by that
    date with a fresh 0..n-1 index.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The cleaned, date-sorted DataFrame.
    """
    print(f"Loading and preparing base data from '{path}'...")
    required_cols = ['commit_hash', LABEL_COL]
    commits = (
        pd.read_csv(path)
        .dropna(subset=required_cols)
        # Parse the dates before sorting so the ordering is chronological.
        .assign(commit_date=lambda frame: pd.to_datetime(frame["commit_date"]))
        .sort_values(by="commit_date")
        .reset_index(drop=True)
    )
    print("Base data loaded successfully.")
    return commits

def prepare_feature_sets(df):
    """Creates all the different feature combinations for our experiments.

    Columns are partitioned into embeddings (names starting with ``emb_``),
    metadata (``METADATA_COLS``), the label (``LABEL_COL``) and everything
    else, which is treated as the "stats" features. The embeddings are
    L2-normalized row-wise and projected onto ``N_PCA_COMPONENTS`` principal
    components.

    NOTE: the PCA is fitted on every row of ``df``. If the caller splits the
    rows into train/test afterwards, the projection has already seen the
    held-out rows.

    Args:
        df: DataFrame containing the label, metadata, embedding and stats
            columns described above.

    Returns:
        A ``(feature_sets, labels)`` tuple. ``feature_sets`` maps the names
        ``stats_only``, ``embeddings_only``, ``pca_only``,
        ``stats_and_embeddings`` and ``stats_and_pca`` to DataFrames that all
        share ``df``'s index; ``labels`` is ``df[LABEL_COL]``.

    Raises:
        ValueError: If ``df`` has no ``emb_``-prefixed columns.
    """
    print("Preparing all feature sets...")

    embedding_cols = [col for col in df.columns if col.startswith('emb_')]
    if not embedding_cols:
        # Fail early with a readable message instead of a shape error from sklearn.
        raise ValueError("No embedding columns found: expected columns prefixed with 'emb_'.")

    # Set membership keeps the exclusion check cheap when there are many embedding columns.
    excluded_cols = set(embedding_cols) | set(METADATA_COLS) | {LABEL_COL}
    stats_cols = [col for col in df.columns if col not in excluded_cols]

    # Normalize and apply PCA to embeddings
    X_embed = df[embedding_cols].values
    X_normalized = Normalizer(norm='l2').fit_transform(X_embed)
    pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
    X_pca = pca.fit_transform(X_normalized)

    pca_cols = [f'pca_{i+1}' for i in range(N_PCA_COMPONENTS)]
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df's
    # index is not exactly 0..n-1.
    df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df.index)

    feature_sets = {
        "stats_only": df[stats_cols],
        "embeddings_only": df[embedding_cols],
        "pca_only": df_pca,
        "stats_and_embeddings": pd.concat([df[stats_cols], df[embedding_cols]], axis=1),
        "stats_and_pca": pd.concat([df[stats_cols], df_pca], axis=1)
    }

    print("All feature sets are ready.")
    return feature_sets, df[LABEL_COL]

In [None]:
def build_tunable_model(hp):
    """Assemble and compile a binary classifier whose shape is driven by ``hp``.

    Hyperparameters read from ``hp``:
      * ``num_layers``: number of hidden blocks, 1 to 3.
      * ``units_{i}``: width of hidden block ``i`` (32 to 256, step 32).
      * ``dropout_{i}``: dropout rate after block ``i`` (0.2 to 0.5, step 0.1).
      * ``learning_rate``: Adam learning rate, one of 1e-2, 1e-3, 1e-4.

    Returns:
        The compiled ``keras.Sequential`` model (binary cross-entropy loss,
        accuracy metric). The tuning objective itself is set on the tuner.
    """
    depth = hp.Int('num_layers', 1, 3)

    net = keras.Sequential()
    for layer_idx in range(depth):
        # Each hidden block is a ReLU Dense layer followed by its own Dropout.
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32)
        net.add(layers.Dense(units=width, activation='relu'))
        drop_rate = hp.Float(f'dropout_{layer_idx}', min_value=0.2, max_value=0.5, step=0.1)
        net.add(layers.Dropout(rate=drop_rate))

    # Single sigmoid unit for the binary output.
    net.add(layers.Dense(1, activation='sigmoid'))

    adam = keras.optimizers.Adam(
        learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    )
    net.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return net

In [None]:
# --- Build the feature sets ---
# `all_feature_sets` and `y_data` are consumed below; create them here from the
# helpers defined above so this cell works on a fresh kernel (Restart & Run All)
# instead of relying on leftover kernel state.
base_df = load_base_data(SOURCE_DATASET_PATH)
all_feature_sets, y_data = prepare_feature_sets(base_df)

# --- Select your best performing feature set from the baseline experiments ---
best_feature_set_name = "NN_Stats_and_PCA" # Change this based on your results
# "NN_Stats_and_PCA" -> "stats_and_pca", i.e. a key of `all_feature_sets`.
X_data_best = all_feature_sets[best_feature_set_name.replace("NN_", "").lower()]

# Chronological 80/20 train/test split (load_base_data sorts rows by commit date).
split_point = int(len(X_data_best) * 0.80)
X_train, X_test = X_data_best.iloc[:split_point], X_data_best.iloc[split_point:]
y_train, y_test = y_data.iloc[:split_point], y_data.iloc[split_point:]

# Scale using statistics from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold out the most recent 20% of the *training* rows as a validation set for
# the search. Tuning and early stopping must not look at the test split,
# otherwise the reported test metrics are optimistically biased.
val_point = int(len(X_train_scaled) * 0.80)
X_tune_scaled, X_val_scaled = X_train_scaled[:val_point], X_train_scaled[val_point:]
y_tune, y_val = y_train.iloc[:val_point], y_train.iloc[val_point:]

# --- Set up the KerasTuner ---
# NOTE: with the default `overwrite=False`, an existing project in this
# directory is reloaded; delete it to force a fresh search (e.g. after changing
# the validation data).
tuner = kt.Hyperband(
    build_tunable_model,
    objective='val_loss', # Objective to minimize
    max_epochs=30,
    factor=3,
    seed=42, # Makes the sampled configurations repeatable
    directory=os.path.join(BASE_LOG_DIR, 'keras_tuner'),
    project_name=f'tune_{best_feature_set_name}'
)

# Define a callback to stop training early if the model is not improving
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

print(f"\n--- Starting Hyperparameter and Architecture Search for '{best_feature_set_name}' ---")
# No `epochs` argument: Hyperband assigns each trial its own epoch budget
# (bounded by `max_epochs`), overriding any value passed here.
tuner.search(
    X_tune_scaled,
    y_tune,
    validation_data=(X_val_scaled, y_val),
    callbacks=[stop_early]
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\n--- Search Complete --- ✅")
print(f"Optimal number of layers: {best_hps.get('num_layers')}")
print(f"Optimal learning rate: {best_hps.get('learning_rate')}")
# You can print other best params similarly...

# Build the best model and retrain it on the full training split (tuning +
# validation rows). The test split is passed only so its metrics are recorded
# per epoch in `history`; no callback selects or stops on it.
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test))