# ANN only for embeddings

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

# --- Configuration ---
FINAL_EMBEDDING_CSV_PATH = "data/final_embedding_dataset.csv"

# --- Load Data ---
print(f"Loading final embedding dataset from '{FINAL_EMBEDDING_CSV_PATH}'...")
df = pd.read_csv(FINAL_EMBEDDING_CSV_PATH)

# --- Prepare Data for PyTorch ---
# Get all columns that start with 'emb_'
embedding_cols = [col for col in df.columns if col.startswith('emb_')]
X = df[embedding_cols].values
y = df['is_bug_introducing'].values

# Split the data into training and testing sets (approximating chronological split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Convert NumPy arrays to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

print("Data loaded and converted to PyTorch tensors. ✅")

Loading final embedding dataset from 'data/final_embedding_dataset.csv'...
Data loaded and converted to PyTorch tensors. ✅


In [2]:
from torch.utils.data import TensorDataset, DataLoader

# Create TensorDataset objects
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
eval_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader objects to handle batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=32)

print("PyTorch DataLoaders created successfully.")

PyTorch DataLoaders created successfully.


In [5]:
import mlflow
import torch.nn as nn
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm

# --- Define the Neural Network (same as before) ---
class BugPredictorNN(nn.Module):
    def __init__(self, input_size=768):
        super(BugPredictorNN, self).__init__()
        self.layer_stack = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.layer_stack(x)

# --- Configuration for the experiment ---
config = {"learning_rate": 1e-4, "batch_size": 32, "num_epochs": 3}

# --- Start an MLflow Run ---
with mlflow.start_run(run_name="NeuralNet_on_CSV_Embeddings") as run:
    mlflow.log_params(config)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BugPredictorNN().to(device)
    
    # Handle class imbalance with class weights
    # Filter out NaN values before computing class weights
    valid_indices = ~np.isnan(y_train)
    valid_y_train = y_train[valid_indices]
    class_counts = np.bincount(valid_y_train.astype(int))
    pos_weight = torch.tensor([class_counts[0] / class_counts[1]]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

    # --- Training Loop ---
    for epoch in range(config["num_epochs"]):
        model.train()
        for embeddings, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            embeddings, labels = embeddings.to(device), labels.to(device).unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # --- Evaluation Loop ---
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for embeddings, labels in eval_loader:
                outputs = model(embeddings.to(device))
                preds = (torch.sigmoid(outputs).squeeze() > 0.5).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())
        
        # Calculate and log metrics
        f1 = f1_score(all_labels, all_preds, zero_division=0)
        recall = recall_score(all_labels, all_preds, zero_division=0)
        mlflow.log_metric("eval_f1", f1, step=epoch)
        mlflow.log_metric("eval_recall", recall, step=epoch)
        print(f"Epoch {epoch+1}: F1: {f1:.4f}, Recall: {recall:.4f}")

print("\n--- Training Finished --- ✅")

Epoch 1:   0%|          | 0/3134 [00:00<?, ?it/s]

  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.