In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd

train_df = pd.read_csv("/content/drive/MyDrive/black_only/train_black.csv")
val_df   = pd.read_csv("/content/drive/MyDrive/black_only/val_black.csv")
test_df  = pd.read_csv("/content/drive/MyDrive/black_only/test_black.csv")


In [3]:
train_data = train_df.to_dict(orient="records")
val_data   = val_df.to_dict(orient="records")
test_data  = test_df.to_dict(orient="records")


In [4]:
# Colab/Notebook setup
!pip -q install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip -q install transformers==4.41.0 torchmetrics==1.4.0 scikit-learn==1.6.0 tree_sitter==0.20.2 tree_sitter_languages transformers torch
!pip -q install torch_geometric==2.6.0 torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.3.0+cu118.html
!pip -q install tree_sitter==0.20.2

import os, sys, json, math, random, numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from torchmetrics.functional import auroc

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.8/137.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m22.0 MB/s[0m eta [36m0:0

# Data Loading and Preprocessing
Goal: Read your dataset rows, decode  as UTF‑8 (fixed file after commit), preserve exact line boundaries (including whitespace), and build labels from .

Line numbering: This notebook assumes  uses 0-based indices. If yours are 1-based, subtract 1 during preprocessing.

Commit/diff metadata: Optional functions are included to enrich samples later

In [5]:
# ================================
# 1. Imports
# ================================
import json
import random
import pandas as pd


# ================================
# 2. YOUR ORIGINAL ADAPTER (UNCHANGED)
# ================================
def df_row_to_record(row):
    raw = row['content']

    # Step 1: Decode bytes → string
    if isinstance(raw, (bytes, bytearray)):
        content = raw.decode('utf-8', errors='ignore')
    else:
        content = str(raw)

    # Step 2: FIX escaped newlines (CRITICAL)
    if "\\n" in content:
        content = content.encode('utf-8').decode('unicode_escape')

    # After preprocess_df, induce_bug should already be a list.
    # The 'if isinstance(induce_bug, str)' check is now redundant and problematic.
    induce_bug = row['induce_bug']

    record = {
        "datetime": row.get('datetime', ''),
        "committer": row.get('committer', ''),
        "commit": row.get('commit', ''),
        "repo": row.get('repo', ''),
        "filepath": row.get('filepath', ''),
        "content": content,           # NOW real multi-line code
        "methods": row.get('methods', []),
        "induce_bug": induce_bug, # induce_bug is expected to be a list here
        "file_label": int(row.get('file_label', 1 if len(induce_bug) > 0 else 0)),
    }
    return record


# ================================
# 3. LOAD REAL DATASET
# ================================
# ⟳ CHANGE THIS PATH TO YOUR DATASET
DATASET_PATH = "/content/drive/MyDrive/black_only/train_black.csv"

df = pd.read_csv(DATASET_PATH)
print("Loaded dataset size:", len(df))


# ================================
# 4. PREPROCESSING (SAFE & MINIMAL)
# ================================
def preprocess_df(df):
    # Drop rows without code content
    df = df.dropna(subset=["content"]).reset_index(drop=True)

    # Normalize induce_bug column
    def parse_induce_bug(x):
        if isinstance(x, list):
            # Ensure all elements in the list are integers
            return [int(val) for val in x if isinstance(val, (int, str)) and str(val).isdigit()]
        if isinstance(x, (str, bytes, bytearray)):
            # Decode bytes/bytearray to string if necessary
            s = x.decode('utf-8', errors='ignore') if isinstance(x, (bytes, bytearray)) else str(x)
            try:
                # Try JSON parsing first for standard JSON arrays
                parsed = json.loads(s)
                if isinstance(parsed, list):
                    # Ensure all elements in the parsed list are integers
                    return [int(val) for val in parsed if isinstance(val, (int, str)) and str(val).isdigit()]
                return []
            except json.JSONDecodeError:
                # If JSON parsing fails, try to parse space-separated numbers
                cleaned_str = s.strip('[] \t\n\r') # More robust strip
                if cleaned_str:
                    try:
                        # Ensure numerical values before converting to int
                        return [int(val) for val in cleaned_str.split() if str(val).isdigit()]
                    except ValueError:
                        return [] # Fallback if direct parsing fails
                else:
                    return []
            except Exception as e:
                # Catch any other errors during parsing
                print(f"Error parsing induce_bug string '{s}': {e}")
                return []
        return [] # Fallback for non-string, non-list types

    df["induce_bug"] = df["induce_bug"].apply(parse_induce_bug)

    # File-level label
    if "file_label" not in df.columns:
        df["file_label"] = df["induce_bug"].apply(
            lambda x: 1 if len(x) > 0 else 0
        )

    return df


df = preprocess_df(df)
print("After preprocessing:", len(df))


# ================================
# 5. SELECT TOP-10 FILES WITH MOST BUGGY LINES
# ================================
df["bug_line_count"] = df["induce_bug"].apply(len)

top10_df = (
    df[df["bug_line_count"] > 0]
    .sort_values("bug_line_count", ascending=False)
    .head(10)
)

print("\nTop-10 buggy files:")
print(top10_df[["filepath", "bug_line_count"]])


# ================================
# 6. CONVERT TO MODEL-READY RECORDS
# ================================
records = [
    df_row_to_record(row)
    for _, row in top10_df.iterrows()
]

print("\nConverted records:", len(records))


# ================================
# 7. OPTIONAL: BATCH ITERATOR (FOR TRAINING)
# ================================
def batch_iterator(df, batch_size=16):
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        yield [
            df_row_to_record(row)
            for _, row in batch.iterrows()
        ]


# ================================
# 8. QUICK SANITY CHECK
# ================================
sample = records[0]

print("\n--- SAMPLE RECORD ---")
print("File:", sample["filepath"])
print("Buggy lines:", sample["induce_bug"])
print("Total lines:", len(sample["content"].splitlines()))
print("File label:", sample["file_label"])


Loaded dataset size: 500
After preprocessing: 499

Top-10 buggy files:
                       filepath  bug_line_count
179     src\black_primer\lib.py             261
68                     black.py             229
17   blib2to3\pgen2\tokenize.py             218
188                    black.py             208
111       src\black\__init__.py             190
121                    black.py             185
140                    black.py             146
200    src\black\concurrency.py             143
178     src\black_primer\cli.py             135
10                     black.py             132

Converted records: 10

--- SAMPLE RECORD ---
File: src\black_primer\lib.py
Buggy lines: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 

In [6]:
len(sample["content"].splitlines())  # ✅ correct

263

In [7]:
df_raw = pd.read_csv(DATASET_PATH)
print(df_raw["content"].isna().sum())


1


In [8]:
lines = sample["content"].splitlines()
induce_bug = sample["induce_bug"]
len(lines) > max(induce_bug) if induce_bug else True

True

In [9]:
print(df.columns.tolist())


['datetime', 'commit', 'repo', 'filepath', 'content', 'methods', 'induce_bug', 'file_label', 'bug_line_count']


In [10]:
#Turn the preprocessing in to reusable pipeline

def load_and_preprocess(csv_path):
    df = pd.read_csv(csv_path)
    print(f"Loaded {csv_path}: {len(df)} rows")

    df = preprocess_df(df)
    print(f"After preprocessing: {len(df)} rows")

    return df


In [11]:
#All splits using same pipeline

train_df = load_and_preprocess("/content/drive/MyDrive/black_only/train_black.csv")
val_df   = load_and_preprocess("/content/drive/MyDrive/black_only/val_black.csv")
test_df  = load_and_preprocess("/content/drive/MyDrive/black_only/test_black.csv")


Loaded /content/drive/MyDrive/black_only/train_black.csv: 500 rows
After preprocessing: 499 rows
Loaded /content/drive/MyDrive/black_only/val_black.csv: 29 rows
After preprocessing: 28 rows
Loaded /content/drive/MyDrive/black_only/test_black.csv: 27 rows
After preprocessing: 27 rows


In [12]:
#Covert data frames in to records

def df_to_records(df):
    return [df_row_to_record(row) for _, row in df.iterrows()]

train_records = df_to_records(train_df)
val_records   = df_to_records(val_df)
test_records  = df_to_records(test_df)


In [13]:
#Bug statics per splits

def print_stats(name, records):
    buggy = sum(len(r["induce_bug"]) > 0 for r in records)
    print(f"{name}: {len(records)} files | {buggy} buggy")

print_stats("Train", train_records)
print_stats("Val", val_records)
print_stats("Test", test_records)


Train: 499 files | 221 buggy
Val: 28 files | 4 buggy
Test: 27 files | 3 buggy


# Commit diff enrichment
This step shows how you could fetch commit diffs and highlight changed lines. It’s optional and may require auth for large-scale use.

In [14]:
import re
import requests

def fetch_github_diff(owner, repo, commit_hash):
    """
    Fetch unified diff for a public GitHub repo commit.
    Returns raw diff text or None on failure.
    """
    url = f"https://github.com/{owner}/{repo}/commit/{commit_hash}.diff"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            return r.text
    except Exception as e:
        return None
    return None

def parse_changed_lines_from_diff(diff_text, target_filepath):
    """
    Very light parser to extract changed line numbers (post-commit) for a single file.
    Parses hunk headers like @@ -a,b +c,d @@ and counts added lines in that hunk.
    Returns set of 0-based indices for lines in the post-commit file that were changed.
    """
    if diff_text is None:
        return set()
    changed = set()
    current_file = None
    for line in diff_text.splitlines():
        if line.startswith("diff --git"):
            current_file = None
        elif line.startswith("+++ b/"):
            current_file = line[len("+++ b/"):].strip()
        elif current_file == target_filepath and line.startswith("@@"):
            # Example: @@ -10,7 +10,9 @@
            m = re.search(r"\+(\d+),?(\d+)?", line)
            if not m:
                continue
            start = int(m.group(1))
            count = int(m.group(2)) if m.group(2) else 1
            # Track indices in the post-commit file; subtract 1 for 0-based
            for idx in range(start-1, start-1+count):
                changed.add(idx)
        # Note: A robust parser should also handle '+' lines inside hunks to mark added lines.
    return changed

# Example usage (public repo): owner='psf', repo='black', commit='00e7e12a3a412ea386806d5d4eeaed345e912940'
# diff = fetch_github_diff('psf', 'black', '00e7e12a3a412ea386806d5d4eeaed345e912940')
# changed = parse_changed_lines_from_diff(diff, 'src/black/linegen.py')
# print("Changed lines (post-commit, 0-based):", sorted(list(changed))[:20])

In [15]:
# Example usage: fetch diff and print changed lines

owner = "psf"
repo = "black"
commit_hash = "00e7e12a3a412ea386806d5d4eeaed345e912940"
target_filepath = "src/black/linegen.py"

diff = fetch_github_diff(owner, repo, commit_hash)

if diff is None:
    print("Failed to fetch diff")
else:
    changed = parse_changed_lines_from_diff(diff, target_filepath)
    print("Changed lines (post-commit, 0-based):")
    print(sorted(changed))


Changed lines (post-commit, 0-based):
[225, 226, 227, 228, 229, 230, 231, 232, 233]


# Line tokenization and CodeBERT embeddings
Keep whitespace: We never strip or trim lines. We split by  exactly to preserve  alignment.
Embedding strategy: Use CodeBERT’s [CLS] embedding per line. Then contextualize with BiLSTM across lines to capture neighbors.

In [16]:
# CodeBERT
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
codebert = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)
codebert.eval()

@torch.no_grad()
def embed_lines_with_codebert(lines, max_len=128):
    # Returns tensor [L, 768]; preserves line indices and whitespace alignment
    embs = []
    for line in lines:
        tokens = tokenizer.encode(line, truncation=True, max_length=max_len)
        input_ids = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
        outputs = codebert(input_ids)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # [1, 768]
        embs.append(cls_emb.squeeze(0))
    return torch.stack(embs, dim=0)

# BiLSTM contextualizer
class ContextEncoder(nn.Module):
    def __init__(self, in_dim=768, hidden=256, num_layers=1, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, hidden, num_layers=num_layers,
                            bidirectional=True, batch_first=True, dropout=dropout)
        self.proj = nn.Linear(hidden*2, hidden*2)
        self.norm = nn.LayerNorm(hidden*2)

    def forward(self, x):  # x: [B, L, 768]
        h, _ = self.lstm(x)
        y = self.norm(torch.tanh(self.proj(h)))  # [B, L, 512]
        return y

context_encoder = ContextEncoder().to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]



# Graph construction: AST + CFG-lite + PDG-lite → CPG
AST: Tree-sitter Python grammar to gather node spans. We link consecutive lines inside the same span.

CFG-lite: Simple sequential edges line i ↔ i+1. This keeps the graph sparse and Colab-friendly.

PDG-lite: Connect lines that share identifier names (excluding Python keywords), within a local window.

In [17]:

!pip install tree-sitter tree-sitter-languages
!pip install torch
from tree_sitter import Language, Parser
from tree_sitter_languages import get_language
import torch


# Build Python grammar

from tree_sitter import Parser

PY_LANGUAGE = get_language("python")

parser = Parser()
parser.set_language(PY_LANGUAGE)


def parse_ast_spans(code):
    tree = parser.parse(code.encode("utf8"))
    root = tree.root_node
    spans = []

    def visit(node):
        spans.append((node.type, node.start_point[0], node.end_point[0]))  # 0-based lines
        for child in node.children:
            visit(child)

    visit(root)
    return spans


def extract_identifiers_python(line):
    tokens = [t for t in ''.join([c if (c.isalnum() or c=='_') else ' ' for c in line]).split() if len(t) > 1]
    stop = set([
        "def","class","return","if","else","elif","for","while","import","from","as","with",
        "try","except","finally","True","False","None","and","or","not","in","is","lambda",
        "global","nonlocal","assert","yield","pass","break","continue","raise"
    ])
    return [t for t in tokens if t not in stop]

def build_cpg(lines, max_window=15):
    num_lines = len(lines)
    edge_index = []
    edge_type = []  # 0=sequential, 1=AST-span adjacency, 2=def-use PDG-lite

    # CFG-lite: sequential edges
    for i in range(num_lines-1):
        edge_index.append([i, i+1]); edge_type.append(0)
        edge_index.append([i+1, i]); edge_type.append(0)

    # AST-lite: adjacency inside span ranges
    code = "\n".join(lines)
    spans = parse_ast_spans(code)
    for tp, s, e in spans:
        s = max(0, min(s, num_lines-1))
        e = max(0, min(e, num_lines-1))
        for i in range(s, e):
            edge_index.append([i, i+1]); edge_type.append(1)
            edge_index.append([i+1, i]); edge_type.append(1)

    # PDG-lite: co-occurrence window edges for identifiers
    id_map = {}
    for i, line in enumerate(lines):
        ids = extract_identifiers_python(line)
        for tok in ids:
            id_map.setdefault(tok, []).append(i)
    for tok, idxs in id_map.items():
        idxs = sorted(set(idxs))
        for a in idxs:
            for b in idxs:
                if a != b and abs(a-b) <= max_window:
                    edge_index.append([a, b]); edge_type.append(2)

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous() if edge_index else torch.empty((2,0), dtype=torch.long)
    edge_type = torch.tensor(edge_type, dtype=torch.long) if edge_type else torch.empty((0,), dtype=torch.long)
    return edge_index, edge_type



# Dataset class
Function: Turn each record into a graph sample:

CodeBERT embeddings → BiLSTM contextualization → node features

Build CPG-lite edges

Prepare line labels from  (exact indices preserved)

File label for the file-level head

In [18]:
from torch.utils.data import Dataset, DataLoader

# -----------------------------
# Dataset wrapper (unchanged)
# -----------------------------
class LineGraphSample:
    def __init__(self, x, edge_index, edge_type, line_labels, file_label, mask_lines):
        self.x = x
        self.edge_index = edge_index
        self.edge_type = edge_type
        self.line_labels = line_labels
        self.file_label = file_label
        self.mask_lines = mask_lines


class DefectDataset(Dataset):
    def __init__(self, records, cache=True):
        self.records = records
        self.cache = cache
        self._cache = [None] * len(records)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        if self.cache and self._cache[idx] is not None:
            return self._cache[idx]

        rec = self.records[idx]
        lines = rec["content"].split("\n")   # keep original line indices
        L = len(lines)

        with torch.no_grad():
            line_embs = embed_lines_with_codebert(lines)   # [L, 768]

        ctx_inp = line_embs.unsqueeze(0)                   # [1, L, 768]
        ctx_out = context_encoder(ctx_inp).squeeze(0)      # [L, 512]

        edge_index, edge_type = build_cpg(lines)

        # ----- line-level labels (0-based) -----
        line_labels = torch.zeros(L, dtype=torch.float32)
        for ln in rec["induce_bug"]:
            if 0 <= ln < L:
                line_labels[ln] = 1.0

        file_label = torch.tensor(rec["file_label"], dtype=torch.long)
        mask_lines = torch.ones(L, dtype=torch.bool)

        sample = LineGraphSample(
            x=ctx_out,
            edge_index=edge_index,
            edge_type=edge_type,
            line_labels=line_labels,
            file_label=file_label,
            mask_lines=mask_lines
        )

        if self.cache:
            self._cache[idx] = sample

        return sample


In [19]:
# Split data and create DataLoaders
from torch.utils.data import DataLoader

def collate(samples):
    return samples  # one file per batch

train_ds = DefectDataset(train_records, cache=True)
val_ds   = DefectDataset(val_records,   cache=False)
test_ds  = DefectDataset(test_records,  cache=False)

train_loader = DataLoader(
    train_ds,
    batch_size=1,
    shuffle=True,
    collate_fn=collate
)

val_loader = DataLoader(
    val_ds,
    batch_size=1,
    shuffle=False,
    collate_fn=collate
)

test_loader = DataLoader(
    test_ds,
    batch_size=1,
    shuffle=False,
    collate_fn=collate
)

print("Train:", len(train_ds))
print("Val:  ", len(val_ds))
print("Test: ", len(test_ds))


Train: 499
Val:   28
Test:  27


# GAT fusion and dual-head model

Edge-type embeddings: Inject coarse edge-type info into GAT via an augmentation trick.

Residuals and normalization: Keep features stable and avoid over-smoothing.

Heads:

Line-level sigmoid for per-line probabilities
File-level attention pooling + MLP for buggy vs clean

In [20]:
from torch_geometric.nn import GATConv, GlobalAttention

class LineDefectModel(nn.Module):
    def __init__(self, in_dim=512, hidden=256, heads1=8, heads2=4, num_edge_types=3, dropout=0.2):
        super().__init__()
        self.edge_emb = nn.Embedding(num_edge_types, 16)
        self.gat1 = GATConv(in_dim + 16, hidden, heads=heads1, dropout=dropout, concat=True)
        self.gat2 = GATConv(hidden*heads1 + 16, hidden, heads=heads2, dropout=dropout, concat=True)
        out_dim = hidden*heads2
        self.norm = nn.LayerNorm(out_dim)
        self.line_head = nn.Linear(out_dim, 1)
        self.gate_nn = nn.Sequential(nn.Linear(out_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        self.global_att = GlobalAttention(gate_nn=self.gate_nn)
        self.file_head = nn.Sequential(nn.Linear(out_dim, 256), nn.ReLU(), nn.Dropout(dropout), nn.Linear(256, 2))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_type, mask_lines=None):
        if edge_index.numel() == 0:
            h = self.norm(x)
        else:
            e = self.edge_emb(edge_type)
            src, dst = edge_index
            # Aggregate edge-type embeddings to source nodes (approximate edge feature injection)
            aug1 = torch.zeros((x.size(0), e.size(1)), device=x.device)
            aug1.index_add_(0, src, e)
            x1 = torch.cat([x, aug1], dim=1)
            h1 = self.gat1(x1, edge_index)
            h1 = F.elu(h1)
            # Second layer with updated augmentation
            aug2 = torch.zeros((h1.size(0), e.size(1)), device=h1.device)
            aug2.index_add_(0, src, e)
            x2 = torch.cat([h1, aug2], dim=1)
            h2 = self.gat2(x2, edge_index)
            h = self.norm(h2)

        h = self.dropout(h)
        line_logits = self.line_head(h).squeeze(-1)  # [L]
        if mask_lines is None:
            mask_lines = torch.ones(h.size(0), dtype=torch.bool, device=h.device)
        file_repr = self.global_att(h[mask_lines])   # [D]
        file_logits = self.file_head(file_repr)      # [2]
        return line_logits, file_logits



# Losses, optimizer, and training loop

Focal loss: For line-level imbalance.

Multi-task loss: Emphasize line-level learning while keeping file-level useful.

Gradient clipping: Stability.

In [21]:
def focal_loss(prob, target, alpha=0.25, gamma=2.0, eps=1e-6):
    prob = torch.clamp(prob, eps, 1.0 - eps)
    ce_pos = -torch.log(prob)
    ce_neg = -torch.log(1.0 - prob)
    loss = alpha * (1 - prob)**gamma * target * ce_pos + (1 - alpha) * (prob**gamma) * (1 - target) * ce_neg
    return loss.mean()

lambda_line, lambda_file = 2.0, 1.0
model = LineDefectModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

def train_epoch(loader):
    model.train()
    total = 0.0
    for batch in loader:
        sample = batch[0]  # single sample per batch
        x = sample.x.to(device)
        edge_index = sample.edge_index.to(device)
        edge_type = sample.edge_type.to(device)
        line_labels = sample.line_labels.to(device)
        file_label = sample.file_label.to(device)
        mask_lines = sample.mask_lines.to(device)

        optimizer.zero_grad()
        line_logits, file_logits = model(x, edge_index, edge_type, mask_lines)
        line_prob = torch.sigmoid(line_logits)
        line_loss = focal_loss(line_prob, line_labels)

        # Original issue: file_label_one_hot was float, causing RuntimeError. The line below is correct.
        file_label = file_label.long()   # ensure correct dtype

        # file_logits MUST be [1, 2]
        file_loss = F.cross_entropy(
            file_logits,
            file_label.unsqueeze(0)
            )

        loss = lambda_line * line_loss + lambda_file * file_loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()
    return total / max(1, len(loader))

    print("file_logits shape:", file_logits.shape)
    print("file_label shape:", file_label.shape, file_label.dtype)




# Evaluation metrics

File-level: Accuracy, F1, ROC-AUC, PR-AUC

Line-level: PR-AUC (global), plus Top‑k and MRR for localization

In [None]:
def evaluate(loader):
    model.eval()
    file_probs, file_labels = [], []
    line_probs, line_labels = [], []

    with torch.no_grad():
        for batch in loader:
            sample = batch[0]
            x = sample.x.to(device)
            edge_index = sample.edge_index.to(device)
            edge_type = sample.edge_type.to(device)
            mask_lines = sample.mask_lines.to(device)

            line_logits, file_logits = model(x, edge_index, edge_type, mask_lines)
            fp = torch.softmax(file_logits, dim=-1)[1].item()
            file_probs.append(fp)
            file_labels.append(sample.file_label.item())

            lp = torch.sigmoid(line_logits).cpu().numpy().tolist()
            ll = sample.line_labels.cpu().numpy().tolist()
            line_probs.extend(lp)
            line_labels.extend(ll)

    # File-level
    file_probs_np = np.array(file_probs)
    file_labels_np = np.array(file_labels)
    file_preds_bin = (file_probs_np >= 0.5).astype(int)

    acc = accuracy_score(file_labels_np, file_preds_bin)
    f1 = f1_score(file_labels_np, file_preds_bin)
    try:
        roc = roc_auc_score(file_labels_np, file_probs_np)
    except Exception:
        roc = float('nan')
    pr = average_precision_score(file_labels_np, file_probs_np)

    # Line-level (global PR-AUC across all lines)
    line_probs_np = np.array(line_probs)
    line_labels_np = np.array(line_labels)
    l_pr = average_precision_score(line_labels_np, line_probs_np)

    return {
        "file": {"acc": acc, "f1": f1, "roc": roc, "pr": pr},
        "line": {"pr": l_pr}
    }

def topk_mrr(loader, k=5):
    model.eval()
    hits, rr_list = [], []
    with torch.no_grad():
        for batch in loader:
            sample = batch[0]
            x = sample.x.to(device)
            edge_index = sample.edge_index.to(device)
            edge_type = sample.edge_type.to(device)
            mask_lines = sample.mask_lines.to(device)

            line_logits, _ = model(x, edge_index, edge_type, mask_lines)
            probs = torch.sigmoid(line_logits).cpu().numpy()
            buggy_idx = np.where(sample.line_labels.cpu().numpy() > 0.5)[0]
            if len(buggy_idx) == 0:
                continue
            order = np.argsort(-probs)
            topk = order[:k]
            hit = int(any(b in topk for b in buggy_idx))
            hits.append(hit)
            ranks = [np.where(order == b)[0][0] + 1 for b in buggy_idx]
            rr = 1.0 / min(ranks)
            rr_list.append(rr)
    hit_rate = np.mean(hits) if hits else 0.0
    mrr = np.mean(rr_list) if rr_list else 0.0
    return hit_rate, mrr

#Train and Validate

epochs = 6 # Moved from inside topk_mrr function to global scope
for ep in range(1, epochs+1):
    tr_loss = train_epoch(train_loader)
    val_metrics = evaluate(val_loader)
    print(f"Epoch {ep:02d} | TrainLoss {tr_loss:.4f} | "
          f"Val File(acc {val_metrics['file']['acc']:.3f}, f1 {val_metrics['file']['f1']:.3f}, "
          f"roc {val_metrics['file']['roc']:.3f}, pr {val_metrics['file']['pr']:.3f}) | "
          f"Line (pr {val_metrics['line']['pr']:.3f})")

#Train and Validate

For real datasets, increase epochs (e.g., 20–40), and consider caching embeddings to disk for speed.

In [None]:
#Training loop
def train_epoch(loader):
    model.train()
    total = 0.0
    for batch in loader:
        # single-sample batches for simplicity
        sample = batch[0]
        x = sample.x.to(device)
        edge_index = sample.edge_index.to(device)
        edge_type = sample.edge_type.to(device)
        line_labels = sample.line_labels.to(device)
        file_label = sample.file_label.to(device)
        mask_lines = sample.mask_lines.to(device)

        optimizer.zero_grad()
        line_logits, file_logits = model(x, edge_index, edge_type, mask_lines)
        line_prob = torch.sigmoid(line_logits)
        line_loss = focal_loss(line_prob, line_labels)

        # file_logits is [2] for a single sample. F.cross_entropy expects input to be [N, C] and target to be [N] (class indices).
        # So, unsqueeze file_logits to [1, 2] and pass file_label (scalar Long) directly as target.
        file_label = file_label.long()   # ensure correct dtype
        # file_logits MUST be [1, 2]
        file_loss = F.cross_entropy(
            file_logits,
            file_label.unsqueeze(0)
            )


        loss = lambda_line * line_loss + lambda_file * file_loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()
    return total / len(loader)

def evaluate(loader):
    model.eval()
    file_probs, file_labels = [], []
    line_probs, line_labels = [], []

    with torch.no_grad():
        for batch in loader:
            sample = batch[0]
            x = sample.x.to(device)
            edge_index = sample.edge_index.to(device)
            edge_type = sample.edge_type.to(device)
            mask_lines = sample.mask_lines.to(device)

            line_logits, file_logits = model(x, edge_index, edge_type, mask_lines)
            fp = torch.softmax(file_logits, dim=-1)[1].item()
            file_probs.append(fp)
            file_labels.append(sample.file_label.item())

            lp = torch.sigmoid(line_logits).detach().cpu().numpy().tolist()
            ll = sample.line_labels.detach().cpu().numpy().tolist()
            line_probs.extend(lp)
            line_labels.extend(ll)

    # File-level metrics
    file_probs_np = np.array(file_probs)
    file_labels_np = np.array(file_labels)
    file_preds_bin = (file_probs_np >= 0.5).astype(int)

    acc = accuracy_score(file_labels_np, file_preds_bin)
    f1 = f1_score(file_labels_np, file_preds_bin)
    try:
        roc = roc_auc_score(file_labels_np, file_probs_np)
    except:
        roc = float('nan')
    pr = average_precision_score(file_labels_np, file_probs_np)

    # Line-level metrics (PR-AUC)
    line_probs_np = np.array(line_probs)
    line_labels_np = np.array(line_labels)
    l_pr = average_precision_score(line_labels_np, line_probs_np)

    return {
        "file": {"acc": acc, "f1": f1, "roc": roc, "pr": pr},
        "line": {"pr": l_pr}
    }

# Train
epochs = 6
for ep in range(1, epochs+1):
    tr_loss = train_epoch(train_loader)
    val_metrics = evaluate(val_loader)
    print(f"Epoch {ep:02d} | TrainLoss {tr_loss:.4f} | "
          f"Val File(acc {val_metrics['file']['acc']:.3f}, f1 {val_metrics['file']['f1']:.3f}, "
          f"roc {val_metrics['file']['roc']:.3f}, pr {val_metrics['file']['pr']:.3f}) | "
          f"Line (pr {val_metrics['line']['pr']:.3f})")

#Test Evaluationa and Localization metrics


In [None]:
test_metrics = evaluate(test_loader)
hit5, mrr = topk_mrr(test_loader, k=5)

print(f"Test File: Acc={test_metrics['file']['acc']:.3f} F1={test_metrics['file']['f1']:.3f} "
      f"ROC={test_metrics['file']['roc']:.3f} PR={test_metrics['file']['pr']:.3f}")
print(f"Test Line: PR-AUC={test_metrics['line']['pr']:.3f} Top-5 Hit={hit5:.3f} MRR={mrr:.3f}")

IndexError: index 1 is out of bounds for dimension 0 with size 1

#Visualization Top-K predicted buggy lines
Shows predicted probabilities and whether each top line is labeled in induce_bug. This aligns with the dataset’s content (post-commit fixed file), including whitespace and exact indexing.


In [None]:
def inspect_sample(dataset, idx=0, k=10):
    sample = dataset[idx]
    x = sample.x.to(device)
    edge_index = sample.edge_index.to(device)
    edge_type = sample.edge_type.to(device)
    mask_lines = sample.mask_lines.to(device)
    lines = dataset.records[idx]["content"].split("\n")
    with torch.no_grad():
        line_logits, file_logits = model(x, edge_index, edge_type, mask_lines)
        probs = torch.sigmoid(line_logits).cpu().numpy()
        file_prob = torch.softmax(file_logits, dim=-1)[1].item()
    order = np.argsort(-probs)
    print(f"File buggy prob: {file_prob:.3f} | True label: {sample.file_label.item()} | Lines={len(lines)}")
    print(f"Bug indices (induce_bug): {dataset.records[idx]['induce_bug']}")
    print("Top-k lines:")
    for r in order[:k]:
        flag = ("<BUG>" if sample.line_labels[r].item() > 0.5 else "")
        # Show line index and content without trimming; keep whitespace visible by repr
        print(f"{r:4d}  prob={probs[r]:.3f}  {flag}  | {lines[r]}")

# Inspect a few test samples
inspect_sample(test_ds, idx=0, k=10)

#Within-project and cross-project evaluation scaffolding
Evaluates generalization across repositories using your repo field.


In [None]:
def split_within_project(records, repo_name, train_ratio=0.7, val_ratio=0.15):
    proj_records = [r for r in records if r['repo'] == repo_name]
    random.shuffle(proj_records)
    n = len(proj_records)
    t = int(train_ratio * n); v = int((train_ratio+val_ratio) * n)
    return proj_records[:t], proj_records[t:v], proj_records[v:]

def leave_one_repo_out(records):
    repo_names = sorted(set(r['repo'] for r in records))
    results = []
    for holdout in repo_names:
        train_recs = [r for r in records if r['repo'] != holdout]
        test_recs  = [r for r in records if r['repo'] == holdout]
        if len(test_recs) < 5:
            continue
        train_ds = DefectDataset(train_recs, cache=True)
        test_ds  = DefectDataset(test_recs, cache=False)
        train_loader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=collate_identity)
        test_loader  = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=collate_identity)
        # Quick fine-tune (small epochs for demo)
        for ep in range(2):
            _ = train_epoch(train_loader)
        m = evaluate(test_loader)
        hit5, mrr = topk_mrr(test_loader, k=5)
        results.append((holdout, m, hit5, mrr))
        print(f"Repo {holdout}: File acc={m['file']['acc']:.3f}, F1={m['file']['f1']:.3f}, PR-AUC={m['file']['pr']:.3f} | "
              f"Line PR-AUC={m['line']['pr']:.3f}, Top-5={hit5:.3f}, MRR={mrr:.3f}")
    return results

# Example: run leave-one-repo-out (commented out for speed)
# loo_results = leave_one_repo_out(data)

#Performance measure graphs
You can plot metrics like Accuracy, F1, ROC-AUC, PR-AUC across epochs.


In [None]:
import matplotlib.pyplot as plt

# Example: track metrics during training
train_losses, val_accs, val_f1s, val_prs = [], [], [], []

for ep in range(epochs):
    tr_loss = train_epoch(train_loader)
    val_metrics = evaluate(val_loader)
    train_losses.append(tr_loss)
    val_accs.append(val_metrics['file']['acc'])
    val_f1s.append(val_metrics['file']['f1'])
    val_prs.append(val_metrics['file']['pr'])

# Plot curves
plt.figure(figsize=(10,6))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_accs, label="Val Accuracy")
plt.plot(val_f1s, label="Val F1")
plt.plot(val_prs, label="Val PR-AUC")
plt.xlabel("Epoch")
plt.ylabel("Metric")
plt.title("Training Performance over Epochs")
plt.legend()
plt.show()

#Confusion Metrix file level
This shows how well the model distinguishes buggy vs clean files

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

test_metrics = evaluate(test_loader)
file_probs, file_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        sample = batch[0]
        _, file_logits = model(sample.x.to(device),
                               sample.edge_index.to(device),
                               sample.edge_type.to(device),
                               sample.mask_lines.to(device))
        prob = torch.softmax(file_logits, dim=-1)[1].item()
        file_probs.append(prob)
        file_labels.append(sample.file_label.item())

preds = (np.array(file_probs) >= 0.5).astype(int)
cm = confusion_matrix(file_labels, preds)

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Clean","Buggy"],
            yticklabels=["Clean","Buggy"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (File-Level)")
plt.show()

#Attension Heatmap (Line Level)
Visualize which lines the model attends to most strongly.

In [None]:
def visualize_attention(sample_idx=0, dataset=test_ds, top_k=20):
    sample = dataset[sample_idx]
    x = sample.x.to(device)
    edge_index = sample.edge_index.to(device)
    edge_type = sample.edge_type.to(device)
    mask_lines = sample.mask_lines.to(device)
    lines = dataset.records[sample_idx]["content"].split("\n")

    with torch.no_grad():
        line_logits, _ = model(x, edge_index, edge_type, mask_lines)
        probs = torch.sigmoid(line_logits).cpu().numpy()

    # Plot heatmap of line probabilities
    plt.figure(figsize=(10,6))
    sns.heatmap([probs], cmap="Reds", cbar=True,
                xticklabels=[f"{i}" for i in range(len(lines))])
    plt.title("Attention Heatmap: Buggy Line Probabilities")
    plt.xlabel("Line Index")
    plt.ylabel("Buggy Probability")
    plt.show()

    # Print top-k lines
    order = np.argsort(-probs)
    print("Top-k lines with highest buggy probability:")
    for r in order[:top_k]:
        flag = "<BUG>" if sample.line_labels[r].item() > 0.5 else ""
        print(f"{r:3d} prob={probs[r]:.3f} {flag} | {lines[r]}")

• 	Performance graphs → show training dynamics.

• 	Confusion matrix → file-level classification clarity.

• 	Attention heatmap → line-level interpretability.

#Notes and practical tips
Alignment: Because  is the fixed post-commit file and  includes whitespace line indices, we never strip or normalize lines when splitting. This preserves index alignment.

Diff integration: For efficiency and developer trust, consider highlighting changed lines from the commit diff and prioritizing them in attention pooling or thresholding.

Scaling: Precompute CodeBERT embeddings and cache them to disk; limit graphs to changed methods ± context; sparsify PDG edges to top‑k.

Epochs: On real datasets, increase training to 20–40 epochs; add early stopping on validation PR-AUC.