In [2]:


import os, shutil
import pandas as pd
from clang.cindex import Config, Index, CursorKind


# Paths
CSV_IN      = "devign.csv"
CSV_OUT     = "devign_with_atoms.csv"
SNIPPET_DIR = "snippets_clang"


assert os.path.isfile(CSV_IN), f"Cannot find {CSV_IN}"
index = Index.create()
print("Setup OK")


Setup OK


In [3]:

df = pd.read_csv(CSV_IN)
df['id'] = df.index


shutil.rmtree(SNIPPET_DIR, ignore_errors=True)
os.makedirs(SNIPPET_DIR, exist_ok=True)

for i, row in df.iterrows():
    path = os.path.join(SNIPPET_DIR, f"snippet_{i}.c")
    with open(path, "w", encoding="utf-8") as f:
        f.write(row["func_cleaned"])
print(f"Wrote {len(df)} snippets → {SNIPPET_DIR}")


Wrote 14032 snippets → snippets_clang


In [4]:

def extract_atoms(path):
    tu = index.parse(path, args=['-std=c99'])
    calls, decls, params, casts, lits, guards = [], [], [], [], [], []

    def walk(node):
        # Calls
        if node.kind == CursorKind.CALL_EXPR and node.spelling:
            calls.append((node.spelling, node.location.line))
        # Local decls
        if node.kind == CursorKind.VAR_DECL and node.spelling:
            decls.append((node.spelling, node.location.line))
        # Params
        if node.kind == CursorKind.PARM_DECL and node.spelling:
            params.append((node.spelling, node.location.line))
        # C-style casts
        if node.kind == CursorKind.CSTYLE_CAST_EXPR:
            casts.append((node.type.spelling, node.location.line))
        # Literals
        if node.kind in (
            CursorKind.INTEGER_LITERAL,
            CursorKind.FLOATING_LITERAL,
            CursorKind.STRING_LITERAL
        ):
            code = "".join(tok.spelling for tok in node.get_tokens())
            lits.append((code, node.location.line))
        # If-guards
        if node.kind == CursorKind.IF_STMT:
            children = list(node.get_children())
            if children:
                cond = "".join(tok.spelling for tok in children[0].get_tokens())
                guards.append((cond.replace(" ", ""), node.location.line))
        for c in node.get_children():
            walk(c)
    walk(tu.cursor)

    atoms = []
    atoms += [f"Call(S,'{n}',{l})"      for n,l in calls]
    atoms += [f"Decl(S,'{n}',{l})"      for n,l in decls]
    atoms += [f"Param(S,'{n}',{l})"     for n,l in params]
    atoms += [f"Cast(S,'{t}',{l})"      for t,l in casts]
    atoms += [f"Literal(S,'{c}',{l})"   for c,l in lits]
    atoms += [f"Guard(S,'if-{c}',{l},{l+1})" for c,l in guards]
    return atoms

# Run across all snippets
rows = []
for i in df['id']:
    file = os.path.join(SNIPPET_DIR, f"snippet_{i}.c")
    for atom in extract_atoms(file):
        rows.append((i, atom))
print("Extracted", len(rows), "atoms")


Extracted 195889 atoms


In [5]:
# %% Cell 4: Build fol_logic & save
atoms_df = pd.DataFrame(rows, columns=['id','atom'])
df['fol_logic'] = (
    atoms_df.groupby('id')['atom']
            .apply(lambda atoms: ", ".join(atoms))
            .reindex(df['id'])
            .fillna("")
)
df.to_csv(CSV_OUT, index=False)
print("Saved augmented CSV to", CSV_OUT)


Saved augmented CSV to devign_with_atoms.csv


In [6]:
data= pd.read_csv('devign_with_atoms.csv')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14032 entries, 0 to 14031
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   func          14032 non-null  object
 1   func_cleaned  14032 non-null  object
 2   project       14032 non-null  object
 3   target        14032 non-null  bool  
 4   id            14032 non-null  int64 
 5   fol_logic     13736 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 562.0+ KB


++++++++++++++

In [18]:
import os
import re
import pandas as pd
from clang.cindex import Config, Index, CursorKind

# Set path to your libclang (update this path if necessary)
Config.set_library_file(r"C:\Program Files\LLVM\bin\libclang.dll")

# ─────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────
CSV_IN = "devign.csv"
CSV_OUT = "devign_with_pdg_fol.csv"
SNIPPET_DIR = "snippets_clang_pdg"

# Load dataset
df = pd.read_csv(CSV_IN)
df['id'] = df.index

# Create snippet files
os.makedirs(SNIPPET_DIR, exist_ok=True)
for i, row in df.iterrows():
    path = os.path.join(SNIPPET_DIR, f"snippet_{i}.c")
    with open(path, "w", encoding="utf-8") as f:
        f.write(row["func_cleaned"])

# Initialize Clang index
index = Index.create()

# Function to extract PDG-style atoms
def extract_pdg(path):
    tu = index.parse(path, args=['-std=c99'])
    data_deps = []
    ctrl_deps = []
    assigns = []
    current_if = None

    def walk(node, parent_if=None):
        nonlocal current_if

        # Assignment expression
        if node.kind == CursorKind.BINARY_OPERATOR:
            tokens = list(node.get_tokens())
            expr = " ".join(t.spelling for t in tokens)
            m = re.match(r"(\w+)\s*=\s*(\w+)", expr)
            if m:
                lhs, rhs = m.group(1), m.group(2)
                data_deps.append((rhs, lhs, node.location.line))
                assigns.append((lhs, rhs, node.location.line))

        # If-statement for control dependency
        if node.kind == CursorKind.IF_STMT:
            cond_node = list(node.get_children())[0]
            cond_tokens = list(cond_node.get_tokens())
            cond = "".join(tok.spelling for tok in cond_tokens)
            current_if = (cond, node.location.line)

        # Control dependencies for child tokens
        if parent_if:
            for tok in node.get_tokens():
                ctrl_deps.append((parent_if[0], parent_if[1], node.location.line))

        for child in node.get_children():
            walk(child, current_if if node.kind == CursorKind.IF_STMT else parent_if)

    walk(tu.cursor)

    atoms = []
    atoms += [f"DataDep(S,'{src}','{dst}',{line})" for src, dst, line in data_deps]
    atoms += [f"Assign(S,'{lhs}','{rhs}',{line})" for lhs, rhs, line in assigns]
    atoms += [f"CtrlDep(S,'if-{cond}',{src},{dst})" for cond, src, dst in ctrl_deps]
    return atoms

# Extract atoms for each snippet
rows = []
for i in df['id']:
    file = os.path.join(SNIPPET_DIR, f"snippet_{i}.c")
    for atom in extract_pdg(file):
        rows.append((i, atom))

# Merge atoms into DataFrame
atoms_df = pd.DataFrame(rows, columns=['id', 'atom'])
df['fol_logic'] = (
    atoms_df.groupby('id')['atom']
            .apply(lambda atoms: ", ".join(atoms))
            .reindex(df['id'])
            .fillna("")
)

# Save updated CSV
df.to_csv(CSV_OUT, index=False)

import ace_tools as tools; tools.display_dataframe_to_user(name="PDG-Enhanced Dataset", dataframe=df)


Exception: library file must be set before before using any other functionalities in libclang.

In [None]:
data.columns

CODEPTM

CODEBERT

FOL_PDF

In [17]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix
)

# ───────────────────────────────────────────────────────────
# 1) LOAD & PREPARE DATA
# ───────────────────────────────────────────────────────────
df = pd.read_csv("devign_with_atoms.csv")        # must have 'fol_logic','func_cleaned','target'
df['target'] = df['target'].astype(int)           # ensure labels are ints
SEP = " // LOGIC: "
df['model_input'] = df['fol_logic'].fillna('') + SEP + df['func_cleaned']

# build HF Dataset and split 80/20
hf_ds = Dataset.from_pandas(
    df[['model_input','target']].rename(columns={'model_input':'text','target':'label'})
)
splits = hf_ds.train_test_split(test_size=0.2, seed=42)

# ───────────────────────────────────────────────────────────
# 2) TOKENIZATION
# ───────────────────────────────────────────────────────────
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
def tokenize_fn(batch):
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )
tokenized = splits.map(tokenize_fn, batched=True)
tokenized = tokenized.rename_column('label','labels')
tokenized.set_format('torch', columns=['input_ids','attention_mask','labels'])

# ───────────────────────────────────────────────────────────
# 3) LOAD MODEL & CONFIGURE LOSS
# ───────────────────────────────────────────────────────────
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/codebert-base',
    num_labels=2
)
model.config.problem_type = "single_label_classification"
model.config.id2label      = {0:"safe",1:"vuln"}
model.config.label2id      = {"safe":0,"vuln":1}

# ───────────────────────────────────────────────────────────
# 4) METRICS
# ───────────────────────────────────────────────────────────
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    return {
        'accuracy':    accuracy_score(labels, preds),
        'precision':   precision_score(labels, preds),
        'recall':      recall_score(labels, preds),            # sensitivity
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
        'f1':          f1_score(labels, preds)
    }

# ───────────────────────────────────────────────────────────
# 5) SET UP TRAINER
# ───────────────────────────────────────────────────────────
data_collator = DataCollatorWithPadding(tokenizer)
args = TrainingArguments(
    output_dir='./codebert_fol',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# ───────────────────────────────────────────────────────────
# 6) TRAIN & EVALUATE
# ───────────────────────────────────────────────────────────
trainer.train()
metrics = trainer.evaluate()
print("\n=== Hold‐out Metrics ===")
for key, value in metrics.items():
    if key.startswith("eval_"):
        print(f"{key[5:]:<12}: {value:.4f}")

# ───────────────────────────────────────────────────────────
# 7) SAVE MODEL & TOKENIZER
# ───────────────────────────────────────────────────────────
save_dir = './codebert_fol_saved'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to {save_dir}")

# ───────────────────────────────────────────────────────────
# 8) LOAD & TEST ON NEW DATA
# ───────────────────────────────────────────────────────────
'''
new_df = pd.read_csv('new_data.csv')
new_df['model_input'] = new_df['fol_logic'].fillna('') + SEP + new_df['func_cleaned']
new_ds = Dataset.from_pandas(new_df[['model_input']].rename(columns={'model_input':'text'}))

# Reload saved artifacts
saved_tokenizer = RobertaTokenizer.from_pretrained(save_dir)
saved_model     = AutoModelForSequenceClassification.from_pretrained(save_dir)
saved_model.eval()

# Tokenize new data
def tokenize_new(batch):
    return saved_tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )
new_tok = new_ds.map(tokenize_new, batched=True)
new_tok.set_format('torch', columns=['input_ids','attention_mask'])

# Inference DataLoader
new_loader = torch.utils.data.DataLoader(
    new_tok, batch_size=16, collate_fn=data_collator
)

# Predict
preds = []
for batch in new_loader:
    with torch.no_grad():
        out = saved_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask']
        )
    preds.extend(torch.argmax(out.logits, axis=-1).cpu().tolist())

new_df['prediction'] = preds
print("\nSample predictions on new data:")
print(new_df[['func_cleaned','prediction']].head())
'''

Map: 100%|██████████| 11225/11225 [00:45<00:00, 244.75 examples/s]
Map: 100%|██████████| 2807/2807 [00:21<00:00, 132.74 examples/s]


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434