big_vultest

In [10]:
import pandas as pd
# Reload the FOL-augmented Big-Vul test split and print its schema as a sanity check.
fol_csv_path = 'big_vultest_fol.csv'
data = pd.read_csv(fol_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  1170 non-null   object
 1   input        1170 non-null   object
 2   output       1170 non-null   int64 
 3   fol_logic    1170 non-null   object
dtypes: int64(1), object(3)
memory usage: 36.7+ KB


In [4]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "big_vultest.csv"
code_column = "input"
out_dir = "joern_output_big"

# Resolve the launcher once with os.path.join: joern_bin already ends in "/",
# so gluing "/c2cpg.sh" onto it with an f-string yields a double-slash path.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(dataset)

# One directory per dataset row: sample_<row index>/code.c -> sample_<row index>/cpg.bin
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Explicit UTF-8 so the written source does not depend on the locale default.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # Export CPG using Joern via subprocess; check=True aborts the run on the
    # first sample that c2cpg fails on.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )


In [5]:
import zipfile
# Wrap every sample's cpg.bin in a sibling cpg.bin.zip archive.
# Relies on `df` from the CPG-export cell for the number of samples.
sample_total = len(df)
for idx in range(sample_total):
    cpg_file = f"joern_output_big/sample_{idx}/cpg.bin"
    archive_file = cpg_file + ".zip"
    with zipfile.ZipFile(archive_file, 'w') as archive:
        # Stored under the bare name so the archive has no directory prefix.
        archive.write(cpg_file, arcname="cpg.bin")


In [6]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big"

# Run joern-export (--repr all) for every sample directory that holds a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # The original author marked this removal as required; presumably
        # joern-export refuses to write into an existing directory (confirm).
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        # joern-export creates json_out itself, so it is not created here.
        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        try:
            result = subprocess.run(export_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # A failed sample is reported and skipped; the loop keeps going.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)
        else:
            print(result.stdout)


Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_0/json
exported 16 nodes, 10 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_1/json
exported 16 nodes, 10 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_10/json
exported 16 nodes, 10 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big/sample_100/cpg.bin to /mnt/c/Us

In [7]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Args:
        dot_path: DOT file parsed with pydot. It must yield exactly one graph;
            the single-element unpacking below fails otherwise.
        json_path: Destination file, opened in "w" mode and overwritten.

    Every node record holds the node name with surrounding double quotes
    stripped (key ``id``) merged with the attributes pydot reports. Every edge
    record holds ``src``/``dst`` endpoint names (quotes stripped) merged with
    the edge attributes. An attribute that shares one of those key names
    overrides it.
    """
    (dot_graph,) = pydot.graph_from_dot_file(dot_path)

    node_records = [
        {'id': n.get_name().strip('"'), **n.get_attributes()}
        for n in dot_graph.get_nodes()
    ]
    edge_records = [
        {
            'src': e.get_source().strip('"'),
            'dst': e.get_destination().strip('"'),
            **e.get_attributes(),
        }
        for e in dot_graph.get_edges()
    ]

    payload = {"nodes": node_records, "edges": edge_records}
    with open(json_path, "w") as out_file:
        json.dump(payload, out_file, indent=2)

In [8]:
import os

joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big"
n = 1170

# Turn each sample's json/export.dot into json/graph.json; samples without a
# DOT export are reported and skipped.
for i in range(n):
    export_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(export_dir, "export.dot")
    json_path = os.path.join(export_dir, "graph.json")
    if os.path.exists(dot_path):
        dot_to_json(dot_path, json_path)
    else:
        print(f"Missing: {dot_path}")

In [9]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Input dataset and the directory holding the per-sample Joern exports
dataset_path = "big_vultest.csv"
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_big"


def graph_to_fol(graph_path):
    """Render a graph.json file as newline-separated ``Node(...)``/``Edge(...)`` facts.

    Args:
        graph_path: Path to a JSON file with ``nodes`` and ``edges`` lists.

    Returns:
        One ``Node(id, label, "code")`` line per node, followed by one
        ``Edge(src, dst, "label")`` line per edge whose label is REACHING_DEF,
        DDG or CDG. Returns ``''`` when the file is missing or cannot be
        loaded (a load error is printed).
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as handle:
            graph = json.load(handle)
    except Exception as e:
        print(f"Error loading {graph_path}: {e}")
        return ''

    def _node_fact(node):
        # Double quotes are dropped from the label and turned into single
        # quotes in the code; real newlines become a literal backslash-n.
        ident = node['id']
        kind = str(node.get('label', '')).replace('"', '')
        snippet = str(node.get('CODE', ''))
        snippet = snippet.replace('"', "'").replace('\n', '\\n').strip()
        return f'Node({ident}, {kind}, "{snippet}")'

    facts = [_node_fact(node) for node in graph['nodes']]

    wanted = {'REACHING_DEF', 'DDG', 'CDG'}  # Add or remove as needed
    for edge in graph['edges']:
        kind = edge.get('label', '').replace('"', '')
        if kind not in wanted:
            continue
        facts.append(f'Edge({edge["src"]}, {edge["dst"]}, "{kind}")')

    return '\n'.join(facts)

# 1. Load your dataset
df = pd.read_csv(dataset_path)

# One FOL entry is needed per dataset row. Deriving the count from the frame
# (instead of a hard-coded 1170) keeps the column assignment below from
# failing with a length mismatch if the CSV ever changes size.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    # Empty strings are kept so the list stays aligned with the rows of df.
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# 2. Add new column to DataFrame
df['fol_logic'] = fol_logic_list

# 3. Save the new DataFrame with FOL logic
output_path = 'big_vultest_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")


Processing PDG graphs and generating FOL logic facts...


100%|██████████| 1170/1170 [00:14<00:00, 83.56it/s]



Saved FOL-augmented dataset to: big_vultest_fol.csv


diverse test

In [19]:
import pandas as pd
# Reload the FOL-augmented diverse test split and print its schema as a sanity check.
fol_csv_path = 'diverse_test_fol.csv'
data = pd.read_csv(fol_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532 entries, 0 to 1531
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  1532 non-null   object
 1   output     1532 non-null   int64 
 2   fol_logic  1532 non-null   object
dtypes: int64(1), object(2)
memory usage: 36.0+ KB


In [20]:
# Preview the first five rows of the frame loaded in the previous cell.
data.head(5)

Unnamed: 0,code_snip,output,fol_logic
0,static inline int get_v4l2_input32(struct v4l2...,1,"Node(25769803776, BLOCK, ""'<empty>'"")\nNode(25..."
1,"static bool sfb_classify(struct sk_buff *skb, ...",0,"Node(21474836480, BINDING, """")\nNode(257698037..."
2,auto ReferenceHandle::Get(Local<Value> key_han...,1,"Node(25769803776, BLOCK, ""'<empty>'"")\nNode(25..."
3,static void set_ns_and_type_ex(xmlNodePtr node...,0,"Node(21474836480, BINDING, """")\nNode(257698037..."
4,"static int skfp_ioctl(struct net_device *dev, ...",1,"Node(21474836480, BINDING, """")\nNode(257698037..."


In [12]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "diverse_test.csv"
code_column = "code_snip"
out_dir = "joern_output_diverse"

# Resolve the launcher once with os.path.join: joern_bin already ends in "/",
# so gluing "/c2cpg.sh" onto it with an f-string yields a double-slash path.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(dataset)

# One directory per dataset row: sample_<row index>/code.c -> sample_<row index>/cpg.bin
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Explicit UTF-8 so the written source does not depend on the locale default.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # Export CPG using Joern via subprocess; check=True aborts the run on the
    # first sample that c2cpg fails on.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )


In [13]:
import zipfile
# Wrap every sample's cpg.bin in a sibling cpg.bin.zip archive.
# Relies on `df` from the CPG-export cell for the number of samples.
sample_total = len(df)
for idx in range(sample_total):
    cpg_file = f"joern_output_diverse/sample_{idx}/cpg.bin"
    archive_file = cpg_file + ".zip"
    with zipfile.ZipFile(archive_file, 'w') as archive:
        # Stored under the bare name so the archive has no directory prefix.
        archive.write(cpg_file, arcname="cpg.bin")


In [14]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse"

# Run joern-export (--repr all) for every sample directory that holds a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # The original author marked this removal as required; presumably
        # joern-export refuses to write into an existing directory (confirm).
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        # joern-export creates json_out itself, so it is not created here.
        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        try:
            result = subprocess.run(export_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # A failed sample is reported and skipped; the loop keeps going.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)
        else:
            print(result.stdout)


Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_0/json
exported 16 nodes, 10 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_1/json
exported 71 nodes, 80 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_10/json
exported 38 nodes, 32 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output

In [15]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Args:
        dot_path: DOT file parsed with pydot. It must yield exactly one graph;
            the single-element unpacking below fails otherwise.
        json_path: Destination file, opened in "w" mode and overwritten.

    Every node record holds the node name with surrounding double quotes
    stripped (key ``id``) merged with the attributes pydot reports. Every edge
    record holds ``src``/``dst`` endpoint names (quotes stripped) merged with
    the edge attributes. An attribute that shares one of those key names
    overrides it.
    """
    (dot_graph,) = pydot.graph_from_dot_file(dot_path)

    node_records = [
        {'id': n.get_name().strip('"'), **n.get_attributes()}
        for n in dot_graph.get_nodes()
    ]
    edge_records = [
        {
            'src': e.get_source().strip('"'),
            'dst': e.get_destination().strip('"'),
            **e.get_attributes(),
        }
        for e in dot_graph.get_edges()
    ]

    payload = {"nodes": node_records, "edges": edge_records}
    with open(json_path, "w") as out_file:
        json.dump(payload, out_file, indent=2)

In [17]:
import os

joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse"
n = 1532

# Turn each sample's json/export.dot into json/graph.json; samples without a
# DOT export are reported and skipped.
for i in range(n):
    export_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(export_dir, "export.dot")
    json_path = os.path.join(export_dir, "graph.json")
    if os.path.exists(dot_path):
        # dot_to_json takes the output file path, not the directory.
        dot_to_json(dot_path, json_path)
    else:
        print(f"Missing: {dot_path}")


In [18]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Input dataset and the directory holding the per-sample Joern exports
dataset_path = "diverse_test.csv"
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_diverse"


def graph_to_fol(graph_path):
    """Render a graph.json file as newline-separated ``Node(...)``/``Edge(...)`` facts.

    Args:
        graph_path: Path to a JSON file with ``nodes`` and ``edges`` lists.

    Returns:
        One ``Node(id, label, "code")`` line per node, followed by one
        ``Edge(src, dst, "label")`` line per edge whose label is REACHING_DEF,
        DDG or CDG. Returns ``''`` when the file is missing or cannot be
        loaded (a load error is printed).
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as handle:
            graph = json.load(handle)
    except Exception as e:
        print(f"Error loading {graph_path}: {e}")
        return ''

    def _node_fact(node):
        # Double quotes are dropped from the label and turned into single
        # quotes in the code; real newlines become a literal backslash-n.
        ident = node['id']
        kind = str(node.get('label', '')).replace('"', '')
        snippet = str(node.get('CODE', ''))
        snippet = snippet.replace('"', "'").replace('\n', '\\n').strip()
        return f'Node({ident}, {kind}, "{snippet}")'

    facts = [_node_fact(node) for node in graph['nodes']]

    wanted = {'REACHING_DEF', 'DDG', 'CDG'}  # Add or remove as needed
    for edge in graph['edges']:
        kind = edge.get('label', '').replace('"', '')
        if kind not in wanted:
            continue
        facts.append(f'Edge({edge["src"]}, {edge["dst"]}, "{kind}")')

    return '\n'.join(facts)

# 1. Load your dataset
df = pd.read_csv(dataset_path)

# One FOL entry is needed per dataset row. Deriving the count from the frame
# (instead of a hard-coded 1532) keeps the column assignment below from
# failing with a length mismatch if the CSV ever changes size.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    # Empty strings are kept so the list stays aligned with the rows of df.
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# 2. Add new column to DataFrame
df['fol_logic'] = fol_logic_list

# 3. Save the new DataFrame with FOL logic
output_path = 'diverse_test_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")


Processing PDG graphs and generating FOL logic facts...


100%|██████████| 1532/1532 [00:06<00:00, 220.24it/s]



Saved FOL-augmented dataset to: diverse_test_fol.csv


djuliet

In [21]:
import pandas as pd
# Load the D-Juliet test split and print its schema before processing.
raw_csv_path = 'djuliet_test.csv'
data = pd.read_csv(raw_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   code_snip  3152 non-null   object
 1   output     3152 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 49.4+ KB


In [22]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "djuliet_test.csv"
code_column = "code_snip"
out_dir = "joern_output_djuliet"

# Resolve the launcher once with os.path.join: joern_bin already ends in "/",
# so gluing "/c2cpg.sh" onto it with an f-string yields a double-slash path.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(dataset)

# One directory per dataset row: sample_<row index>/code.c -> sample_<row index>/cpg.bin
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Explicit UTF-8 so the written source does not depend on the locale default.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # Export CPG using Joern via subprocess; check=True aborts the run on the
    # first sample that c2cpg fails on.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )


In [23]:
import zipfile
# Wrap every sample's cpg.bin in a sibling cpg.bin.zip archive.
# Relies on `df` from the CPG-export cell for the number of samples.
sample_total = len(df)
for idx in range(sample_total):
    cpg_file = f"joern_output_djuliet/sample_{idx}/cpg.bin"
    archive_file = cpg_file + ".zip"
    with zipfile.ZipFile(archive_file, 'w') as archive:
        # Stored under the bare name so the archive has no directory prefix.
        archive.write(cpg_file, arcname="cpg.bin")

In [24]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet"

# Run joern-export (--repr all) for every sample directory that holds a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # The original author marked this removal as required; presumably
        # joern-export refuses to write into an existing directory (confirm).
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        # joern-export creates json_out itself, so it is not created here.
        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        try:
            result = subprocess.run(export_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # A failed sample is reported and skipped; the loop keeps going.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)
        else:
            print(result.stdout)


Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_0/json
exported 82 nodes, 107 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_1/json
exported 58 nodes, 62 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_10/json
exported 84 nodes, 115 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_outp

In [25]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Args:
        dot_path: DOT file parsed with pydot. It must yield exactly one graph;
            the single-element unpacking below fails otherwise.
        json_path: Destination file, opened in "w" mode and overwritten.

    Every node record holds the node name with surrounding double quotes
    stripped (key ``id``) merged with the attributes pydot reports. Every edge
    record holds ``src``/``dst`` endpoint names (quotes stripped) merged with
    the edge attributes. An attribute that shares one of those key names
    overrides it.
    """
    (dot_graph,) = pydot.graph_from_dot_file(dot_path)

    node_records = [
        {'id': n.get_name().strip('"'), **n.get_attributes()}
        for n in dot_graph.get_nodes()
    ]
    edge_records = [
        {
            'src': e.get_source().strip('"'),
            'dst': e.get_destination().strip('"'),
            **e.get_attributes(),
        }
        for e in dot_graph.get_edges()
    ]

    payload = {"nodes": node_records, "edges": edge_records}
    with open(json_path, "w") as out_file:
        json.dump(payload, out_file, indent=2)

In [26]:
import os

joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet"
n = 3152

# Turn each sample's json/export.dot into json/graph.json; samples without a
# DOT export are reported and skipped.
for i in range(n):
    export_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(export_dir, "export.dot")
    json_path = os.path.join(export_dir, "graph.json")
    if os.path.exists(dot_path):
        dot_to_json(dot_path, json_path)
    else:
        print(f"Missing: {dot_path}")

In [27]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Input dataset and the directory holding the per-sample Joern exports
dataset_path = "djuliet_test.csv"
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_djuliet"


def graph_to_fol(graph_path):
    """Render a graph.json file as newline-separated ``Node(...)``/``Edge(...)`` facts.

    Args:
        graph_path: Path to a JSON file with ``nodes`` and ``edges`` lists.

    Returns:
        One ``Node(id, label, "code")`` line per node, followed by one
        ``Edge(src, dst, "label")`` line per edge whose label is REACHING_DEF,
        DDG or CDG. Returns ``''`` when the file is missing or cannot be
        loaded (a load error is printed).
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as handle:
            graph = json.load(handle)
    except Exception as e:
        print(f"Error loading {graph_path}: {e}")
        return ''

    def _node_fact(node):
        # Double quotes are dropped from the label and turned into single
        # quotes in the code; real newlines become a literal backslash-n.
        ident = node['id']
        kind = str(node.get('label', '')).replace('"', '')
        snippet = str(node.get('CODE', ''))
        snippet = snippet.replace('"', "'").replace('\n', '\\n').strip()
        return f'Node({ident}, {kind}, "{snippet}")'

    facts = [_node_fact(node) for node in graph['nodes']]

    wanted = {'REACHING_DEF', 'DDG', 'CDG'}  # Add or remove as needed
    for edge in graph['edges']:
        kind = edge.get('label', '').replace('"', '')
        if kind not in wanted:
            continue
        facts.append(f'Edge({edge["src"]}, {edge["dst"]}, "{kind}")')

    return '\n'.join(facts)


# 1. Load the dataset
df = pd.read_csv(dataset_path)

# One FOL entry is needed per dataset row. Deriving the count from the frame
# (instead of a hard-coded 3152) keeps the column assignment below from
# failing with a length mismatch if the CSV ever changes size.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    # Empty strings are kept so the list stays aligned with the rows of df.
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# 2. Add new column to DataFrame
df['fol_logic'] = fol_logic_list

# 3. Save the new DataFrame with FOL logic
output_path = 'djuliet_test_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")


Processing PDG graphs and generating FOL logic facts...


100%|██████████| 3152/3152 [00:12<00:00, 246.02it/s]



Saved FOL-augmented dataset to: djuliet_test_fol.csv


Reveal

In [28]:
import pandas as pd
# Load the Reveal test split and print its schema before processing.
raw_csv_path = 'Reveal_vultest.csv'
data = pd.read_csv(raw_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2028 entries, 0 to 2027
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2028 non-null   object
 1   input        2028 non-null   object
 2   output       2028 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.7+ KB


In [30]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "Reveal_vultest.csv"
code_column = "input"
out_dir = "joern_output_reveal"

# Resolve the launcher once with os.path.join: joern_bin already ends in "/",
# so gluing "/c2cpg.sh" onto it with an f-string yields a double-slash path.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(dataset)

# One directory per dataset row: sample_<row index>/code.c -> sample_<row index>/cpg.bin
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Explicit UTF-8 so the written source does not depend on the locale default.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # Export CPG using Joern via subprocess; check=True aborts the run on the
    # first sample that c2cpg fails on.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )


In [31]:
import zipfile
# Wrap every sample's cpg.bin in a sibling cpg.bin.zip archive.
# Relies on `df` from the CPG-export cell for the number of samples.
sample_total = len(df)
for idx in range(sample_total):
    cpg_file = f"joern_output_reveal/sample_{idx}/cpg.bin"
    archive_file = cpg_file + ".zip"
    with zipfile.ZipFile(archive_file, 'w') as archive:
        # Stored under the bare name so the archive has no directory prefix.
        archive.write(cpg_file, arcname="cpg.bin")

In [32]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal"

# Run joern-export (--repr all) for every sample directory that holds a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # The original author marked this removal as required; presumably
        # joern-export refuses to write into an existing directory (confirm).
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        # joern-export creates json_out itself, so it is not created here.
        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        try:
            result = subprocess.run(export_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # A failed sample is reported and skipped; the loop keeps going.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)
        else:
            print(result.stdout)


Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_0/json
exported 47 nodes, 51 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_1/json
exported 107 nodes, 151 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_10/json
exported 42 nodes, 38 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal

In [33]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Args:
        dot_path: DOT file parsed with pydot. It must yield exactly one graph;
            the single-element unpacking below fails otherwise.
        json_path: Destination file, opened in "w" mode and overwritten.

    Every node record holds the node name with surrounding double quotes
    stripped (key ``id``) merged with the attributes pydot reports. Every edge
    record holds ``src``/``dst`` endpoint names (quotes stripped) merged with
    the edge attributes. An attribute that shares one of those key names
    overrides it.
    """
    (dot_graph,) = pydot.graph_from_dot_file(dot_path)

    node_records = [
        {'id': n.get_name().strip('"'), **n.get_attributes()}
        for n in dot_graph.get_nodes()
    ]
    edge_records = [
        {
            'src': e.get_source().strip('"'),
            'dst': e.get_destination().strip('"'),
            **e.get_attributes(),
        }
        for e in dot_graph.get_edges()
    ]

    payload = {"nodes": node_records, "edges": edge_records}
    with open(json_path, "w") as out_file:
        json.dump(payload, out_file, indent=2)

In [34]:
import os

joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal"
n = 2028

# Turn each sample's json/export.dot into json/graph.json; samples without a
# DOT export are reported and skipped.
for i in range(n):
    export_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(export_dir, "export.dot")
    json_path = os.path.join(export_dir, "graph.json")
    if os.path.exists(dot_path):
        dot_to_json(dot_path, json_path)
    else:
        print(f"Missing: {dot_path}")

In [35]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Input dataset and the directory holding the per-sample Joern exports
dataset_path = "Reveal_vultest.csv"
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_reveal"


def graph_to_fol(graph_path):
    """Render a graph.json file as newline-separated ``Node(...)``/``Edge(...)`` facts.

    Args:
        graph_path: Path to a JSON file with ``nodes`` and ``edges`` lists.

    Returns:
        One ``Node(id, label, "code")`` line per node, followed by one
        ``Edge(src, dst, "label")`` line per edge whose label is REACHING_DEF,
        DDG or CDG. Returns ``''`` when the file is missing or cannot be
        loaded (a load error is printed).
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as handle:
            graph = json.load(handle)
    except Exception as e:
        print(f"Error loading {graph_path}: {e}")
        return ''

    def _node_fact(node):
        # Double quotes are dropped from the label and turned into single
        # quotes in the code; real newlines become a literal backslash-n.
        ident = node['id']
        kind = str(node.get('label', '')).replace('"', '')
        snippet = str(node.get('CODE', ''))
        snippet = snippet.replace('"', "'").replace('\n', '\\n').strip()
        return f'Node({ident}, {kind}, "{snippet}")'

    facts = [_node_fact(node) for node in graph['nodes']]

    wanted = {'REACHING_DEF', 'DDG', 'CDG'}  # Add or remove as needed
    for edge in graph['edges']:
        kind = edge.get('label', '').replace('"', '')
        if kind not in wanted:
            continue
        facts.append(f'Edge({edge["src"]}, {edge["dst"]}, "{kind}")')

    return '\n'.join(facts)


# 1. Load the dataset
df = pd.read_csv(dataset_path)

# One FOL entry is needed per dataset row. Deriving the count from the frame
# (instead of a hard-coded 2028) keeps the column assignment below from
# failing with a length mismatch if the CSV ever changes size.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    # Empty strings are kept so the list stays aligned with the rows of df.
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# 2. Add new column to DataFrame
df['fol_logic'] = fol_logic_list

# 3. Save the new DataFrame with FOL logic
output_path = 'reveal_test_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")


Processing PDG graphs and generating FOL logic facts...


100%|██████████| 2028/2028 [00:09<00:00, 222.22it/s]



Saved FOL-augmented dataset to: reveal_test_fol.csv


cvefixes

In [1]:
import pandas as pd

# Load the test_512.json split and print its schema before processing.
test_json_path = "test_512.json"
fun_test_df = pd.read_json(test_json_path)
fun_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4216 entries, 0 to 4215
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  4216 non-null   object
 1   input        4216 non-null   object
 2   output       4216 non-null   int64 
 3   idx          4216 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 131.9+ KB


In [36]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "test_512.json"
code_column = "input"
out_dir = "joern_output_fun"

# Resolve the launcher once with os.path.join: joern_bin already ends in "/",
# so gluing "/c2cpg.sh" onto it with an f-string yields a double-slash path.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

os.makedirs(out_dir, exist_ok=True)

df = pd.read_json(dataset)

# One directory per dataset row: sample_<row index>/code.c -> sample_<row index>/cpg.bin
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Explicit UTF-8 so the written source does not depend on the locale default.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # Export CPG using Joern via subprocess; check=True aborts the run on the
    # first sample that c2cpg fails on.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )

In [37]:
import zipfile
# Wrap every sample's cpg.bin in a sibling cpg.bin.zip archive.
# Relies on `df` from the CPG-export cell for the number of samples.
sample_total = len(df)
for idx in range(sample_total):
    cpg_file = f"joern_output_fun/sample_{idx}/cpg.bin"
    archive_file = cpg_file + ".zip"
    with zipfile.ZipFile(archive_file, 'w') as archive:
        # Stored under the bare name so the archive has no directory prefix.
        archive.write(cpg_file, arcname="cpg.bin")

In [38]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun"

# Run joern-export (--repr all) for every sample directory that holds a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # The original author marked this removal as required; presumably
        # joern-export refuses to write into an existing directory (confirm).
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        # joern-export creates json_out itself, so it is not created here.
        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        try:
            result = subprocess.run(export_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # A failed sample is reported and skipped; the loop keeps going.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)
        else:
            print(result.stdout)

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_0/json
exported 212 nodes, 369 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_1/json
exported 42 nodes, 46 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_10/json
exported 60 nodes, 77 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun/sample_100/cpg.bin to /mnt/c/

In [39]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a Graphviz DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Every node becomes ``{"id": <name>, **attributes}`` and every edge becomes
    ``{"src": <source>, "dst": <destination>, **attributes}``. Attribute values
    are stored exactly as pydot returns them; only node names and edge
    endpoints have surrounding double quotes stripped. If the DOT file holds
    several graphs, their nodes and edges are concatenated in file order.

    Args:
        dot_path: Path of the DOT file to read.
        json_path: Path of the JSON file to write (overwritten if present).

    Raises:
        ValueError: If pydot returns no graph for ``dot_path`` (parse failure
            or an empty file).
    """
    graphs = pydot.graph_from_dot_file(dot_path)
    # pydot signals a parse failure by returning None rather than raising, so
    # turn that into an error that names the offending file.
    if not graphs:
        raise ValueError(f"Could not parse any graph from DOT file: {dot_path}")

    nodes = []
    edges = []
    for graph in graphs:
        for node in graph.get_nodes():
            nodes.append({'id': node.get_name().strip('"'), **node.get_attributes()})
        for edge in graph.get_edges():
            edges.append({
                'src': edge.get_source().strip('"'),
                'dst': edge.get_destination().strip('"'),
                **edge.get_attributes(),
            })

    with open(json_path, "w") as f:
        json.dump({"nodes": nodes, "edges": edges}, f, indent=2)

In [40]:
import os

# Convert each sample's Joern DOT export (json/export.dot) into json/graph.json.
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun"
n = 4216  # number of samples; directories are named sample_0 .. sample_{n-1}

failed = []  # (sample index, error) for DOT files that could not be converted
for i in range(n):
    sample_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(sample_dir, "export.dot")
    json_path = os.path.join(sample_dir, "graph.json")
    if not os.path.exists(dot_path):
        print(f"Missing: {dot_path}")
        continue
    try:
        dot_to_json(dot_path, json_path)
    except Exception as e:
        # One unparseable DOT file must not abort the whole batch: report the
        # sample and move on, as the export step does for failed exports.
        failed.append((i, e))
        print(f"[Error] Failed to convert {dot_path}: {e}")

if failed:
    print(f"{len(failed)} of {n} DOT files could not be converted")

In [42]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Paths
dataset_path = 'test_512.json'
joern_outputs = '/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_fun'

# Function to convert graph.json to FOL logic string
def graph_to_fol(graph_path):
    """Render a graph.json file as newline-separated FOL facts.

    One ``Node(id, label, "code")`` fact is produced per node, followed by one
    ``Edge(src, dst, "label")`` fact per edge whose label is REACHING_DEF, DDG
    or CDG. Double quotes are dropped from labels; inside CODE they become
    single quotes, newlines become a literal ``\\n`` and the result is stripped.

    Returns an empty string when the file does not exist or cannot be loaded
    as JSON (the load error is printed).
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as fh:
            payload = json.load(fh)
    except Exception as err:
        print(f"Error loading {graph_path}: {err}")
        return ''

    def node_fact(entry):
        # Build the Node(...) fact for a single node record.
        ident = entry['id']
        kind = str(entry.get('label', '')).replace('"', '')
        snippet = str(entry.get('CODE', ''))
        snippet = snippet.replace('"', "'").replace('\n', '\\n').strip()
        return f'Node({ident}, {kind}, "{snippet}")'

    facts = [node_fact(entry) for entry in payload['nodes']]

    wanted = {'REACHING_DEF', 'DDG', 'CDG'}  # edge labels that are kept
    for link in payload['edges']:
        kind = link.get('label', '').replace('"', '')
        if kind not in wanted:
            continue
        origin = link['src']
        target = link['dst']
        facts.append(f'Edge({origin}, {target}, "{kind}")')

    return '\n'.join(facts)


df = pd.read_json(dataset_path)

# Take the sample count from the dataset itself so the new column always has
# exactly one entry per row; a hard-coded count that drifts from len(df) would
# only fail at the column assignment, after the whole loop has run.
# NOTE(review): assumes sample_{i} was generated from row i of this dataset;
# verify against the cell that produced the Joern output.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# Attach the facts as a new column (empty string where no graph was available).
df['fol_logic'] = fol_logic_list

# Save the FOL-augmented dataset.
output_path = 'CVEFixes_test_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")

Processing PDG graphs and generating FOL logic facts...


100%|██████████| 4216/4216 [00:52<00:00, 80.95it/s] 



Saved FOL-augmented dataset to: CVEFixes_test_fol.csv


mixvul

In [44]:
import pandas as pd

# Load mix_test_vultest.csv and print its column / dtype / non-null summary.
mix_test_df = pd.read_csv("mix_test_vultest.csv")
mix_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  2864 non-null   object
 1   input        2864 non-null   object
 2   output       2864 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.2+ KB


In [45]:
import os
import pandas as pd
import subprocess

# Paths
joern_bin = "/home/shaon/bin/joern/joern-cli/"
dataset = "mix_test_vultest.csv"
code_column = "input"
out_dir = "joern_output_mix"

os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(dataset)

# Build the frontend path with os.path.join so a trailing slash on joern_bin
# does not produce a doubled separator.
c2cpg = os.path.join(joern_bin, "c2cpg.sh")

# For every row: write the code snippet to sample_<idx>/code.c and let Joern's
# C frontend build sample_<idx>/cpg.bin from it.
for idx, row in df.iterrows():
    code = row[code_column]
    sample_dir = os.path.join(out_dir, f"sample_{idx}")
    os.makedirs(sample_dir, exist_ok=True)

    src_path = os.path.join(sample_dir, "code.c")
    # Write explicit UTF-8: open() otherwise uses the locale's default
    # encoding, which can fail on, or re-encode, non-ASCII characters.
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(code)

    # check=True: a failing c2cpg run raises CalledProcessError and stops the loop.
    subprocess.run(
        [c2cpg, src_path, "--output", os.path.join(sample_dir, "cpg.bin")],
        check=True,
    )

In [46]:
import zipfile
# Store each sample's cpg.bin in its own archive, cpg.bin.zip, next to the
# original file; the archive member is always named "cpg.bin".
for idx in range(len(df)):
    sample_path = f"joern_output_mix/sample_{idx}/cpg.bin"
    zip_path = sample_path + ".zip"
    zipf = zipfile.ZipFile(zip_path, mode='w')
    with zipf:
        zipf.write(sample_path, arcname="cpg.bin")

In [47]:
import os
import shutil
import subprocess

joern_export = "/home/shaon/bin/joern/joern-cli/joern-export"
out_dir = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix"

# Run joern-export for every sample directory that contains a cpg.bin.
for sample in os.listdir(out_dir):
    sample_dir = os.path.join(out_dir, sample)
    cpg_bin = os.path.join(sample_dir, "cpg.bin")
    json_out = os.path.join(sample_dir, "json")

    if os.path.isfile(cpg_bin):
        # Any earlier export is removed first; the folder is not recreated
        # here because joern-export is expected to create it itself.
        if os.path.exists(json_out):
            print(f"Deleting old output: {json_out}")
            shutil.rmtree(json_out)

        print(f"Exporting {cpg_bin} to {json_out}")
        export_cmd = [joern_export, "--repr", "all", "--out", json_out, cpg_bin]
        result = subprocess.run(export_cmd, capture_output=True, text=True)

        if result.returncode == 0:
            print(result.stdout)
        else:
            # Non-zero exit: report the captured output and continue with the
            # next sample.
            print(f"[Error] Failed to export {sample}")
            print("stdout:", result.stdout)
            print("stderr:", result.stderr)

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_0/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_0/json
exported 34 nodes, 32 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_0/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_1/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_1/json
exported 64 nodes, 87 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_1/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_10/cpg.bin to /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_10/json
exported 140 nodes, 232 edges into /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_10/json

Exporting /mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix/sample_100/cpg.bin to /mnt/c/

In [48]:
import os
import pydot
import json

def dot_to_json(dot_path, json_path):
    """Convert a Graphviz DOT file into a ``{"nodes": [...], "edges": [...]}`` JSON file.

    Every node becomes ``{"id": <name>, **attributes}`` and every edge becomes
    ``{"src": <source>, "dst": <destination>, **attributes}``. Attribute values
    are stored exactly as pydot returns them; only node names and edge
    endpoints have surrounding double quotes stripped. If the DOT file holds
    several graphs, their nodes and edges are concatenated in file order.

    Args:
        dot_path: Path of the DOT file to read.
        json_path: Path of the JSON file to write (overwritten if present).

    Raises:
        ValueError: If pydot returns no graph for ``dot_path`` (parse failure
            or an empty file).
    """
    graphs = pydot.graph_from_dot_file(dot_path)
    # pydot signals a parse failure by returning None rather than raising, so
    # turn that into an error that names the offending file.
    if not graphs:
        raise ValueError(f"Could not parse any graph from DOT file: {dot_path}")

    nodes = []
    edges = []
    for graph in graphs:
        for node in graph.get_nodes():
            nodes.append({'id': node.get_name().strip('"'), **node.get_attributes()})
        for edge in graph.get_edges():
            edges.append({
                'src': edge.get_source().strip('"'),
                'dst': edge.get_destination().strip('"'),
                **edge.get_attributes(),
            })

    with open(json_path, "w") as f:
        json.dump({"nodes": nodes, "edges": edges}, f, indent=2)

In [49]:
import os

# Convert each sample's Joern DOT export (json/export.dot) into json/graph.json.
joern_outputs = "/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix"
n = 2864  # number of samples; directories are named sample_0 .. sample_{n-1}

failed = []  # (sample index, error) for DOT files that could not be converted
for i in range(n):
    sample_dir = os.path.join(joern_outputs, f"sample_{i}", "json")
    dot_path = os.path.join(sample_dir, "export.dot")
    json_path = os.path.join(sample_dir, "graph.json")
    if not os.path.exists(dot_path):
        print(f"Missing: {dot_path}")
        continue
    try:
        dot_to_json(dot_path, json_path)
    except Exception as e:
        # One unparseable DOT file must not abort the whole batch: report the
        # sample and move on, as the export step does for failed exports.
        failed.append((i, e))
        print(f"[Error] Failed to convert {dot_path}: {e}")

if failed:
    print(f"{len(failed)} of {n} DOT files could not be converted")

In [50]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Paths
dataset_path = 'mix_test_vultest.csv'
joern_outputs = '/mnt/c/Users/user01/PycharmProjects/PythonProject2/joern_output_mix'

# Function to convert graph.json to FOL logic string
def graph_to_fol(graph_path):
    """Turn a graph.json file into a block of FOL facts, one fact per line.

    Nodes come first as ``Node(id, label, "code")``; edges follow as
    ``Edge(src, dst, "label")`` but only for the labels REACHING_DEF, DDG and
    CDG. Labels lose their double quotes; CODE has double quotes swapped for
    single quotes, newlines written as a literal ``\\n`` and outer whitespace
    stripped.

    A missing file, or one that fails to load as JSON (the error is printed),
    yields an empty string.
    """
    if not os.path.exists(graph_path):
        return ''
    try:
        with open(graph_path) as handle:
            data = json.load(handle)
    except Exception as exc:
        print(f"Error loading {graph_path}: {exc}")
        return ''

    lines = []

    # Node facts, in file order.
    for record in data['nodes']:
        ident = record['id']
        tag = str(record.get('label', '')).replace('"', '')
        text = str(record.get('CODE', ''))
        text = text.replace('"', "'")
        text = text.replace('\n', '\\n')
        text = text.strip()
        lines.append(f'Node({ident}, {tag}, "{text}")')

    # Edge facts, restricted to the program-dependence labels below.
    keep = {'REACHING_DEF', 'DDG', 'CDG'}
    for record in data['edges']:
        tag = record.get('label', '').replace('"', '')
        if tag in keep:
            head = record['src']
            tail = record['dst']
            lines.append(f'Edge({head}, {tail}, "{tag}")')

    return '\n'.join(lines)


df = pd.read_csv(dataset_path)

# Take the sample count from the dataset itself so the new column always has
# exactly one entry per row; a hard-coded count that drifts from len(df) would
# only fail at the column assignment, after the whole loop has run.
# NOTE(review): assumes sample_{i} was generated from row i of this dataset;
# verify against the cell that produced the Joern output.
n = len(df)

fol_logic_list = []
print("Processing PDG graphs and generating FOL logic facts...")

for i in tqdm(range(n)):
    graph_path = os.path.join(joern_outputs, f'sample_{i}', 'json', 'graph.json')
    fol_logic = graph_to_fol(graph_path)
    fol_logic_list.append(fol_logic)
    if not fol_logic:
        print(f"[WARNING] Empty or missing FOL for sample {i} at {graph_path}")

# Attach the facts as a new column (empty string where no graph was available).
df['fol_logic'] = fol_logic_list

# Save the FOL-augmented dataset.
output_path = 'mixvul_test_fol.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved FOL-augmented dataset to: {output_path}")

Processing PDG graphs and generating FOL logic facts...


100%|██████████| 2864/2864 [00:20<00:00, 138.88it/s]



Saved FOL-augmented dataset to: mixvul_test_fol.csv
