In [9]:
import pefile
import sys

def file_offset_to_virtual_address(pe_file_path, file_offset):
    """Translate a raw file offset into an absolute virtual address.

    The section whose raw-data range contains ``file_offset`` is located and
    the result is ``ImageBase + section.VirtualAddress + offset within that
    section``. As a sanity check, pefile's own ``get_rva_from_offset`` result
    is printed next to the computed address.

    Args:
        pe_file_path: Path of the PE file to inspect.
        file_offset: Offset in bytes from the start of the file.

    Returns:
        The virtual address (image base included), or ``None`` when no
        section's raw data contains the offset or the file cannot be
        processed (the error is printed, not raised).
    """
    pe = None
    try:
        # Load the PE file
        pe = pefile.PE(pe_file_path)
        base_address = pe.OPTIONAL_HEADER.ImageBase

        # Iterate through sections to find the one containing the file offset
        for section in pe.sections:
            raw_data_offset = section.PointerToRawData
            raw_data_end = raw_data_offset + section.SizeOfRawData

            # Skip sections whose raw data range does not cover the offset
            if not raw_data_offset <= file_offset < raw_data_end:
                continue

            offset_within_section = file_offset - raw_data_offset
            final_address = base_address + section.VirtualAddress + offset_within_section
            print(f"Let's compare! PEFile says {hex(pe.get_rva_from_offset(file_offset))}... I say {hex(final_address)}")
            return final_address

        # If no section contains the file offset, return None
        return None

    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    finally:
        # Release the file handle / mapping held by pefile so the executable
        # is not kept open after the lookup.
        if pe is not None:
            pe.close()


def virtual_address_to_file_offset(pe_file_path, virtual_address):
    """Translate an absolute virtual address into a raw file offset.

    The image base is subtracted to obtain an RVA, the section whose virtual
    range contains that RVA is located, and the result is
    ``section.PointerToRawData + offset within that section``. As a sanity
    check, pefile's own ``get_offset_from_rva`` result is printed next to the
    computed offset.

    Args:
        pe_file_path: Path of the PE file to inspect.
        virtual_address: Address including the image base.

    Returns:
        The file offset, or ``None`` when the address lies in no section,
        lies in the part of a section that has no bytes in the file
        (beyond ``SizeOfRawData``), or the file cannot be processed (the
        error is printed, not raised).
    """
    pe = None
    try:
        # Load the PE file
        pe = pefile.PE(pe_file_path)

        # Calculate the relative virtual address (RVA)
        rva = virtual_address - pe.OPTIONAL_HEADER.ImageBase

        # Iterate through sections to find the one containing the RVA
        for section in pe.sections:
            virtual_address_start = section.VirtualAddress
            virtual_address_end = virtual_address_start + section.Misc_VirtualSize

            if not virtual_address_start <= rva < virtual_address_end:
                continue

            offset_within_section = rva - virtual_address_start

            # A section may be larger in memory than on disk; the extra tail
            # has no backing bytes, so mapping it would yield an offset that
            # belongs to whatever follows this section in the file.
            if offset_within_section >= section.SizeOfRawData:
                return None

            file_offset = section.PointerToRawData + offset_within_section
            print(f"Let's compare! PEFile says {pe.get_offset_from_rva(rva)}... I say {file_offset}")
            return file_offset

        # If no section contains the RVA, return None
        return None

    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    finally:
        # Release the file handle / mapping held by pefile so the executable
        # is not kept open after the lookup.
        if pe is not None:
            pe.close()




In [10]:
# Stripped benchmark binaries to analyse, keyed by a short label.
executables = {
    "anon_call2_vs_stripped": "../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call2_vs-stripped.exe",
    "anon_call3_vs_stripped": "../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call3_vs-stripped.exe",
    "virtual7_vs_stripped": "../sunbench25/benchmark/indirect_calls/virtual_tables/virtual7_vs-stripped.exe",
    "anon_jump1_vs_stripped": "../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs-stripped.exe",
    "anon_jump1_vs_2_stripped": "../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs_2-stripped.exe",
    "switch4_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch4_vs-stripped.exe",
    "switch5_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch5_vs-stripped.exe",
    "switch6_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch6_vs-stripped.exe",
}

# Executable path -> path of its companion "-cfr.json" file (same location,
# ".exe" swapped for "-cfr.json"). The labels are not needed here.
cfr_mapping = dict(
    (exe_file, exe_file.replace(".exe", "-cfr.json"))
    for exe_file in executables.values()
)


In [11]:
import json
import re
# For every benchmark, read its CFR JSON, pull the file offset out of the
# question text, and translate it to a virtual address.
for exepath, cfrpath in cfr_mapping.items():
    with open(cfrpath, 'r') as f:
        cfr = json.load(f)
    question = cfr['question']
    # The offset is the last single-quoted token in the question: the
    # lookahead rejects a match that is followed by another quote.
    match = re.search(r"'([^']*)'(?!.*')", question)
    if match is None:
        # Fail with a message naming the file instead of an AttributeError
        # on `None.group`.
        raise ValueError(f"Could not parse offset from question for {exepath!r}")
    offset = match.group(1)
    print(f"{exepath}: {offset}")
    file_offset_to_virtual_address(exepath, int(offset,16))

offset
Let's compare! PEFile says 0x11d1... I say 0x4011d1
offset
Let's compare! PEFile says 0x11f4... I say 0x4011f4
offset
Let's compare! PEFile says 0x72c0... I say 0x1400072c0
offset
Let's compare! PEFile says 0x103e... I say 0x40103e
offset
Let's compare! PEFile says 0x103e... I say 0x40103e
offset
Let's compare! PEFile says 0x7170... I say 0x140007170
offset
Let's compare! PEFile says 0x7168... I say 0x140007168
offset
Let's compare! PEFile says 0x7177... I say 0x140007177


In [1]:
import json
import re
import pefile
import os
from typing import List, Dict
from dataclasses import dataclass

@dataclass
class Program:
    """Everything recorded for one benchmark executable and its CFR question."""

    # Path of the executable.
    exe_path: str
    # Path of the CFR JSON file that holds the question and ground truth.
    cfr_path: str
    # Question text taken from the CFR JSON.
    question: str
    # Offset inside the file, parsed from the question.
    file_offset: int
    # Runtime RVA for that offset (for breakpoints).
    rva: int
    # File offsets listed as ground truth in the CFR JSON.
    ground_truth: List[int]

# Flat mapping: short benchmark label -> path of the stripped executable.
executables: Dict[str, str] = dict(
    anon_call2_vs_stripped="../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call2_vs-stripped.exe",
    anon_call3_vs_stripped="../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call3_vs-stripped.exe",
    virtual7_vs_stripped="../sunbench25/benchmark/indirect_calls/virtual_tables/virtual7_vs-stripped.exe",
    anon_jump1_vs_stripped="../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs-stripped.exe",
    anon_jump1_vs_2_stripped="../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs_2-stripped.exe",
    switch4_vs_stripped="../sunbench25/benchmark/indirect_jumps/switch_statements/switch4_vs-stripped.exe",
    switch5_vs_stripped="../sunbench25/benchmark/indirect_jumps/switch_statements/switch5_vs-stripped.exe",
    switch6_vs_stripped="../sunbench25/benchmark/indirect_jumps/switch_statements/switch6_vs-stripped.exe",
)

# Per-benchmark metadata, keyed by the same labels as `executables`.
programs: Dict[str, Program] = {}

for name, exe_path in executables.items():
    # The CFR JSON sits next to the executable: "<stem>.exe" -> "<stem>-cfr.json".
    # Only the extension is swapped, so ".exe" elsewhere in the path is left alone.
    cfr_path = os.path.splitext(exe_path)[0] + "-cfr.json"

    # load the question, extract the quoted offset, and ground truth
    with open(cfr_path, 'r') as f:
        cfr = json.load(f)

    question = cfr.get('question', '')
    # The offset is the last single-quoted token in the question text.
    m = re.search(r"'([^']*)'(?!.*')", question)
    if not m:
        raise ValueError(f"Could not parse offset from question for {name!r}")
    file_offset = int(m.group(1), 16)

    # parse ground truth offsets (hex strings in the JSON)
    gt_strs = cfr.get('groundtruth', [])
    ground_truth = [int(off, 16) for off in gt_strs]

    # compute the RVA from file offset; close the PE afterwards so the
    # executable is not left open once its RVA has been read
    pe = pefile.PE(exe_path)
    try:
        rva = pe.get_rva_from_offset(file_offset)
    finally:
        pe.close()

    # store everything in our nice dict
    programs[name] = Program(
        exe_path    = exe_path,
        cfr_path    = cfr_path,
        question    = question,
        file_offset = file_offset,
        rva         = rva,
        ground_truth= ground_truth
    )


In [2]:
# Show everything recorded for one benchmark, then start and stop it under
# the debugger wrapper.
name = "switch6_vs_stripped"
program = programs[name]
print(
    f"{name}:",
    f"  exe         = {program.exe_path}",
    f"  cfr         = {program.cfr_path}",
    f"  question    = {program.question!r}",
    f"  file_offset = {hex(program.file_offset)}",
    f"  rva         = {hex(program.rva)}",
    f"  groundtruth = {[hex(x) for x in program.ground_truth]}\n",
    sep="\n",
)

# NOTE(review): `cdb` is first the imported module and is then rebound to a
# CDBWrapper instance; the name is kept because other cells may rely on it.
from cdbwrapper import cdb
cdb = cdb.CDBWrapper()
cdb.start(program.exe_path)
cdb.stop()

switch6_vs_stripped:
  exe         = ../sunbench25/benchmark/indirect_jumps/switch_statements/switch6_vs-stripped.exe
  cfr         = ../sunbench25/benchmark/indirect_jumps/switch_statements/switch6_vs-stripped-cfr.json
  question    = "What are the file offsets for the instructions that are the targets of the 'jmp rax' instruction at file offset '0x6577' ?"
  file_offset = 0x6577
  rva         = 0x7177
  groundtruth = ['0x657b', '0x6584', '0x658d', '0x6596', '0x659f', '0x65a8', '0x65b1', '0x65ba', '0x65c3', '0x65cc', '0x65d5', '0x65de', '0x65e5']

Debugger is not running
