In [23]:
import pefile
import sys

def file_offset_to_virtual_address(pe_file_path, file_offset):
    """Translate a raw file offset inside a PE image into a virtual address.

    The result is ``ImageBase + section.VirtualAddress + (file_offset -
    section.PointerToRawData)`` for the first section whose raw-data range
    ``[PointerToRawData, PointerToRawData + SizeOfRawData)`` contains the offset.
    As a sanity check, the RVA computed by pefile is printed next to it.

    Args:
        pe_file_path: Path of the PE file on disk.
        file_offset: Byte offset into the file (int).

    Returns:
        The virtual address as an int, or None when no section's raw data
        contains the offset (e.g. an offset inside the headers) or when the
        file cannot be processed (the error is printed, not raised).
    """
    try:
        pe = pefile.PE(pe_file_path)
    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    try:
        base_address = pe.OPTIONAL_HEADER.ImageBase

        for section in pe.sections:
            raw_start = section.PointerToRawData
            raw_end = raw_start + section.SizeOfRawData

            if raw_start <= file_offset < raw_end:
                offset_within_section = file_offset - raw_start
                final_address = base_address + section.VirtualAddress + offset_within_section
                print(f"Let's compare! PEFile says {hex(pe.get_rva_from_offset(file_offset))}... I say {hex(final_address)}")
                return final_address

        # No section's raw data covers this offset.
        return None

    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    finally:
        # Release the file held by pefile so the image is not kept open
        # (the same executable is later handed to the debugger).
        pe.close()


def virtual_address_to_file_offset(pe_file_path, virtual_address):
    """Translate a virtual address of a PE image into a raw file offset.

    The RVA is ``virtual_address - ImageBase``. The first section whose
    virtual range ``[VirtualAddress, VirtualAddress + Misc_VirtualSize)``
    contains the RVA is used, and the file offset is
    ``PointerToRawData + (rva - VirtualAddress)``. As a sanity check, the
    offset computed by pefile is printed next to it.

    Args:
        pe_file_path: Path of the PE file on disk.
        virtual_address: Absolute virtual address (int, ImageBase included).

    Returns:
        The file offset as an int, or None when no section contains the
        address, when the address lies in the part of a section that has no
        bytes on disk (offset within the section >= SizeOfRawData), or when
        the file cannot be processed (the error is printed, not raised).
    """
    try:
        pe = pefile.PE(pe_file_path)
    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    try:
        rva = virtual_address - pe.OPTIONAL_HEADER.ImageBase

        for section in pe.sections:
            section_start = section.VirtualAddress
            section_end = section_start + section.Misc_VirtualSize

            if section_start <= rva < section_end:
                offset_within_section = rva - section_start

                # The virtual size can exceed the raw size (zero-filled tail);
                # such addresses have no backing bytes in the file, and adding
                # the offset would land in whatever follows the section on disk.
                if offset_within_section >= section.SizeOfRawData:
                    return None

                file_offset = section.PointerToRawData + offset_within_section
                print(f"Let's compare! PEFile says {pe.get_offset_from_rva(rva)}... I say {file_offset}")
                return file_offset

        # No section's virtual range covers this RVA.
        return None

    except Exception as e:
        print(f"Error processing the PE file: {e}")
        return None

    finally:
        # Release the file held by pefile so the image is not kept open.
        pe.close()




In [9]:
# Benchmark binaries under test, keyed by a short identifier.
executables = {
    "anon_call2_vs_stripped": "../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call2_vs-stripped.exe",
    "anon_call3_vs_stripped": "../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call3_vs-stripped.exe",
    "virtual7_vs_stripped": "../sunbench25/benchmark/indirect_calls/virtual_tables/virtual7_vs-stripped.exe",
    "anon_jump1_vs_stripped": "../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs-stripped.exe",
    "anon_jump1_vs_2_stripped": "../sunbench25/benchmark/indirect_jumps/anonymous_functions/anon_jump1_vs_2-stripped.exe",
    "switch4_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch4_vs-stripped.exe",
    "switch5_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch5_vs-stripped.exe",
    "switch6_vs_stripped": "../sunbench25/benchmark/indirect_jumps/switch_statements/switch6_vs-stripped.exe",
}

# Executable path -> path of its "-cfr.json" companion file, in the same
# order as `executables`. The short identifiers are not needed here.
cfr_mapping = {
    exe: exe.replace(".exe", "-cfr.json") for exe in executables.values()
}


In [24]:
import json
import re
# For every benchmark binary, pull the hex file offset out of its CFR
# question and translate it to a virtual address.
for exepath, cfrpath in cfr_mapping.items():
    with open(cfrpath, 'r', encoding="utf-8") as f:
        cfr = json.load(f)
    question = cfr['question']
    # Last single-quoted token: the lookahead rejects any match that is
    # followed by another quote later on the same line.
    match = re.search(r"'([^']*)'(?!.*')", question)
    if match is None:
        raise ValueError(f"No quoted offset found in the question of {cfrpath}")
    offset = match.group(1)
    # Print the value itself; f"offset" only printed the literal word.
    print(offset)
    file_offset_to_virtual_address(exepath, int(offset, 16))

0x5d1
Let's compare! PEFile says 0x11d1... I say 0x4011d1
0x3f4
Let's compare! PEFile says 0x11f4... I say 0x4011f4
0x66c0
Let's compare! PEFile says 0x72c0... I say 0x1400072c0
0x43e
Let's compare! PEFile says 0x103e... I say 0x40103e
0x43e
Let's compare! PEFile says 0x103e... I say 0x40103e
0x6570
Let's compare! PEFile says 0x7170... I say 0x140007170
0x6568
Let's compare! PEFile says 0x7168... I say 0x140007168
0x6577
Let's compare! PEFile says 0x7177... I say 0x140007177


In [34]:
# our cdb wrapper
import contextlib
import functools
import os
import queue
import re
import signal
import subprocess
import threading
from pathlib import Path

# Names intended as the public surface of this wrapper: the module-level
# facade functions plus the exception type they raise.
__all__ = [
    "create", "restart", "lm", "bp", "g",
    "t", "reg", "quit", "SessionError"
]

PROMPT_RX          = re.compile(r"\d+:[0-9a-fA-F`]+\> $")
DEFAULT_CDB        = "cdb"                      # rely on PATH
_CDB_SESSION       = None                       # singleton handle


class SessionError(RuntimeError):
    pass


class _CDBSession:
    """Thin shell around a single cdb.exe process (text mode)."""

    def __init__(self, exe: str, *argv: str, cdb_path: str = DEFAULT_CDB):
        exe = str(Path(exe).expanduser())
        if not Path(exe).is_file():
            raise SessionError(f"Executable not found: {exe}")

        self._cmdline = [cdb_path, "-lines", "-o", exe, *argv]
        self._proc    = subprocess.Popen(
            self._cmdline,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
        self._q       = queue.Queue(maxsize=10_000)
        self._reader  = threading.Thread(
            target=self._reader_loop, daemon=True
        )
        self._reader.start()
        self._read_prompt()                      # sync on initial stop

    # ------------------------------------------------------------------ I/O --

    def _reader_loop(self):
        for line in self._proc.stdout:
            # avoid clogging if caller forgets to drain output
            try:
                self._q.put_nowait(line)
            except queue.Full:
                pass

    def _read_prompt(self) -> str:
        buf = ""
        while True:
            try:
                line = self._q.get(timeout=0.1)
            except queue.Empty:
                if self._proc.poll() is not None:
                    raise SessionError("cdb exited unexpectedly")
                continue
            buf += line
            if PROMPT_RX.search(line):
                break
        return buf

    def _cmd(self, text: str) -> str:
        if self._proc.poll() is not None:
            raise SessionError("debugger process already terminated")

        self._proc.stdin.write(text + "\n")
        self._proc.stdin.flush()
        return self._read_prompt()

    # --------------------------------------------------------- public helpers

    def restart(self):            self._cmd(".restart")

    def lm(self) -> list[str]:
        """Return module names (just the image file names)."""
        out = self._cmd("lm")
        mods = []
        for ln in out.splitlines():
            m = re.match(r"[0-9a-f`]+\s+[0-9a-f`]+\s+(\S+)", ln)
            if m:
                mods.append(m.group(1))
        return mods

    def bp(self, addr: str | int):
        if isinstance(addr, int):
            addr = f"{addr:#x}"
        self._cmd(f"bp {addr}")

    def g(self):                  self._cmd("g")
    def t(self):                  self._cmd("t")

    def reg(self, name: str = "rip") -> int:
        """
        Return the integer value of a register (RIP/EIP/PC by default).

        Works on both x86 and x64.  Use 'eip' for 32‑bit if you prefer.
        """
        txt = self._cmd(f"? @{name}")
        m   = re.search(r"=\s*([0-9a-f`]+)", txt, re.I)
        if not m:
            raise SessionError(f"Could not parse register '{name}'")
        return int(m.group(1).replace('`', ''), 16)

    def quit(self):
        try:
            self._cmd("q")
        finally:
            with contextlib.suppress(ProcessLookupError):
                os.kill(self._proc.pid, signal.SIGTERM)


# ----------------------------------------------------------------- Facade API

def _require_session(fn):
    """Decorator: run ``fn`` only while a session is active.

    The wrapper raises SessionError when the module-level ``_CDB_SESSION``
    is None and otherwise forwards all arguments and the return value.
    ``functools.wraps`` keeps the wrapped function's name and docstring, so
    the facade functions are not all reported as ``_wrap``.
    """
    @functools.wraps(fn)
    def _wrap(*args, **kw):
        if _CDB_SESSION is None:
            raise SessionError("No active session; call create() first")
        return fn(*args, **kw)
    return _wrap


def create(exe: str, *argv: str, **kw):
    """
    Spawn a fresh cdb session and make it the active singleton.

    If a session is already active it is shut down first (best effort), so
    its debugger process is not leaked when the singleton is replaced.
    ``exe``, ``argv`` and ``kw`` are passed on to ``_CDBSession``.

    >>> create("notepad.exe")
    """
    global _CDB_SESSION
    if _CDB_SESSION is not None:
        print("cdb already started")
        try:
            quit()                               # auto-cleanup prior session
        except Exception as exc:
            # A broken old session must not prevent starting a new one.
            print(f"Could not shut down the previous cdb session cleanly: {exc}")
        finally:
            _CDB_SESSION = None
    _CDB_SESSION = _CDBSession(exe, *argv, **kw)


@_require_session
def restart():
    """Forward to ``restart()`` of the active session."""
    _CDB_SESSION.restart()

@_require_session
def lm():
    """Return whatever ``lm()`` of the active session returns."""
    return _CDB_SESSION.lm()

@_require_session
def bp(addr):
    """Forward ``addr`` to ``bp()`` of the active session."""
    _CDB_SESSION.bp(addr)

@_require_session
def g():
    """Forward to ``g()`` of the active session."""
    _CDB_SESSION.g()

@_require_session
def t():
    """Forward to ``t()`` of the active session."""
    _CDB_SESSION.t()

@_require_session
def reg(name="rip"):
    """Return the value of register ``name`` as reported by the active session."""
    return _CDB_SESSION.reg(name)

def quit():
    """Shut down the active session, if any, and clear the singleton.

    The singleton is cleared before the session's own ``quit()`` runs, so a
    failure during shutdown cannot leave a dead session registered as active.
    Does nothing when no session is active. Exceptions from the session's
    ``quit()`` still propagate to the caller.
    """
    global _CDB_SESSION
    if _CDB_SESSION:
        session, _CDB_SESSION = _CDB_SESSION, None
        session.quit()

In [37]:
# Smoke test of the cdb wrapper: start a session on one benchmark binary
# and print the module names returned by lm().
test_prog = "../sunbench25/benchmark/indirect_calls/anonymous_functions/anon_call2_vs-stripped.exe"
create(test_prog)
mods = lm()
print(mods)

KeyboardInterrupt: 