In [1]:
import os
import re
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed

import dotenv
import numpy as np
import pandas as pd
from openai import OpenAI
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from tqdm import tqdm

from extract_functions import extract_functions_from_file

# Load variables from a local .env file into the process environment; the
# database settings and OPENAI_API_KEY used below are read from os.environ.
dotenv.load_dotenv()

# NOTE(review): this silences every warning for the whole session, including
# pandas warnings about assignments on slices; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Build the connection URL from its components rather than interpolating them
# into a string: URL.create escapes characters such as '@', ':' or '/' in the
# user name or password, which would otherwise corrupt the URL. Unset variables
# are omitted instead of being rendered as the literal text "None".
_db_port = os.getenv("PORT")
engine = create_engine(
    URL.create(
        drivername='mysql+pymysql',
        username=os.getenv("USER_NAME"),
        password=os.getenv("USER_PASSWORD"),
        host=os.getenv("IP"),
        port=int(_db_port) if _db_port else None,
        database='AIS',
    )
)

In [3]:
def extract_minor_dataset(engine, ori, dec):
    """Pair every original C function with its decompiled counterpart.

    Reads all rows of table ``ori`` (uses its ``id``, ``filename`` and
    ``c_code`` columns) and of table ``dec`` (uses its ``id``, ``filename``,
    ``function_name``, ``asm_code`` and ``pseudo_code`` columns), splits each
    source file into functions with ``extract_functions_from_file`` and
    inner-joins both sides on ``(filename, function_name)``.

    Args:
        engine: Connectable accepted by ``pd.read_sql``.
        ori: Name of the table holding the original source files; also stored
            as the ``version`` index level of the result.
        dec: Name of the table holding the decompiler output.

    Returns:
        DataFrame indexed by ``(version, filename, function_name)`` from which
        rows with an empty or missing value, rows with a repeated index, and
        rows repeating an earlier ``asm_code`` or ``pseudo_code`` were removed.
    """
    # Table names cannot be bound as SQL parameters, so they are interpolated
    # into the statement; only pass trusted identifiers.
    ori_code = pd.read_sql(f"SELECT * FROM {ori}", engine).set_index('id')
    dec_code = pd.read_sql(f"SELECT * FROM {dec}", engine).set_index('id')

    per_file_functions = []
    for filename, c_code in tqdm(zip(ori_code['filename'], ori_code['c_code']), total=len(ori_code)):
        # presumably returns a DataFrame with one row per function, including
        # a 'function_name' column -- verify against extract_functions.py
        functions = extract_functions_from_file(c_code)
        functions['filename'] = filename
        per_file_functions.append(functions)

    join_keys = ['filename', 'function_name']
    c_codes = pd.concat(per_file_functions).set_index(join_keys)
    dec_code = dec_code.set_index(join_keys)
    dataset = pd.merge(c_codes, dec_code, left_index=True, right_index=True, how='inner')

    dataset = dataset.reset_index()
    dataset['version'] = ori
    dataset = dataset.set_index(['version', 'filename', 'function_name'])

    # Vectorised replacement for the per-cell lambda: blank out empty strings,
    # then drop every row that has a missing value.
    dataset = dataset.mask(dataset == '').dropna()
    dataset = dataset[~dataset.index.duplicated()]
    dataset = dataset.drop_duplicates(subset=['asm_code'])
    dataset = dataset.drop_duplicates(subset=['pseudo_code'])
    return dataset

In [4]:
# extract_minor_dataset already turns empty strings into missing values and
# drops incomplete rows before returning, so no second cleaning pass is needed.
linux_6_0 = extract_minor_dataset(engine, 'linux_6_0', 'linux_6_0_decompile')

100%|██████████| 20659/20659 [03:43<00:00, 92.29it/s] 


In [12]:
# Show the data columns; the (version, filename, function_name) index levels
# are not part of `.columns`.
linux_6_0.columns

Index(['function_content', 'pseudo_code', 'asm_code'], dtype='object')

In [9]:
from datasets import load_dataset, Dataset

In [10]:
# Load the train split of a public Hugging Face dataset and display its summary.
# NOTE(review): nothing later in the visible notebook uses `dataset`; this looks
# like a reference example of the `Dataset` layout -- confirm it is still needed.
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset

Dataset({
    features: ['answer', 'question', 'context'],
    num_rows: 78577
})

In [13]:
# Wrap the three text columns in a Hugging Face Dataset. The value is only
# displayed by the cell, not stored in a variable.
Dataset.from_dict(
    {
        column: linux_6_0[column]
        for column in ('function_content', 'pseudo_code', 'asm_code')
    }
)

Dataset({
    features: ['function_content', 'pseudo_code', 'asm_code'],
    num_rows: 207242
})

# OpenAI Chat API

In [5]:
def extract_code_blocks(text):
    """Return the contents of the first triple-backtick fenced block in ``text``.

    The result is whitespace-stripped and still begins with the fence's
    language tag line (for example ``c``) when the reply contained one.

    Args:
        text: Text to search; may be ``None`` or empty.

    Returns:
        The stripped contents of the first complete fenced block, or ``''``
        when ``text`` is empty/None or holds no complete fence.
    """
    if not text:
        return ''
    # With DOTALL a plain `.` already matches newlines. The former `(?:.|\n)`
    # alternation could match every newline in two ways, which makes the
    # engine backtrack exponentially when an opening fence is never closed
    # (e.g. a truncated reply).
    match = re.search(r'```(.*?)```', text, re.DOTALL)
    return match.group(1).strip() if match else ''

In [32]:
# System prompt for reconstructing a C function directly from its assembly.
# NOTE(review): no function in the visible notebook references this constant
# (parallel_process_openai uses system_prompt_refine) -- confirm it is used elsewhere.
system_prompt_end2end = """
You are an expert system programmer with deep knowledge of the Linux kernel, assembly language, and decompilation techniques. Your task is to analyze assembly code of a single function compiled from the Linux kernel and reconstruct the original C function as accurately as possible. The input will be assembly code for one function only, without headers or other definitions. Follow these guidelines:

1. Analyze the given assembly code carefully, focusing on the structure and logic of this single function.

2. Recognize common compiler optimizations and how they might have transformed the original C code of this function.

3. Reconstruct the high-level control flow, including loops, conditionals, and any function calls made from this function.

4. Infer local data structures and their usage from memory access patterns in the assembly.

5. Pay attention to calling conventions and parameter passing, especially considering this is a kernel function.

6. Consider the target architecture (x86, ARM, etc.) and how it might influence the assembly output and the original C code of this function.

7. Maintain the coding style typical of the Linux kernel in your reconstructed C function.

8. If you encounter inline assembly within this function, reconstruct it as such in the C code.

9. Be aware of and recreate kernel-specific attributes and decorators that might apply to this function (e.g., __init, __exit, __user).

10. Provide brief, inline comments to explain complex parts or your reasoning for non-obvious decompilation decisions within the function.

11. If exact reconstruction is not possible for any part, provide the closest approximation.

12. Infer the function's return type and parameters based on the assembly code.

13. Do not include any #include statements or type definitions outside the function.

After completing the analysis and reconstruction, present the reconstructed C function as a single, continuous block of text. The output should be a valid C function that could theoretically be part of the Linux kernel.

Given an assembly code snippet for a single function, analyze it thoroughly and provide your best reconstruction of the original C function as it might appear in the Linux kernel source code.
"""

# System prompt for refining IDA PRO pseudocode into kernel-style C; this is
# the prompt parallel_process_openai sends with every row's `pseudo_code`.
system_prompt_refine = """
You are an expert system programmer with deep knowledge of the Linux kernel, C programming, and decompilation techniques. Your task is to analyze pseudocode generated by IDA PRO from a single function compiled from the Linux kernel and reconstruct the original C function as accurately as possible. The input will be IDA PRO pseudocode for one function only, without headers or other definitions. Follow these guidelines:

1. Analyze the given pseudocode carefully, focusing on the structure and logic of this single function.

2. Recognize common decompiler patterns and how they might differ from idiomatic C code in the Linux kernel.

3. Reconstruct the high-level control flow, including loops, conditionals, and any function calls made from this function.

4. Infer local data structures and their usage from the pseudocode. Be aware that IDA PRO might use its own naming conventions for inferred structures.

5. Pay attention to function parameters and return types, especially considering this is a kernel function. IDA PRO might not always accurately represent complex types.

6. Consider how the target architecture (x86, ARM, etc.) might have influenced the original code and the resulting pseudocode.

7. Reconstruct the function using coding style typical of the Linux kernel, which might differ from the style of the pseudocode.

8. If you encounter any patterns that suggest inline assembly in the original code, reconstruct it as such in the C code.

9. Be aware of and recreate kernel-specific attributes and decorators that might apply to this function (e.g., __init, __exit, __user), which might not be present in the pseudocode.

10. Provide brief, inline comments to explain complex parts or your reasoning for non-obvious reconstruction decisions within the function.

11. If exact reconstruction is not possible for any part, provide the closest approximation and explain your reasoning.

12. Infer the function's correct return type and parameters based on the pseudocode and your knowledge of Linux kernel conventions.

13. Do not include any #include statements or type definitions outside the function.

14. Be cautious of potential decompiler errors or misinterpretations in the pseudocode, and use your judgment to correct these in your reconstruction.

15. Pay special attention to pointer arithmetic and type casting, which might be represented differently in the pseudocode compared to idiomatic C code.

16. Look for patterns that suggest the use of Linux kernel-specific macros or inline functions, and reconstruct these appropriately.

After completing the analysis and reconstruction, present the reconstructed C function as a single, continuous block of text. The output should be a valid C function that could theoretically be part of the Linux kernel source code.

Given an IDA PRO pseudocode snippet for a single function, analyze it thoroughly and provide your best reconstruction of the original C function as it might appear in the Linux kernel source code.
"""

In [33]:
def chat_with_gpt(system_prompt, user_input):
    """Send one system/user message pair to ``gpt-4o-mini`` and return the code in its reply.

    Args:
        system_prompt: Content of the system message.
        user_input: Content of the user message.

    Returns:
        The first fenced code block of the reply (as found by
        ``extract_code_blocks``) without a leading C language-tag line, or
        ``''`` when the reply contains no fenced block.
    """
    # The context manager closes the client's HTTP resources after the call;
    # this function runs once per row, so unclosed clients would pile up.
    with OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) as client:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
            ],
            model="gpt-4o-mini",
        )

    # `content` can be None; treat that as "no code".
    code = extract_code_blocks(chat_completion.choices[0].message.content or '')

    # Remove only the fence's language tag (the first line). The former
    # `.replace('c\n', '')` also deleted every 'c' that ended a line inside
    # the code itself, silently corrupting the returned function.
    first_line, newline, remainder = code.partition('\n')
    if newline and first_line.strip().lower() in ('c', 'cpp', 'c++'):
        code = remainder
    return code


def process_row(args):
    """Thread-pool worker: ask the model to rewrite one row's pseudo code.

    Args:
        args: 3-tuple ``(index, system_prompt, row)`` where ``row`` supports
            ``row['pseudo_code']``.

    Returns:
        ``(index, code)`` so the caller can match the answer to its row.
    """
    row_index, prompt, record = args
    pseudo_code = record['pseudo_code']
    return row_index, chat_with_gpt(prompt, pseudo_code)

In [34]:
def parallel_process_openai(df: pd.DataFrame, max_workers=None, raise_on_error=True) -> pd.DataFrame:
    """Run ``process_row`` with ``system_prompt_refine`` for every row of ``df`` in a thread pool.

    The answers are written into a new ``'end2end'`` column of ``df`` itself
    (the frame is modified in place) and the same frame is returned.

    Args:
        df: Frame with a ``pseudo_code`` column and a unique index.
        max_workers: Thread count passed to ``ThreadPoolExecutor``; ``None``
            keeps the executor's default.
        raise_on_error: When true (default) the first failing request is
            re-raised after cancelling the requests that have not started.
            When false, failing rows keep ``''`` and a summary is printed.

    Returns:
        ``df`` with the ``'end2end'`` column filled in.
    """
    # NOTE(review): the column is called 'end2end' although the refine prompt
    # is the one being used; kept as-is because the saved CSV exposes this name.
    df['end2end'] = ''
    failures = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # A real future -> index mapping, so a failed request can be traced
        # back to its row.
        future_to_index = {
            executor.submit(process_row, (index, system_prompt_refine, row)): index
            for index, row in df.iterrows()
        }

        for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="Processing"):
            try:
                index, result = future.result()
            except Exception as exc:
                if raise_on_error:
                    # Leaving the `with` block waits for the pool; cancel the
                    # queued requests so they are not executed (and billed)
                    # only for their results to be thrown away.
                    for pending in future_to_index:
                        pending.cancel()
                    raise
                failures.append((future_to_index[future], exc))
                continue
            df.loc[index, 'end2end'] = result

    if failures:
        print(f"{len(failures)} of {len(future_to_index)} rows failed; their 'end2end' value is left empty")

    return df

In [35]:
# Work on an explicit copy of the first 10,000 rows: parallel_process_openai
# adds a column to the frame it receives, and doing that to a bare slice is
# what pandas' SettingWithCopyWarning flags.
result = parallel_process_openai(linux_6_0[:10000].copy())

Processing: 100%|██████████| 10000/10000 [1:38:56<00:00,  1.68it/s] 


In [36]:
# Persist the model output; with to_csv's defaults the frame's index is
# written as the leading column(s) of the file.
result.to_csv('openai_refine.csv')