In [45]:
from datasets import load_dataset
import os

# Open the ComPile train split in streaming mode, pointing the Hugging Face
# cache at the per-user datasets directory.
dataset = load_dataset(
    'llvm-ml/ComPile',
    split='train',
    streaming=True,
    cache_dir=os.path.expanduser('~/.cache/huggingface/datasets'),
)

# Only the first 10 streamed programs take part in the experiment.
programs = dataset.take(10)

In [46]:
import subprocess

def generate_ir_file(program, i):
    """Disassemble one program's bitcode into ``./ir_programs/program_<i>.ll``.

    The bytes in ``program['content']`` are written to a scratch
    ``program_<i>.bc`` file in the current directory and converted with
    ``llvm-dis``. The scratch file is always removed, whether or not the
    conversion succeeds.

    Args:
        program: Mapping whose ``'content'`` entry holds the bitcode as a
            bytes-like object (it is written in binary mode).
        i: Index used to build the file names; a string or an integer.

    Raises:
        RuntimeError: If ``llvm-dis`` exits with a non-zero status.
    """
    # An f-string accepts both str and int indices.
    base_name = f'program_{i}'
    bc_path = f'{base_name}.bc'
    ll_path = './ir_programs/' + base_name + '.ll'

    # Make sure the output directory exists even if the caller did not create it.
    os.makedirs('ir_programs', exist_ok=True)

    try:
        # Generate .bc file
        with open(bc_path, 'wb') as f:
            f.write(program['content'])

        # Convert .bc to .ll
        result = subprocess.run(['llvm-dis', bc_path, '-o', ll_path], capture_output=True, text=True)

        if result.returncode != 0:
            print(result.stderr)
            raise RuntimeError(f'Failed to create .ll file for {base_name}')

        print(f'{base_name} code file created.')
    finally:
        # Delete .bc file, also on failure, so no scratch bitcode is left behind.
        if os.path.exists(bc_path):
            os.remove(bc_path)


In [47]:
# Make sure the output directory for the disassembled .ll files exists.
os.makedirs('ir_programs', exist_ok=True)

# Produce one ./ir_programs/program_<i>.ll per streamed program; the index is
# passed as a string and becomes part of the file name.
for i, program in enumerate(programs):
    generate_ir_file(program, str(i))

program_0 code file created.
program_1 code file created.
program_2 code file created.
program_3 code file created.
program_4 code file created.
program_5 code file created.
program_6 code file created.
program_7 code file created.
program_8 code file created.
program_9 code file created.
program_9 code file created.


In [26]:
def apply_prefix(index, flags):
    """Run ``opt`` with a custom pass pipeline on program ``index``.

    The input is ``./ir_programs/program_<index>.ll`` and the result is
    written to ``program_<index>_optimized.ll`` in the current directory.

    Args:
        index: Program number used to build both file names.
        flags: Iterable of pass names; they are comma-joined into ``-passes=``.

    Raises:
        RuntimeError: If ``opt`` exits with a non-zero status.
    """
    pipeline = ','.join(flags)
    completed = subprocess.run(
        [
            'opt',
            f'-passes={pipeline}',
            f'./ir_programs/program_{index}.ll',
            '-S',
            '-o',
            f'program_{index}_optimized.ll',
        ],
        capture_output=True,
        text=True,
    )

    if completed.returncode == 0:
        return

    # Surface opt's diagnostics before aborting.
    print(f'Optimization failed: {completed.stderr}')
    raise RuntimeError(f'Failed to optimize with flags {flags}')

In [27]:
def apply_baseline(index):
    """Run ``opt -O3`` on program ``index`` as the reference optimisation.

    The input is ``./ir_programs/program_<index>.ll`` and the result is
    written to ``program_<index>_optimized.ll`` in the current directory.

    Args:
        index: Program number used to build both file names.

    Raises:
        RuntimeError: If ``opt`` exits with a non-zero status.
    """
    completed = subprocess.run(
        [
            'opt',
            '-O3',
            f'./ir_programs/program_{index}.ll',
            '-S',
            '-o',
            f'program_{index}_optimized.ll',
        ],
        capture_output=True,
        text=True,
    )

    if completed.returncode == 0:
        return

    # Surface opt's diagnostics before aborting.
    print(f'Baseline optimization failed: {completed.stderr}')
    raise RuntimeError(f'Failed to apply O3 optimization to program_{index}')

In [28]:
def get_executable(index, isOptimized=True):
    """Compile a program's IR into an executable named ``program_<index>``.

    Args:
        index: Program number used to build the file names.
        isOptimized: When true, compile ``program_<index>_optimized.ll`` from
            the current directory; otherwise compile the original
            ``./ir_programs/program_<index>.ll``.

    Raises:
        RuntimeError: If ``clang`` exits with a non-zero status.
    """
    stem = f'program_{index}'
    src = f'{stem}_optimized.ll' if isOptimized else f'./ir_programs/{stem}.ll'

    build = subprocess.run(['clang', src, '-o', stem], capture_output=True, text=True)
    if build.returncode == 0:
        return

    print(build.stderr)
    raise RuntimeError(f'Failed to create executable for {src}')

In [29]:
def get_static_metrics(index, isOptimized=True):
    """Collect size and line-based statistics from a program's ``.ll`` file.

    Args:
        index: Program number used to build the file name.
        isOptimized: When true, read ``program_<index>_optimized.ll`` from the
            current directory; otherwise read
            ``./ir_programs/program_<index>.ll``.

    Returns:
        dict with these keys:
            file_size: size of the file in bytes.
            total_lines: number of pieces after splitting the text on newlines.
            instructions: lines that start with ``%`` and contain ``=``.
            functions: lines that start with ``define``.
            basic_blocks: non-comment lines that end with ``:``.
            load_instructions / store_instructions / call_instructions:
                lines whose opcode is ``load``, ``store`` or ``call``.

    Raises:
        OSError: If the ``.ll`` file does not exist or cannot be read.
    """
    ll_file = 'program_' + str(index)

    if isOptimized:
        ll_file += '_optimized.ll'
    else:
        ll_file = './ir_programs/' + ll_file + '.ll'

    file_size = os.path.getsize(ll_file)

    with open(ll_file, 'r') as f:
        content = f.read()
    lines = content.split('\n')
    stripped_lines = [line.strip() for line in lines]

    instructions = sum(1 for text in stripped_lines if text.startswith('%') and '=' in text)
    functions = sum(1 for text in stripped_lines if text.startswith('define'))
    basic_blocks = sum(1 for text in stripped_lines if text.endswith(':') and not text.startswith(';'))

    # Count by opcode rather than by substring: a plain content.count('load')
    # also matches symbol names, attribute strings and comments that merely
    # contain the word, which inflates the numbers.
    call_markers = ('tail', 'musttail', 'notail')
    opcode_counts = {'load': 0, 'store': 0, 'call': 0}
    for text in stripped_lines:
        if not text or text.startswith(';'):
            continue
        if text.startswith('%'):
            # Skip the "%result = " prefix so the opcode becomes the first token.
            _, separator, rhs = text.partition(' = ')
            if separator:
                text = rhs
        tokens = text.split(None, 2)
        if not tokens:
            continue
        opcode = tokens[0]
        # "tail call", "musttail call" and "notail call" are still calls.
        if opcode in call_markers and len(tokens) > 1:
            opcode = tokens[1]
        if opcode in opcode_counts:
            opcode_counts[opcode] += 1

    return {
        'file_size': file_size,
        'total_lines': len(lines),
        'instructions': instructions,
        'functions': functions,
        'basic_blocks': basic_blocks,
        'load_instructions': opcode_counts['load'],
        'store_instructions': opcode_counts['store'],
        'call_instructions': opcode_counts['call']
    }

In [30]:
import time
import psutil

def get_dynamic_metrics(exec_file):
    """Run ``./<exec_file>`` and measure its wall-clock time and peak memory.

    The child's resident set size is sampled roughly every 10 ms while it
    runs. Its stdout is discarded and its stderr is spooled to a temporary
    file, so a program that prints a lot can never block on a full pipe while
    this function is polling it.

    Args:
        exec_file: Name of an executable in the current directory.

    Returns:
        dict with ``runtime_seconds`` and ``peak_memory_mb`` (RSS in MiB).
        ``{'runtime_seconds': 30, 'peak_memory_mb': 0}`` is returned when the
        30 second limit is exceeded. Both values are 0 when the program exits
        with a non-zero status, finishes in under 1 ms, or cannot be measured
        at all.
    """
    import tempfile  # only this function needs it

    timeout_seconds = 30
    process = None

    try:
        with tempfile.TemporaryFile() as stderr_file:
            start_time = time.time()

            process = subprocess.Popen([f'./{exec_file}'],
                                       stdout=subprocess.DEVNULL,
                                       stderr=stderr_file)

            peak_memory = 0

            try:
                ps_process = psutil.Process(process.pid)
                while process.poll() is None:
                    elapsed = time.time() - start_time
                    if elapsed > timeout_seconds:
                        print(f'Timeout: {exec_file} exceeded {timeout_seconds}s')
                        process.terminate()
                        try:
                            # Give the child a moment to exit, then force it.
                            process.wait(timeout=5)
                        except subprocess.TimeoutExpired:
                            process.kill()
                            process.wait()
                        return {'runtime_seconds': timeout_seconds, 'peak_memory_mb': 0}

                    try:
                        memory_info = ps_process.memory_info()
                        current_memory = memory_info.rss / (1024 * 1024)
                        peak_memory = max(peak_memory, current_memory)
                        time.sleep(0.01)
                    except psutil.NoSuchProcess:
                        # The child went away between poll() and the sample.
                        break
            except psutil.NoSuchProcess:
                # The child finished before it could be attached to.
                pass

            return_code = process.wait()
            end_time = time.time()

            stderr_file.seek(0)
            stderr = stderr_file.read()

        if return_code != 0:
            print(f'{exec_file} failed with return code {return_code}')
            if stderr:
                # Replace undecodable bytes instead of failing on them.
                print(f'Stderr: {stderr.decode(errors="replace")[:200]}')
            return {'runtime_seconds': 0, 'peak_memory_mb': 0}

        runtime = round(end_time - start_time, 6)

        if runtime < 0.001:
            print(f'{exec_file} runtime too fast: {runtime}s - may indicate failure')
            return {'runtime_seconds': 0, 'peak_memory_mb': 0}

        return {
            'runtime_seconds': runtime,
            'peak_memory_mb': round(peak_memory, 2),
        }

    except Exception as e:
        # Best effort: a measurement failure is reported as zeros, not raised.
        print(f'Error measuring {exec_file}: {e}')
        return {'runtime_seconds': 0, 'peak_memory_mb': 0}
    finally:
        # Never leave the child running if something above went wrong.
        if process is not None and process.poll() is None:
            process.kill()
            process.wait()

In [31]:
def get_metrics(index, is_optimized=True):
    """Build the full metrics record for one program.

    Args:
        index: Program number.
        is_optimized: Passed on to ``get_static_metrics`` to choose between the
            optimized and the original IR file.

    Returns:
        dict with the eight static metrics followed by ``runtime_seconds`` and
        ``peak_memory_mb``. Dynamic measurement is currently switched off, so
        those two entries are always 0.
    """
    static_keys = (
        'file_size',
        'total_lines',
        'instructions',
        'functions',
        'basic_blocks',
        'load_instructions',
        'store_instructions',
        'call_instructions',
    )

    measured = get_static_metrics(index, is_optimized)
    record = {key: measured[key] for key in static_keys}

    # Placeholders keep the record shape stable while no executable is run.
    record['runtime_seconds'] = 0
    record['peak_memory_mb'] = 0

    return record

In [32]:
def get_row(program, flags, metrics):
    """Flatten one measurement into a dataset row.

    Args:
        program: Program label stored in the first column.
        flags: Optimisation description; stored as ``str(flags)``.
        metrics: Mapping that provides every metric key listed below.

    Returns:
        list: ``[program, str(flags)]`` followed by the ten metric values in
        a fixed order.
    """
    metric_order = (
        'file_size',
        'total_lines',
        'instructions',
        'functions',
        'basic_blocks',
        'load_instructions',
        'store_instructions',
        'call_instructions',
        'runtime_seconds',
        'peak_memory_mb',
    )
    return [program, str(flags), *(metrics[name] for name in metric_order)]

In [33]:
def evaluate_program(index, flags):
    """Measure one program unoptimized, at -O3, and after each pass prefix.

    Args:
        index: Program number.
        flags: Ordered iterable of pass names. Prefixes of growing length
            (first pass, first two passes, ...) are applied to the original
            IR one after another.

    Returns:
        list of rows as built by ``get_row``: one for the original IR, one for
        ``-O3``, then one per prefix of ``flags``.

    Raises:
        RuntimeError: Propagated from ``apply_baseline`` / ``apply_prefix``
            when ``opt`` fails.
    """
    program_name = f'program_{index}'
    optimized_path = f'program_{index}_optimized.ll'

    data = []

    # No optimization - just analyze the original IR
    metrics = get_metrics(index, is_optimized=False)
    data.append(get_row(program_name, 'None', metrics))

    try:
        # Baseline optimization (-O3)
        apply_baseline(index)
        metrics = get_metrics(index)
        data.append(get_row(program_name, ['-O3'], metrics))

        # Apply flags incrementally
        prefix = []
        for flag in flags:
            prefix.append(flag)
            apply_prefix(index, prefix)
            metrics = get_metrics(index)
            # Copy: `prefix` keeps growing after this row is recorded.
            data.append(get_row(program_name, prefix.copy(), metrics))
    finally:
        # Remove the temporary optimized IR even when a step above raised. The
        # existence check keeps a missing file from masking that error.
        if os.path.exists(optimized_path):
            os.remove(optimized_path)

    return data

In [58]:
from itertools import permutations
from tqdm import tqdm
import random

# Pass names that evaluate_program forwards to the optimisation helpers.
flags = [
    'mem2reg',
    'sroa', 
    'simplifycfg',
    'dce',
    'gvn',
    'loop-unroll',
]

flag_len = len(flags)

# Materialise the streamed programs; below only the count is used, because
# programs are addressed by index through their files on disk.
programs_list = list(programs)
print(f'Working with {len(programs_list)} programs')

# Every possible ordering of the passes is evaluated.
orders = list(permutations(flags))
# orders = random.sample(orders, 20) 

# Each evaluation yields one unoptimized row, one -O3 row and one row per
# pass prefix, which is what the progress-bar total is based on.
rows_per_evaluation = 2 + flag_len
total_iterations = len(orders) * len(programs_list) * rows_per_evaluation

print(f'Total iterations: {total_iterations}')

# Accumulates the rows of all evaluations.
data = []

with tqdm(total=total_iterations, desc='Generating dataset') as pbar:

    for i in range(len(programs_list)):
        for order in orders:

            pbar.set_description(f'Processing program_{i}')
            pbar.set_postfix(flags=f'{order}')

            program_data = evaluate_program(i, order)
            data.extend(program_data)
            
            # Advance by the number of rows produced, matching the total above.
            pbar.update(len(program_data))

Working with 10 programs
Total iterations: 57600


Processing program_9: 100%|██████████| 57600/57600 [24:59<00:00, 38.40it/s, flags=('loop-unroll', 'gvn', 'dce', 'simplifycfg', 'sroa', 'mem2reg')]
Processing program_9: 100%|██████████| 57600/57600 [24:59<00:00, 38.40it/s, flags=('loop-unroll', 'gvn', 'dce', 'simplifycfg', 'sroa', 'mem2reg')]


In [59]:
import pandas as pd

# Column names, in the same order as the values of each row returned by get_row.
columns = [
    'program',
    'flags',
    'file_size',
    'total_lines',
    'instructions',
    'functions',
    'basic_blocks',
    'load_instructions',
    'store_instructions',
    'call_instructions',
    'runtime_seconds',
    'peak_memory_mb'
]

# Build the frame from the accumulated rows and write it to disk without the
# pandas index column.
df = pd.DataFrame(data, columns=columns)
df.to_csv('./dataset.csv', index=False)

print('Dataset creation complete!')
print(df.head(10))
print(f'Total rows in dataset: {len(df)}')

Dataset creation complete!
     program                                              flags  file_size  \
0  program_0                                               None       9636   
1  program_0                                            ['-O3']       9894   
2  program_0                                        ['mem2reg']       9650   
3  program_0                                ['mem2reg', 'sroa']       9650   
4  program_0                 ['mem2reg', 'sroa', 'simplifycfg']       9650   
5  program_0          ['mem2reg', 'sroa', 'simplifycfg', 'dce']       9650   
6  program_0   ['mem2reg', 'sroa', 'simplifycfg', 'dce', 'gvn']       9650   
7  program_0  ['mem2reg', 'sroa', 'simplifycfg', 'dce', 'gvn...       9650   
8  program_0                                               None       9636   
9  program_0                                            ['-O3']       9894   

   total_lines  instructions  functions  basic_blocks  load_instructions  \
0          115            52          