# Random Forest Model for Compiler Optimization Flags
This notebook trains a Random Forest model **without saving/loading a `.pkl` file**. It predicts the best compiler flag for a given C program based on its structural features.

In [3]:
import os
import re
import shlex
import subprocess
import tempfile
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Load Dataset

In [4]:
# Dataset location and schema, kept in one place so they are easy to change.
DATA_PATH = "features.csv"  # Change to actual dataset path
FEATURE_COLUMNS = ['LOC', 'ForLoops', 'WhileLoops', 'IfStatements']
TARGET_COLUMN = 'BestFlag'

# Load dataset
df = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the CSV lacks a column used below.
missing_columns = [
    col for col in FEATURE_COLUMNS + [TARGET_COLUMN] if col not in df.columns
]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required columns: {missing_columns}")

# Extract features and labels
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

## Train Random Forest Model (No .pkl Saving)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree forest and fit it in one expression (fit() returns the estimator).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report accuracy as a percentage.
preds = rf_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, preds) * 100:.2f}%")

Accuracy: 19.21%


## Feature Extraction from C Code

In [6]:
def extract_features_from_c(code_path):
    """Extract simple structural features from a C source file.

    Parameters
    ----------
    code_path : str or path-like
        Path of the C source file to read.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, 4)`` holding, in order, the number of lines in
        the file and the number of ``for``, ``while`` and ``if`` keyword
        occurrences.

    Notes
    -----
    Keywords are matched as whole words, so identifiers such as ``before``,
    ``format``, ``modify`` or directives such as ``#ifdef`` / ``#endif`` are
    not counted. Comments, string literals and the ``#if`` directive are not
    filtered out.
    NOTE(review): these counts should be produced the same way as the columns
    of the training CSV - confirm how that file was generated.
    """
    # Decode leniently: a stray non-UTF-8 byte (typically in a comment) should
    # not abort feature extraction.
    with open(code_path, "r", encoding="utf-8", errors="replace") as file:
        code = file.readlines()

    def count_keyword(keyword):
        # \b anchors restrict the match to the standalone keyword token.
        pattern = re.compile(rf"\b{keyword}\b")
        return sum(len(pattern.findall(line)) for line in code)

    loc = len(code)
    for_loops = count_keyword("for")
    while_loops = count_keyword("while")
    if_statements = count_keyword("if")

    return np.array([[loc, for_loops, while_loops, if_statements]])

## Measuring Execution Time

In [7]:
def measure_exec_time(code_path, flag):
    """Compile a C file with clang and time a single run of the binary.

    Parameters
    ----------
    code_path : str or path-like
        Path of the C source file to compile.
    flag : str
        Compiler flag(s) passed to clang, e.g. ``"-O2"``. Several
        whitespace-separated flags in one string are accepted.

    Returns
    -------
    float
        Wall-clock seconds taken by one execution of the compiled program
        (process start-up included). The program's exit status is ignored.

    Raises
    ------
    subprocess.CalledProcessError
        If clang exits with a non-zero status; the compiler's stderr is
        available on the exception's ``stderr`` attribute.
    """
    # Build inside a private temporary directory so an existing ./a.out is
    # never overwritten and the binary is removed even when an error occurs.
    with tempfile.TemporaryDirectory() as build_dir:
        binary_path = os.path.join(build_dir, "a.out")

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to clang verbatim.
        compile_cmd = ["clang", *shlex.split(str(flag)), str(code_path), "-o", binary_path]
        subprocess.run(
            compile_cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        subprocess.run([binary_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        exec_time = time.perf_counter() - start_time

    return exec_time

## Predicting the Best Compiler Flag

In [8]:
def predict_best_flag(code_path):
    """Predict a compiler flag for a C file and compare it against ``-O0``.

    Extracts features from the file, asks ``rf_model`` for a flag, then
    compiles and runs the program once with ``-O0`` and once with the
    predicted flag, printing both timings and the relative improvement.

    Parameters
    ----------
    code_path : str or path-like
        Path of the C source file to analyse, compile and run.

    Returns
    -------
    The first element of ``rf_model.predict(...)``, i.e. the predicted flag.
    """
    features = extract_features_from_c(code_path)
    print("Extracted Features:", features)

    # A model fitted on a DataFrame remembers its column names; predicting
    # with a bare ndarray then triggers a feature-name mismatch warning.
    # Re-attach the names when the model exposes them.
    feature_names = getattr(rf_model, "feature_names_in_", None)
    if feature_names is None:
        model_input = features
    else:
        model_input = pd.DataFrame(features, columns=feature_names)

    best_flag = rf_model.predict(model_input)[0]
    print("Predicted Best Flag:", best_flag)

    exec_time_before = measure_exec_time(code_path, "-O0")
    print(f"Execution Time Before (-O0): {exec_time_before:.6f} sec")

    exec_time_after = measure_exec_time(code_path, best_flag)
    print(f"Execution Time After ({best_flag}): {exec_time_after:.6f} sec")

    # Guard the ratio: a baseline of zero would raise ZeroDivisionError.
    if exec_time_before > 0:
        improvement = (exec_time_before - exec_time_after) / exec_time_before * 100
        print(f"Performance Improvement: {improvement:.2f}%")
    else:
        print("Performance Improvement: n/a (baseline time was zero)")

    return best_flag

## Example Usage

In [9]:
# Uncomment to run the whole pipeline on one file. predict_best_flag compiles
# the program with clang and executes it twice (once with -O0, once with the
# predicted flag), so clang must be installed and the path must exist.
# predict_best_flag("example_c_files__for_model/program_1.c")  # Change to actual C file path