## AIDE experiment results

In [None]:
import json
import numpy as np
import pandas as pd
import os
import re

# Root directory of one AIDE run group; every sub-directory is treated as the
# run of one competition.
root_path = '/home/v-yuanteli/aide_gpt_4o_our_results/2025-03-04T01-11-04-GMT_run-group_aide'
all_dfs = []

# Fields read from the first entry of a grading report's 'competition_reports'.
report_fields = ['score', 'rank', 'any_medal', 'gold_medal', 'silver_medal', 'bronze_medal',
                 'above_median', 'submission_exists', 'valid_submission']
# Column names the report fields are stored under ('score' is kept as 'score_loop').
grade_cols = ['score_loop'] + report_fields[1:]
# Columns that are carried forward from the previous step when a step is not a new best.
carried_cols = ['score'] + report_fields[1:]

for competition in os.listdir(root_path):
    competition_path = os.path.join(root_path, competition)

    if not os.path.isdir(competition_path):
        continue

    file_path = os.path.join(competition_path, 'logs/journal.json')
    log_path = os.path.join(competition_path, 'logs/aide.log')
    base_path = competition_path

    # Both the journal and the log are required; skip runs missing either one.
    if not os.path.exists(file_path) or not os.path.exists(log_path):
        continue

    # extract JSON for each loop's info
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    if not content:
        print(f"Error: {file_path} is empty, skipping this competition.")
        continue
    try:
        journal = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error: read {file_path} failed, error info is : {e}")
        continue

    records = [
        {
            'step': node.get('step'),
            'id': node.get('id'),
            # An explicit null metric would break the later `.get` calls; use {} instead.
            'metric': node.get('metric') or {},
            'is_buggy': node.get('is_buggy'),
        }
        for node in journal.get('nodes', [])
    ]
    # Explicit columns keep the frame well-formed even when the journal has no nodes.
    df = pd.DataFrame(records, columns=['step', 'id', 'metric', 'is_buggy'])

    # extract time: one timestamp per "generating code" log line
    with open(log_path, 'r') as file:
        log = file.read()

    pattern = r'\[(.*?)\] INFO: Agent is generating code, parent node type'
    times = re.findall(pattern, log)

    # More timestamps than journal nodes: add one placeholder row for the step that
    # was started but not recorded in the journal.
    if len(times) > len(df):
        # assumes steps are 0-based when the journal is empty — TODO confirm
        next_step = df['step'].iloc[-1] + 1 if len(df) else 0
        new_row = pd.DataFrame([{'step': next_step, 'id': '', 'metric': {}, 'is_buggy': None}])
        df = pd.concat([df, new_row], ignore_index=True) if len(df) else new_row

    # Pad with None when the log holds fewer timestamps than rows, so the column
    # assignment cannot fail on a length mismatch.
    step_times = times[:len(df)]
    step_times += [None] * (len(df) - len(step_times))
    df['times'] = step_times

    # The optimisation direction is taken from the first metric that declares one.
    step_metrics = df['metric'].tolist()
    maximize = next(
        (m['maximize'] for m in step_metrics if m.get('maximize') is not None),
        None,
    )

    if maximize is None:
        sota_flags = [None] * len(df)
        df['sota'] = None
    else:
        # Running best, tracked explicitly instead of through a mutable default argument.
        best = -np.inf if maximize else np.inf
        sota_flags = []
        for m in step_metrics:
            value = m.get('value')
            if value is None:
                sota_flags.append(None)
                continue
            improved = bool(value > best) if maximize else bool(value < best)
            if improved:
                best = value
            sota_flags.append(improved)
        df['sota'] = sota_flags

    # extract grading data from the 'grading_output_<N>' folders
    grading_folders = [
        f for f in os.listdir(base_path)
        if f.startswith('grading_output_') and os.path.isdir(os.path.join(base_path, f))
    ]
    scores = []

    for folder in grading_folders:
        folder_path = os.path.join(base_path, folder)
        folder_number = int(folder.split('_')[-1])
        json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if len(json_files) != 1:
            raise ValueError(
                f"Expected exactly one JSON file in {folder_path}, but found {len(json_files)}"
            )

        json_path = os.path.join(folder_path, json_files[0])
        with open(json_path, 'r', encoding='utf-8') as file:
            grading = json.load(file)
        competition_reports = grading.get('competition_reports', [])
        # Only the first report is used; a missing report yields None for every field.
        report = competition_reports[0] if competition_reports else {}
        entry = {'folder_number': folder_number}
        for field, col in zip(report_fields, grade_cols):
            entry[col] = report.get(field)
        scores.append(entry)

    scores_df = pd.DataFrame(scores)

    if not scores_df.empty:
        # The folder number is matched against the journal step.
        by_step = scores_df.sort_values(by='folder_number').set_index('folder_number')
        for col in grade_cols:
            df[col] = df['step'].map(by_step[col].to_dict())
    else:
        for col in grade_cols:
            df[col] = None

    # 'score_loop' keeps the raw per-step grade; 'score' becomes the carried-forward one.
    df['score'] = df['score_loop'].copy()

    # Any step that is not a new best inherits the grading result of the previous row.
    for i in range(1, len(df)):
        if sota_flags[i] is not True:
            for col in carried_cols:
                df.at[i, col] = df.at[i - 1, col]

    df['competition'] = competition
    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)

In [None]:
# Write the combined per-step results to CSV without the row index.
final_df.to_csv(
    path_or_buf='aide_gpt_4o_our_results.csv',
    index=False,
)

---

## Calculate AIDE experiment results within the first 11 hours

In [None]:
import pandas as pd

# Grading-report columns for which a ratio is reported.
metrics = ["any_medal", "gold_medal", "silver_medal", "bronze_medal", "above_median", "submission_exists", "valid_submission"]
df = pd.read_csv('aide_gpt_4o_our_results.csv')

# Elapsed time is measured from each competition's own first timestamp rather than
# from the earliest timestamp in the whole file, so a run that started later than
# the others is not charged for the time before it began.
# NOTE(review): assumes the time budget is meant per competition run — confirm.
timestamps = pd.to_datetime(df['times'])
df['relative_time'] = timestamps - timestamps.groupby(df['competition']).transform('min')

def compute_ratio_within_time(df, metric, hours, total_competitions=22):
    """Return the ratio of competitions whose latest in-window row has ``metric`` set.

    For every competition, the row with the largest ``relative_time`` that is
    still within ``hours`` is selected; the ``metric`` values of those rows are
    summed and divided by ``total_competitions``.

    Args:
        df: DataFrame with ``competition``, ``relative_time`` (timedelta) and
            ``metric`` columns.
        metric: Name of the column to sum over the selected rows.
        hours: Size of the time window, in hours.
        total_competitions: Denominator of the ratio. Defaults to 22, the value
            that used to be hard-coded; competitions without any row inside the
            window still count towards it.

    Returns:
        The ratio, or 0 when no row falls inside the window.

    Raises:
        ValueError: If ``total_competitions`` is not positive.
    """
    if total_competitions <= 0:
        raise ValueError(f"total_competitions must be positive, got {total_competitions}")

    df_time = df[df['relative_time'] <= pd.Timedelta(hours=hours)]
    if df_time.empty:
        return 0

    # Label-based lookup of each competition's latest row; this replaces
    # groupby().apply(), which also operated on the grouping column.
    last_idx = df_time.groupby('competition')['relative_time'].idxmax()
    last_entries = df_time.loc[last_idx]
    if last_entries.empty:
        return 0
    return last_entries[metric].sum() / total_competitions

# Per-metric ratio within the first 11 hours, as a percentage rounded to one decimal.
aide_ratios_11h = dict(
    zip(
        metrics,
        (round(compute_ratio_within_time(df, name, 11) * 100, 1) for name in metrics),
    )
)
aide_ratios_11h

---

In [None]:
import json
import numpy as np
import pandas as pd
import os
import re

# Root directory of one AIDE run group; every sub-directory is treated as the
# run of one competition.
root_path = '/home/v-yuanteli/aide_gpt_4o_our_results/2025-03-04T01-11-04-GMT_run-group_aide'
all_dfs = []

for competition in os.listdir(root_path):
    competition_path = os.path.join(root_path, competition)
    # Only runs whose directory name starts with this competition are inspected here.
    if not competition.startswith('aerial-cactus-identification'):
        continue
    if not os.path.isdir(competition_path):
        continue

    file_path = os.path.join(competition_path, 'logs/journal.json')
    log_path = os.path.join(competition_path, 'logs/aide.log')
    base_path = competition_path

    # Both the journal and the log are required; skip runs missing either one.
    if not os.path.exists(file_path) or not os.path.exists(log_path):
        continue

    # extract JSON for each loop's info
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    if not content:
        print(f"Error: {file_path} is empty, skipping this competition.")
        continue
    try:
        journal = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error: read {file_path} failed, error info is : {e}")
        continue

    records = [
        {
            'step': node.get('step'),
            'id': node.get('id'),
            # An explicit null metric would break the later `.get` calls; use {} instead.
            'metric': node.get('metric') or {},
            'is_buggy': node.get('is_buggy'),
        }
        for node in journal.get('nodes', [])
    ]
    # Explicit columns keep the frame well-formed even when the journal has no nodes.
    df = pd.DataFrame(records, columns=['step', 'id', 'metric', 'is_buggy'])

    # extract time: one timestamp per "generating code" log line
    with open(log_path, 'r') as file:
        log = file.read()

    pattern = r'\[(.*?)\] INFO: Agent is generating code, parent node type'
    times = re.findall(pattern, log)

    # More timestamps than journal nodes: add one placeholder row for the step that
    # was started but not recorded in the journal.
    if len(times) > len(df):
        # assumes steps are 0-based when the journal is empty — TODO confirm
        next_step = df['step'].iloc[-1] + 1 if len(df) else 0
        new_row = pd.DataFrame([{'step': next_step, 'id': '', 'metric': {}, 'is_buggy': None}])
        df = pd.concat([df, new_row], ignore_index=True) if len(df) else new_row

    # Pad with None when the log holds fewer timestamps than rows, so the column
    # assignment cannot fail on a length mismatch.
    step_times = times[:len(df)]
    step_times += [None] * (len(df) - len(step_times))
    df['times'] = step_times

    # The optimisation direction is taken from the first metric that declares one.
    step_metrics = df['metric'].tolist()
    maximize = next(
        (m['maximize'] for m in step_metrics if m.get('maximize') is not None),
        None,
    )

    if maximize is None:
        sota_flags = [None] * len(df)
        df['sota'] = None
    else:
        # Running best, tracked explicitly instead of through a mutable default argument.
        best = -np.inf if maximize else np.inf
        sota_flags = []
        for m in step_metrics:
            value = m.get('value')
            if value is None:
                sota_flags.append(None)
                continue
            improved = bool(value > best) if maximize else bool(value < best)
            if improved:
                best = value
            sota_flags.append(improved)
        df['sota'] = sota_flags

    # extract grading data from the 'grading_output_<N>' folders
    grading_folders = [
        f for f in os.listdir(base_path)
        if f.startswith('grading_output_') and os.path.isdir(os.path.join(base_path, f))
    ]
    scores = []

    for folder in grading_folders:
        folder_path = os.path.join(base_path, folder)
        folder_number = int(folder.split('_')[-1])
        json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if len(json_files) != 1:
            raise ValueError(
                f"Expected exactly one JSON file in {folder_path}, but found {len(json_files)}"
            )

        json_path = os.path.join(folder_path, json_files[0])
        with open(json_path, 'r', encoding='utf-8') as file:
            grading = json.load(file)
        competition_reports = grading.get('competition_reports', [])
        # Only the first report's score is used; a missing report yields None.
        score = competition_reports[0].get('score') if competition_reports else None
        scores.append({'folder_number': folder_number, 'score_loop': score})

    scores_df = pd.DataFrame(scores)
    if not scores_df.empty:
        # The folder number is matched against the journal step.
        by_step = scores_df.sort_values(by='folder_number').set_index('folder_number')
        df['score_loop'] = df['step'].map(by_step['score_loop'].to_dict())
    else:
        df['score_loop'] = None

    # 'score_loop' keeps the raw per-step grade; 'score' becomes the carried-forward one.
    df['score'] = df['score_loop'].copy()

    # Any step that is not a new best inherits the score of the previous row.
    for i in range(1, len(df)):
        if sota_flags[i] is not True:
            df.at[i, 'score'] = df.at[i - 1, 'score']

    df['competition'] = competition
    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)

In [None]:
# Display the combined DataFrame as the cell output.
final_df