In [2]:
import pandas as pd
import json
from intertrans.data import load_as_df, get_percentage_timeout, read_engine_output

# Render floats in displayed DataFrames with a single decimal place.
pd.options.display.precision = 1

## RQ1: How effective is INTERTRANS compared to direct translation and other baselines?

#### Files for the baselines

In [3]:
# Baseline engine outputs (JSON) for the TransCoder dataset, one frame per model.
# NOTE(review): the `ca10` file suffix presumably means 10 candidates per request - confirm.
df_codellama_13b_transcoder_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/codellama_13b_transcoder_results_all_depth4_ca10.json')
df_magicoder_transcoder_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/magicoder_transcoder_results_all_depth4_ca10.json')
df_starcoder2_transcoder_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/starcoder2_transcoder_results_all_depth4_ca10.json')

In [4]:
# Baseline engine outputs (JSON) for the HumanEval-X dataset, one frame per model.
# NOTE(review): these files carry a `ca85` suffix (the other datasets use `ca10`);
# the table functions below only keep the first 1 / 10 rows per request - confirm
# that row order within a request is the intended candidate order.
df_codellama_13b_humanevalx_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_results_sub_depth4_ca85.json')
df_magicoder_humanevalx_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/magicoder_humanevalx_results_sub_depth4_ca85.json')
df_starcoder2_humanevalx_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/starcoder2_humanevalx_results_sub_depth4_ca85.json')

In [5]:
# Baseline engine outputs (JSON) for the CodeNet dataset, one frame per model.
# NOTE(review): the `ca10` file suffix presumably means 10 candidates per request - confirm.
df_codellama_13b_codenet_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/codellama_13b_codenet_results_sub_depth4_ca10.json')
df_magicoder_codenet_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/magicoder_codenet_results_sub_depth4_ca10.json')
df_starcoder2_codenet_noverify_baselines = load_as_df('../data/raw_outputs/engine/noverify/starcoder2_codenet_results_sub_depth4_ca10.json')

#### Files for InterTrans

In [6]:
# InterTrans engine outputs (CSV) for the TransCoder dataset, one frame per model.
df_codellama_13b_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/codellama_13b_transcoder_results_all_depth4.csv')
df_magicoder_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/magicoder_transcoder_results_all_depth4.csv')
df_starcoder2_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/starcoder2_transcoder_results_all_depth4.csv')

In [7]:
# InterTrans engine outputs (CSV) for the HumanEval-X dataset, one frame per model.
df_codellama_13b_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_results_sub_depth4.csv')
df_magicoder_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/magicoder_humanevalx_results_sub_depth4.csv')
df_starcoder2_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/starcoder2_humanevalx_results_sub_depth4.csv')

In [8]:
# InterTrans engine outputs (CSV) for the CodeNet dataset, one frame per model.
df_codellama_13b_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/codellama_13b_codenet_results_sub_depth4.csv')
df_magicoder_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/magicoder_codenet_results_sub_depth4.csv')
df_starcoder2_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/noverify/starcoder2_codenet_results_sub_depth4.csv')

In [9]:
# Lookup table of every result frame used in RQ1, addressed as
# data_dict[dataset][model][approach], where approach is either
# 'intertrans' or 'baseline'.
data_dict = dict(
    humanevalx=dict(
        magicoder=dict(
            intertrans=df_magicoder_humanevalx_noverify,
            baseline=df_magicoder_humanevalx_noverify_baselines,
        ),
        codellama_13b=dict(
            intertrans=df_codellama_13b_humanevalx_noverify,
            baseline=df_codellama_13b_humanevalx_noverify_baselines,
        ),
        starcoder2=dict(
            intertrans=df_starcoder2_humanevalx_noverify,
            baseline=df_starcoder2_humanevalx_noverify_baselines,
        ),
    ),
    codenet=dict(
        magicoder=dict(
            intertrans=df_magicoder_codenet_noverify,
            baseline=df_magicoder_codenet_noverify_baselines,
        ),
        codellama_13b=dict(
            intertrans=df_codellama_13b_codenet_noverify,
            baseline=df_codellama_13b_codenet_noverify_baselines,
        ),
        starcoder2=dict(
            intertrans=df_starcoder2_codenet_noverify,
            baseline=df_starcoder2_codenet_noverify_baselines,
        ),
    ),
    transcoder=dict(
        magicoder=dict(
            intertrans=df_magicoder_transcoder_noverify,
            baseline=df_magicoder_transcoder_noverify_baselines,
        ),
        codellama_13b=dict(
            intertrans=df_codellama_13b_transcoder_noverify,
            baseline=df_codellama_13b_transcoder_noverify_baselines,
        ),
        starcoder2=dict(
            intertrans=df_starcoder2_transcoder_noverify,
            baseline=df_starcoder2_transcoder_noverify_baselines,
        ),
    ),
)

In [10]:
def get_metrics(df):
    """Summarise translation outcomes for a frame of engine results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per translation attempt. Must provide the columns
        ``request_id``, ``status`` and ``failed_timeout``.

    Returns
    -------
    tuple
        ``(ca_direct, msr, total_timeout, total)``:

        * ``ca_direct``: percentage of counted requests that have at least
          one row with status ``"TRANSLATION_FOUND"``.
        * ``msr``: percentage of rows whose status is not
          ``"FAILED_NO_EXTRACTED"``.
        * ``total_timeout``: percentage of rows where ``failed_timeout``
          equals ``True``.
        * ``total``: number of requests with at least one truthy status;
          this is the denominator of ``ca_direct``.

        A percentage whose denominator is zero (for example when ``df`` is
        empty) is returned as NaN instead of raising ``ZeroDivisionError``.
    """
    nan = float("nan")

    row_count = df.shape[0]
    if row_count == 0:
        # Nothing to measure: every ratio is undefined.
        return nan, nan, nan, 0

    # Share of rows for which something could be extracted.
    not_extracted = int((df["status"] == "FAILED_NO_EXTRACTED").sum())
    msr = (row_count - not_extracted) / row_count * 100

    # Share of rows that failed due to a timeout. The explicit comparison
    # with True is kept so that missing values are simply not counted.
    timed_out = int((df["failed_timeout"] == True).sum())  # noqa: E712
    total_timeout = timed_out / row_count * 100

    # Requests that contribute to the denominator of the accuracy.
    total = int(df.groupby("request_id")["status"].any().sum())

    # Requests with at least one successful translation. Every row kept by
    # the filter has the same non-empty status, so counting distinct request
    # ids matches a per-request ``any()`` and also copes with an empty filter.
    successful = df[df["status"] == "TRANSLATION_FOUND"]
    solved = int(successful["request_id"].nunique())

    ca_direct = solved / total * 100 if total else nan

    return ca_direct, msr, total_timeout, total

In [11]:
def get_table_metrics(data_dict):
    """Compare baseline and InterTrans accuracy for every dataset/model pair.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``{dataset: {model: {'intertrans': df, 'baseline': df}}}``.
        Each frame is passed to ``get_metrics`` and the baseline frame is
        additionally grouped by ``request_id``.

    Returns
    -------
    pandas.DataFrame
        Metrics as rows and ``(Evaluation Dataset, Model Name)`` as columns,
        sorted by dataset and then model. The relative difference is NaN when
        the baseline CA@10 is 0, since the ratio is undefined there.
    """
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            df_intertrans = frames['intertrans']
            df_baseline = frames['baseline']

            # Keep only the first K rows of every request for the baseline (CA@K).
            baseline_by_request = df_baseline.groupby('request_id')
            ca_at_1 = get_metrics(baseline_by_request.head(1))[0]
            ca_at_10 = get_metrics(baseline_by_request.head(10))[0]

            ca_intertrans = get_metrics(df_intertrans)[0]

            absolute_increase = ca_intertrans - ca_at_10
            # A baseline of 0 has no meaningful relative improvement.
            if ca_at_10:
                relative_increase = absolute_increase / ca_at_10 * 100
            else:
                relative_increase = float('nan')

            index_tuples.append((evaluation_type, model_name))
            data_values.append([ca_at_1, ca_at_10, ca_intertrans, absolute_increase, relative_increase])

    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name"])

    df_multi = pd.DataFrame(
        data_values,
        index=multi_index,
        columns=["Direct Translation (baseline CA@1)", "Direct Translation (baseline CA@10)", "InterTrans (CA)", "Diff from @10", "Rel Diff@10"],
    ).sort_values(by=["Evaluation Dataset", "Model Name"])

    # Present one column per (dataset, model) pair.
    return df_multi.transpose()

In [12]:
# Baseline CA@1 / CA@10 versus InterTrans CA for every dataset/model pair.
get_table_metrics(data_dict)

Evaluation Dataset,codenet,codenet,codenet,humanevalx,humanevalx,humanevalx,transcoder,transcoder,transcoder
Model Name,codellama_13b,magicoder,starcoder2,codellama_13b,magicoder,starcoder2,codellama_13b,magicoder,starcoder2
Direct Translation (baseline CA@1),25.7,48.2,35.0,60.5,65.9,48.9,72.9,59.5,72.2
Direct Translation (baseline CA@10),34.6,49.0,41.0,71.3,66.9,51.0,74.9,59.5,73.2
InterTrans (CA),60.8,87.3,84.4,89.7,95.4,82.5,93.2,90.8,93.8
Diff from @10,26.2,38.3,43.3,18.4,28.6,31.5,18.3,31.3,20.6
Rel Diff@10,75.8,78.1,105.6,25.8,42.7,61.9,24.5,52.6,28.2


In [13]:
def get_table_metrics_source(data_dict):
    """Compare baseline and InterTrans accuracy per dataset, model and source language.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``{dataset: {model: {'intertrans': df, 'baseline': df}}}``.
        Both frames must provide a ``seed_language`` column; the source
        languages are taken from the InterTrans frame and the baseline frame
        is filtered to the same language.

    Returns
    -------
    pandas.DataFrame
        Metrics as rows and ``(Evaluation Dataset, Model Name, Source)`` as
        columns, sorted by model, dataset and source. The relative difference
        is NaN when the baseline CA@10 is 0, since the ratio is undefined there.
    """
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            df_intertrans = frames['intertrans']
            df_baseline = frames['baseline']

            # Group by the scalar key so the group label is the language
            # string itself on every pandas version. Grouping by a one-element
            # list yields 1-tuples on pandas >= 2 but plain strings before
            # that, where indexing with [0] would silently take the first
            # character of the language name.
            for source_language, group_intertrans in df_intertrans.groupby("seed_language"):
                df_baseline_source = df_baseline[df_baseline.seed_language == source_language]

                # Keep only the first K rows of every request for the baseline (CA@K).
                baseline_by_request = df_baseline_source.groupby('request_id')
                ca_at_1 = get_metrics(baseline_by_request.head(1))[0]
                ca_at_10 = get_metrics(baseline_by_request.head(10))[0]

                ca_intertrans = get_metrics(group_intertrans)[0]

                absolute_increase = ca_intertrans - ca_at_10
                # A baseline of 0 has no meaningful relative improvement.
                if ca_at_10:
                    relative_increase = absolute_increase / ca_at_10 * 100
                else:
                    relative_increase = float('nan')

                index_tuples.append((evaluation_type, model_name, source_language))
                data_values.append([ca_at_1, ca_at_10, ca_intertrans, absolute_increase, relative_increase])

    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name", "Source"])

    df_multi = pd.DataFrame(
        data_values,
        index=multi_index,
        columns=["CA@1", "CA@10", "InterTrans CA", "Abs Diff", "Rel Diff"],
    ).sort_values(by=["Model Name", "Evaluation Dataset", "Source"])

    # Present one column per (dataset, model, source) combination.
    return df_multi.transpose()

In [14]:
# Per-source-language breakdown. The function returns metrics as rows, so `.T`
# flips it back to one row per (dataset, model, source language).
get_table_metrics_source(data_dict).T

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CA@1,CA@10,InterTrans CA,Abs Diff,Rel Diff
Evaluation Dataset,Model Name,Source,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
codenet,codellama_13b,C++,32.0,42.9,61.1,18.3,42.7
codenet,codellama_13b,Go,30.3,34.3,61.1,26.9,78.3
codenet,codellama_13b,Java,25.7,38.9,55.4,16.6,42.6
codenet,codellama_13b,JavaScript,22.3,33.7,64.6,30.9,91.5
codenet,codellama_13b,Python,14.3,19.4,57.1,37.7,194.1
codenet,codellama_13b,Rust,29.7,38.3,65.1,26.9,70.1
humanevalx,codellama_13b,C++,70.3,78.9,91.4,12.6,15.9
humanevalx,codellama_13b,Go,64.0,71.4,90.3,18.9,26.4
humanevalx,codellama_13b,Java,58.3,68.0,87.4,19.4,28.6
humanevalx,codellama_13b,JavaScript,57.1,73.1,93.1,20.0,27.3
