In [2]:
import pandas as pd
import json
from intertrans.data import load_as_df, get_percentage_timeout, read_engine_output

# Render floats with one digit after the decimal point in displayed tables.
pd.set_option("display.precision", 1)

## How effective is INTERTRANS compared to direct translation on fine-tuned models?

### Files for the SFT baselines

In [3]:
# SFT baseline outputs for HumanEval-X: one JSON-lines file per model,
# each read into its own DataFrame.
codellama_humanevalx_sft = pd.read_json(
    '../data/raw_outputs/sft/codellama-13b-humanevalx-sft-ca10.jsonl',
    lines=True,
)
magicoder_humanevalx_sft = pd.read_json(
    '../data/raw_outputs/sft/magicoder-humanevalx-sft-ca10.jsonl',
    lines=True,
)
starcoder_humanevalx_sft = pd.read_json(
    '../data/raw_outputs/sft/starcoder-humanevalx-sft-ca10.jsonl',
    lines=True,
)

In [4]:
# SFT baseline outputs for CodeNet: one JSON-lines file per model,
# each read into its own DataFrame.
magicoder_codenet_sft = pd.read_json(
    '../data/raw_outputs/sft/magicoder-codenet-sft-ca10.jsonl',
    lines=True,
)
starcoder_codenet_sft = pd.read_json(
    '../data/raw_outputs/sft/starcoder-codenet-sft-ca10.jsonl',
    lines=True,
)
codellama_codenet_sft = pd.read_json(
    '../data/raw_outputs/sft/codellama-codenet-sft-ca10.jsonl',
    lines=True,
)

In [5]:
# Engine output for the CodeLlama-13B HumanEval-X SFT run.
# NOTE(review): no active code in the cells shown here reads this frame.
codellama_vllm = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_sft_ca10_tpu.json'
)

### Files for the original baselines

In [6]:
# Original baseline engine outputs ("noverify" runs) for HumanEval-X,
# one result file per model.
df_codellama_13b_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_results_sub_depth4_ca85.json'
)
df_magicoder_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/magicoder_humanevalx_results_sub_depth4_ca85.json'
)
df_starcoder2_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/starcoder2_humanevalx_results_sub_depth4_ca85.json'
)

In [7]:
# Original baseline engine outputs ("noverify" runs) for CodeNet,
# one result file per model.
df_codellama_13b_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_codenet_results_sub_depth4_ca10.json'
)
df_magicoder_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/magicoder_codenet_results_sub_depth4_ca10.json'
)
df_starcoder2_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/starcoder2_codenet_results_sub_depth4_ca10.json'
)

In [8]:
# Lookup consumed by the table helpers below:
#   dataset name -> model name -> {'sft': SFT frame, 'baseline': baseline frame}
# Only HumanEval-X and CodeNet are compared here.
data_dict = {
    'humanevalx': {
        'magicoder': {'sft': magicoder_humanevalx_sft, 'baseline': df_magicoder_humanevalx_noverify_baselines},
        'codellama_13b': {'sft': codellama_humanevalx_sft, 'baseline': df_codellama_13b_humanevalx_noverify_baselines},
        'starcoder2': {'sft': starcoder_humanevalx_sft, 'baseline': df_starcoder2_humanevalx_noverify_baselines},
    },
    'codenet': {
        'magicoder': {'sft': magicoder_codenet_sft, 'baseline': df_magicoder_codenet_noverify_baselines},
        'codellama_13b': {'sft': codellama_codenet_sft, 'baseline': df_codellama_13b_codenet_noverify_baselines},
        'starcoder2': {'sft': starcoder_codenet_sft, 'baseline': df_starcoder2_codenet_noverify_baselines},
    },
}

In [9]:
def get_metrics(df):
    """Summarise one frame of translation candidates.

    The frame is read through three columns: ``status``, ``failed_timeout``
    and ``request_id`` (one row per candidate). The caller picks K for CA@K
    by truncating the frame to at most K rows per request before calling.

    Returns:
        A 4-tuple ``(ca, msr, timeout_pct, total)``:

        * ``ca`` -- percentage of distinct requests with at least one row
          whose status is ``"TRANSLATION_FOUND"``.
        * ``msr`` -- percentage of rows whose status is not
          ``"FAILED_NO_EXTRACTED"``.
        * ``timeout_pct`` -- percentage of rows whose ``failed_timeout``
          equals ``True``.
        * ``total`` -- number of distinct ``request_id`` values.

        An empty frame yields NaN for the three percentages and ``0`` for
        ``total`` instead of raising ``ZeroDivisionError``.
    """
    nan = float("nan")
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to measure; avoid dividing by zero below.
        return nan, nan, nan, 0

    status = df["status"]

    n_not_extracted = int((status == "FAILED_NO_EXTRACTED").sum())
    msr = (n_rows - n_not_extracted) / n_rows * 100

    # Explicit comparison with True so missing values count as "no timeout".
    n_timeout = int(df["failed_timeout"].eq(True).sum())
    total_timeout = n_timeout / n_rows * 100

    # Count requests directly instead of relying on the truthiness of the
    # status strings inside a groupby(...).any().
    total = int(df["request_id"].nunique())
    n_translated = int(df.loc[status == "TRANSLATION_FOUND", "request_id"].nunique())

    # total can still be 0 when every request_id is missing.
    ca_direct = n_translated / total * 100 if total else nan

    return ca_direct, msr, total_timeout, total

In [10]:
def get_table_metrics(data_dict):
    """Build the baseline-vs-SFT comparison table.

    Args:
        data_dict: Nested mapping ``dataset -> model -> {'sft': frame,
            'baseline': frame}``. Both frames are grouped by ``request_id``
            and passed to ``get_metrics``.

    Returns:
        A DataFrame with one row per metric and one column per
        ``(Evaluation Dataset, Model Name)`` pair, sorted by those two
        levels. The relative difference is NaN when the baseline CA@10 is 0,
        because the ratio is undefined there.
    """
    nan = float("nan")
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            baseline_by_request = frames['baseline'].groupby('request_id')
            sft_by_request = frames['sft'].groupby('request_id')

            # CA@K: keep the first K candidates of every request.
            ca_at_1, _, _, _ = get_metrics(baseline_by_request.head(1))
            ca_at_10, msr_at_10, _, _ = get_metrics(baseline_by_request.head(10))
            ca_at_1_sft, _, _, _ = get_metrics(sft_by_request.head(1))
            ca_at_10_sft, msr_at_10_sft, _, _ = get_metrics(sft_by_request.head(10))

            abs_diff = ca_at_10_sft - ca_at_10
            # A 0% baseline would otherwise raise ZeroDivisionError.
            relative_increase = abs_diff / ca_at_10 * 100 if ca_at_10 else nan

            index_tuples.append((evaluation_type, model_name))
            data_values.append([
                ca_at_1,
                ca_at_10,
                ca_at_1_sft,
                ca_at_10_sft,
                abs_diff,
                relative_increase,
                msr_at_10,
                msr_at_10_sft,
            ])

    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name"])

    # NOTE(review): "SFT baseline CA@10)" has a stray ")"; the label is kept
    # verbatim so anything selecting rows by label keeps working.
    column_labels = [
        "Direct Translation (baseline CA@1)",
        "Direct Translation (baseline CA@10)",
        "SFT baseline CA@1",
        "SFT baseline CA@10)",
        "Diff from @10",
        "Rel Diff@10",
        'MSR Base',
        'MSR SFT',
    ]
    df_multi = pd.DataFrame(data_values, index=multi_index, columns=column_labels).sort_values(
        by=["Evaluation Dataset", "Model Name"]
    )

    # Metrics as rows, (dataset, model) pairs as columns.
    return df_multi.transpose()

In [None]:
# Display the comparison table: metrics as rows, one column per (dataset, model) pair.
get_table_metrics(data_dict)

In [12]:
def get_table_metrics_source(data_dict):
    """Build the baseline-vs-SFT comparison broken down by source language.

    Args:
        data_dict: Nested mapping ``dataset -> model -> {'sft': frame,
            'baseline': frame}``. Both frames need ``seed_language`` and
            ``request_id`` columns plus whatever ``get_metrics`` reads.

    Returns:
        A DataFrame with one row per metric (``CA@1``, ``CA@10``,
        ``SFT CA@10``, ``Abs Diff``, ``Rel Diff``) and one column per
        ``(Evaluation Dataset, Model Name, Source)`` triple. Baseline values
        are NaN when the baseline has no rows for a source language, and
        ``Rel Diff`` is NaN when the baseline CA@10 is 0 or NaN.
    """
    nan = float("nan")
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            df_sft = frames['sft']
            df_baseline = frames['baseline']

            # Group by the bare column name, not a one-element list: the list
            # form yields tuple keys only on newer pandas, so indexing the key
            # with [0] is version-dependent.
            for source_language, group_sft in df_sft.groupby("seed_language"):
                df_baseline_source = df_baseline[df_baseline.seed_language == source_language]

                if df_baseline_source.empty:
                    # The baseline never saw this source language.
                    ca_at_1 = ca_at_10 = nan
                else:
                    # CA@K: keep the first K candidates of every request.
                    baseline_by_request = df_baseline_source.groupby('request_id')
                    ca_at_1 = get_metrics(baseline_by_request.head(1))[0]
                    ca_at_10 = get_metrics(baseline_by_request.head(10))[0]

                # Truncate the SFT candidates as well so the "SFT CA@10"
                # column really is CA@10, matching get_table_metrics.
                ca_sft = get_metrics(group_sft.groupby('request_id').head(10))[0]

                abs_diff = ca_sft - ca_at_10
                # A 0% baseline would otherwise raise ZeroDivisionError.
                relative_increase = abs_diff / ca_at_10 * 100 if ca_at_10 else nan

                index_tuples.append((evaluation_type, model_name, source_language))
                data_values.append([ca_at_1, ca_at_10, ca_sft, abs_diff, relative_increase])

    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name", "Source"])

    df_multi = pd.DataFrame(
        data_values,
        index=multi_index,
        columns=["CA@1", "CA@10", "SFT CA@10", "Abs Diff", "Rel Diff"],
    ).sort_values(by=["Model Name", "Evaluation Dataset", "Source"])

    # Metrics as rows, (dataset, model, source) triples as columns.
    return df_multi.transpose()

In [None]:
# The helper returns metrics as rows; transpose back so each row is one
# (dataset, model, source language) combination.
get_table_metrics_source(data_dict).T