In [None]:
import pandas as pd
import json
from intertrans.data import load_as_df, get_percentage_timeout, read_engine_output

# Render floats in displayed tables with one digit after the decimal point.
pd.options.display.precision = 1

## How effective is INTERTRANS (Clipped) compared to direct translation and other baselines?

#### Files for the baselines

In [None]:
# Baseline engine outputs for the "transcoder" dataset, one frame per model.
df_codellama_13b_transcoder_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_transcoder_results_all_depth4_ca10.json'
)
df_magicoder_transcoder_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/magicoder_transcoder_results_all_depth4_ca10.json'
)
df_starcoder2_transcoder_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/starcoder2_transcoder_results_all_depth4_ca10.json'
)

In [None]:
# Baseline engine outputs for the "humanevalx" dataset, one frame per model.
# NOTE(review): these are the "ca85" files, whereas the other datasets use
# "ca10" files; the table code below keeps at most the first 10 rows per
# request, so confirm the row order in these files is the intended one.
df_codellama_13b_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_results_sub_depth4_ca85.json'
)
df_magicoder_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/magicoder_humanevalx_results_sub_depth4_ca85.json'
)
df_starcoder2_humanevalx_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/starcoder2_humanevalx_results_sub_depth4_ca85.json'
)

In [None]:
# Baseline engine outputs for the "codenet" dataset, one frame per model.
df_codellama_13b_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/codellama_13b_codenet_results_sub_depth4_ca10.json'
)
df_magicoder_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/magicoder_codenet_results_sub_depth4_ca10.json'
)
df_starcoder2_codenet_noverify_baselines = load_as_df(
    '../data/raw_outputs/engine/noverify/starcoder2_codenet_results_sub_depth4_ca10.json'
)

#### Files for InterTrans

In [None]:
# InterTrans engine outputs (CSV) for the "transcoder" dataset, one frame per model.
df_codellama_13b_transcoder_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/codellama_13b_transcoder_results_all_depth4.csv'
)
df_magicoder_transcoder_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/magicoder_transcoder_results_all_depth4.csv'
)
df_starcoder2_transcoder_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/starcoder2_transcoder_results_all_depth4.csv'
)

In [None]:
# InterTrans engine outputs (CSV) for the "humanevalx" dataset, one frame per model.
df_codellama_13b_humanevalx_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/codellama_13b_humanevalx_results_sub_depth4.csv'
)
df_magicoder_humanevalx_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/magicoder_humanevalx_results_sub_depth4.csv'
)
df_starcoder2_humanevalx_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/starcoder2_humanevalx_results_sub_depth4.csv'
)

In [None]:
# InterTrans engine outputs (CSV) for the "codenet" dataset, one frame per model.
df_codellama_13b_codenet_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/codellama_13b_codenet_results_sub_depth4.csv'
)
df_magicoder_codenet_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/magicoder_codenet_results_sub_depth4.csv'
)
df_starcoder2_codenet_noverify = pd.read_csv(
    '../data/raw_outputs/engine/noverify/starcoder2_codenet_results_sub_depth4.csv'
)

In [None]:
# Lookup table consumed by the table builders below:
#   dataset name -> model name -> {'intertrans': frame, 'baseline': frame}
data_dict = {
    'humanevalx': {
        'magicoder': {'intertrans': df_magicoder_humanevalx_noverify,
                      'baseline': df_magicoder_humanevalx_noverify_baselines},
        'codellama_13b': {'intertrans': df_codellama_13b_humanevalx_noverify,
                          'baseline': df_codellama_13b_humanevalx_noverify_baselines},
        'starcoder2': {'intertrans': df_starcoder2_humanevalx_noverify,
                       'baseline': df_starcoder2_humanevalx_noverify_baselines},
    },
    'codenet': {
        'magicoder': {'intertrans': df_magicoder_codenet_noverify,
                      'baseline': df_magicoder_codenet_noverify_baselines},
        'codellama_13b': {'intertrans': df_codellama_13b_codenet_noverify,
                          'baseline': df_codellama_13b_codenet_noverify_baselines},
        'starcoder2': {'intertrans': df_starcoder2_codenet_noverify,
                       'baseline': df_starcoder2_codenet_noverify_baselines},
    },
    'transcoder': {
        'magicoder': {'intertrans': df_magicoder_transcoder_noverify,
                      'baseline': df_magicoder_transcoder_noverify_baselines},
        'codellama_13b': {'intertrans': df_codellama_13b_transcoder_noverify,
                          'baseline': df_codellama_13b_transcoder_noverify_baselines},
        'starcoder2': {'intertrans': df_starcoder2_transcoder_noverify,
                       'baseline': df_starcoder2_transcoder_noverify_baselines},
    },
}

In [None]:
def get_metrics(df):
    """Summarise one engine-output frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``request_id``, ``status`` and
        ``failed_timeout``.

    Returns
    -------
    tuple
        ``(ca_direct, msr, total_timeout, total)`` where

        * ``ca_direct``: percentage of distinct requests that have at least
          one row with status ``'TRANSLATION_FOUND'``;
        * ``msr``: percentage of rows whose status is not
          ``'FAILED_NO_EXTRACTED'``;
        * ``total_timeout``: percentage of rows with ``failed_timeout`` true;
        * ``total``: number of distinct ``request_id`` values.

        For an empty frame the three percentages are ``nan`` and ``total`` is
        0, instead of raising ``ZeroDivisionError``.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to measure: report undefined percentages rather than crash,
        # so callers that slice per language/model can still build a table.
        undefined = float("nan")
        return undefined, undefined, undefined, 0

    # Share of rows for which something could be extracted.
    n_not_extracted = int((df["status"] == "FAILED_NO_EXTRACTED").sum())
    msr = (n_rows - n_not_extracted) / n_rows * 100

    # Share of rows that failed due to a timeout (NaN compares as False).
    n_timeout = int(df["failed_timeout"].eq(True).sum())
    total_timeout = n_timeout / n_rows * 100

    # Requests are counted by id directly instead of relying on the
    # truthiness of the status strings within each group.
    total = int(df["request_id"].nunique())
    found = df.loc[df["status"] == "TRANSLATION_FOUND", "request_id"]
    count_direct_translations = int(found.nunique())

    ca_direct = count_direct_translations / total * 100 if total else float("nan")

    return ca_direct, msr, total_timeout, total

In [None]:
def clip_at_k_inferences(df, clip_k):
    """Keep at most ``clip_k`` counted rows per request.

    A row is counted unless its ``status`` contains ``'SKIPPED'`` or its
    ``memoized`` flag is truthy; such rows are dropped and do not use up the
    per-request budget.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``request_id``, ``status`` and ``memoized``.
    clip_k : int
        Maximum number of counted rows to keep per ``request_id``. Values
        below 1 keep nothing.

    Returns
    -------
    pandas.DataFrame
        The kept rows with their original index labels, ordered by
        ``request_id`` and, within a request, by their order in ``df``.
        Columns and dtypes of ``df`` are preserved, including when no row
        is kept.
    """
    # Vectorised filter instead of a Python-level iterrows loop; a missing
    # status is treated as "not skipped".
    is_skipped = df['status'].str.contains('SKIPPED', regex=False, na=False)
    is_memoized = df['memoized'].astype(bool)
    counted = df[~(is_skipped | is_memoized)]

    # First clip_k counted rows of each request, in original row order.
    clipped = counted.groupby('request_id', sort=False).head(max(clip_k, 0))

    # Stable sort reproduces the group-by-group ordering (sorted request ids,
    # original order inside each request).
    return clipped.sort_values('request_id', kind='stable')
    

In [None]:
def get_table_metrics(data_dict):
    """Build the dataset x model comparison table of baseline vs InterTrans CA.

    For every (dataset, model) pair the baseline frame is reduced to the first
    1 and the first 10 rows per request, the InterTrans frame is clipped to 10
    counted inferences per request, and the CA value of each is computed with
    ``get_metrics``.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``{dataset: {model: {'intertrans': DataFrame,
        'baseline': DataFrame}}}``.

    Returns
    -------
    pandas.DataFrame
        Transposed table: one row per metric, one column per
        (Evaluation Dataset, Model Name) pair. The relative difference is
        ``nan`` when the baseline CA@10 is 0, where it is undefined.
    """
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            df_intertrans = frames['intertrans']
            df_baseline = frames['baseline']

            # Group by request and keep up to K candidates (CA@K).
            baseline_by_request = df_baseline.groupby('request_id')
            ca_at_1, _, _, _ = get_metrics(baseline_by_request.head(1))
            ca_at_10, _, _, _ = get_metrics(baseline_by_request.head(10))

            df_intertrans_clipped = clip_at_k_inferences(df_intertrans, 10)
            ca_intertrans, _, _, _ = get_metrics(df_intertrans_clipped)

            absolute_increase = ca_intertrans - ca_at_10
            # A baseline of exactly 0 would make the ratio a division by zero.
            if ca_at_10:
                relative_increase = absolute_increase / ca_at_10 * 100
            else:
                relative_increase = float('nan')

            index_tuples.append((evaluation_type, model_name))
            data_values.append([ca_at_1, ca_at_10, ca_intertrans, absolute_increase, relative_increase])

    # Create a multi-index from the tuples
    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name"])

    # Create the dataframe
    df_multi = pd.DataFrame(
        data_values,
        index=multi_index,
        columns=[
            "Direct Translation (baseline CA@1)",
            "Direct Translation (baseline CA@10)",
            "InterTrans Clipped (CA)",
            "Diff from @10",
            "Rel Diff@10",
        ],
    ).sort_values(by=["Evaluation Dataset", "Model Name"])

    # Metrics as rows, (dataset, model) pairs as columns.
    return df_multi.transpose()

In [None]:
import numpy as np
def pass_at_k(n, c, k):
    """Estimate pass@k from ``n`` samples of which ``c`` are correct.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: the k in pass@k
    :return: 1.0 when fewer than ``k`` samples are incorrect, otherwise
        ``1 - prod(1 - k / i)`` for ``i`` in ``n - c + 1 .. n``
    """
    incorrect = n - c
    if incorrect < k:
        return 1.0
    denominators = np.arange(incorrect + 1, n + 1)
    return 1.0 - np.prod(1.0 - k / denominators)

In [None]:
# Sanity check: n - c = 9 is smaller than k = 10, so the early-return branch gives 1.0.
pass_at_k(10, 1, 10)

In [None]:
# Display the comparison table: metrics as rows, (dataset, model) pairs as columns.
get_table_metrics(data_dict)

In [None]:
def get_table_metrics_source(data_dict):
    """Build the comparison table broken down by source (seed) language.

    For every (dataset, model) pair the InterTrans frame is split by
    ``seed_language``; for each language the matching baseline rows are
    reduced to the first 1 and first 10 rows per request and the CA value of
    each is computed with ``get_metrics``.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``{dataset: {model: {'intertrans': DataFrame,
        'baseline': DataFrame}}}``; both frames need a ``seed_language``
        column.

    Returns
    -------
    pandas.DataFrame
        Transposed table: one row per metric, one column per
        (Evaluation Dataset, Model Name, Source) triple. The relative
        difference is ``nan`` when the baseline CA@10 is 0, where it is
        undefined.
    """
    index_tuples = []
    data_values = []

    for evaluation_type, models in data_dict.items():
        for model_name, frames in models.items():
            df_intertrans = frames['intertrans']
            df_baseline = frames['baseline']

            # Group on the scalar column name so the group label is the
            # language string itself on every pandas version (a one-element
            # list yields 1-tuples on some versions and plain strings on
            # others, which makes indexing the label unreliable).
            for source_language, group_intertrans in df_intertrans.groupby("seed_language"):
                df_baseline_source = df_baseline[df_baseline.seed_language == source_language]

                # Group by request and keep up to K candidates (CA@K).
                baseline_by_request = df_baseline_source.groupby('request_id')
                ca_at_1, _, _, _ = get_metrics(baseline_by_request.head(1))
                ca_at_10, _, _, _ = get_metrics(baseline_by_request.head(10))

                # NOTE(review): unlike get_table_metrics, the InterTrans rows
                # are not passed through clip_at_k_inferences here. Confirm
                # that comparing the unclipped frame is intended.
                ca_intertrans, _, _, _ = get_metrics(group_intertrans)

                absolute_increase = ca_intertrans - ca_at_10
                # A baseline of exactly 0 would make the ratio a division by zero.
                if ca_at_10:
                    relative_increase = absolute_increase / ca_at_10 * 100
                else:
                    relative_increase = float('nan')

                index_tuples.append((evaluation_type, model_name, source_language))
                data_values.append([ca_at_1, ca_at_10, ca_intertrans, absolute_increase, relative_increase])

    # Create a multi-index from the tuples
    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name", "Source"])

    # Create the dataframe
    df_multi = pd.DataFrame(
        data_values,
        index=multi_index,
        columns=["CA@1", "CA@10", "InterTrans CA", "Abs Diff", "Rel Diff"],
    ).sort_values(by=["Model Name", "Evaluation Dataset", "Source"])

    # Metrics as rows, (dataset, model, source) triples as columns.
    return df_multi.transpose()

In [None]:
# get_table_metrics_source returns a transposed frame; .T flips it back so that
# each (dataset, model, source) combination is displayed as a row.
get_table_metrics_source(data_dict).T