In [1]:
import pandas as pd
from intertrans.data import load_as_df
import itertools
from scipy.stats import chi2_contingency
import numpy as np

# Set the display precision globally
pd.options.display.precision = 1

## RQ2: How could varying the maxDepth affect the performance of INTERTRANS?

In [2]:
df_codellama_13b_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/codellama_13b_transcoder_results_all_depth4.csv')
df_magicoder_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/magicoder_transcoder_results_all_depth4.csv')
df_starcoder2_transcoder_noverify = pd.read_csv('../data/raw_outputs/engine/starcoder2_transcoder_results_all_depth4.csv')

df_codellama_13b_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/codellama_13b_humanevalx_results_sub_depth4.csv')
df_magicoder_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/magicoder_humanevalx_results_sub_depth4.csv')
df_starcoder2_humanevalx_noverify = pd.read_csv('../data/raw_outputs/engine/starcoder2_humanevalx_results_sub_depth4.csv')

df_codellama_13b_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/codellama_13b_codenet_results_sub_depth4.csv')
df_magicoder_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/magicoder_codenet_results_sub_depth4.csv')
df_starcoder2_codenet_noverify = pd.read_csv('../data/raw_outputs/engine/starcoder2_codenet_results_sub_depth4.csv')

In [3]:
def get_table_count_per_depth(data_dict):
    # Extract values and construct the multi-level index
    index_tuples = []
    data_values = []

    for evaluation_type, verify_dict in data_dict.items():
        for model_name, verify_status_dict in verify_dict.items():
            for verify_status, df in verify_status_dict.items():

                #Calculate metrics
                with_intermediate_translations_one_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 1)]
                with_intermediate_translations_two_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 2)]
                with_intermediate_translations_three_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 3)]
                with_intermediate_translations_four_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 4)]

                total = df.groupby('request_id')['status'].any().sum().item()
                count_total = total

                count_intermediate_translations_one_level = with_intermediate_translations_one_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_two_level = with_intermediate_translations_two_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_three_level = with_intermediate_translations_three_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_four_level = with_intermediate_translations_four_level.groupby('request_id')['status'].any().sum().item()

                index_tuples.append((evaluation_type, model_name, verify_status))
                data_values.append([count_intermediate_translations_one_level, count_intermediate_translations_two_level, count_intermediate_translations_three_level, count_intermediate_translations_four_level, count_total])

    # Create a multi-index from the tuples
    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name", "Verify Status"])

    # Create the dataframe
    df_multi = pd.DataFrame(data_values, index=multi_index, columns=["1", "2", "3", "4", "Total"])

    # Transpose the dataframe
    df_transposed = df_multi.transpose()

    return df_transposed

In [4]:
def get_table_metrics_per_depth(data_dict):
    # Extract values and construct the multi-level index
    index_tuples = []
    data_values = []

    for evaluation_type, verify_dict in data_dict.items():
        for model_name, verify_status_dict in verify_dict.items():
            for verify_status, df in verify_status_dict.items():

                #Calculate metrics
                with_intermediate_translations_one_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 1)]
                with_intermediate_translations_two_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 2)]
                with_intermediate_translations_three_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 3)]
                with_intermediate_translations_four_level = df[(df['status'] == 'TRANSLATION_FOUND') & (df['level'] <= 4)]

                total = df.groupby('request_id')['status'].any().sum().item()
                count_total = total

                count_intermediate_translations_one_level = with_intermediate_translations_one_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_two_level = with_intermediate_translations_two_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_three_level = with_intermediate_translations_three_level.groupby('request_id')['status'].any().sum().item()
                count_intermediate_translations_four_level = with_intermediate_translations_four_level.groupby('request_id')['status'].any().sum().item()

                ca_direct = count_intermediate_translations_one_level / count_total * 100
                ca_intermediates_one_level = count_intermediate_translations_one_level / count_total * 100
                ca_intermediates_two_level = count_intermediate_translations_two_level / count_total * 100
                ca_intermediates_three_level = count_intermediate_translations_three_level / count_total * 100
                ca_intermediate_translations_four_level = count_intermediate_translations_four_level / count_total * 100

                diff_from_baseline_one_level = ca_intermediates_one_level - ca_direct
                diff_from_baseline_two_level = ca_intermediates_two_level - ca_direct
                diff_from_baseline_three_level = ca_intermediates_three_level - ca_direct
                diff_from_baseline_four_level = ca_intermediate_translations_four_level - ca_direct  

                if ca_direct != 0:
                    relative_diff_from_baseline_one_level = (count_intermediate_translations_one_level - count_intermediate_translations_one_level) / (count_intermediate_translations_one_level) * 100
                    relative_diff_from_baseline_two_level = (count_intermediate_translations_two_level - count_intermediate_translations_one_level) / (count_intermediate_translations_two_level) * 100
                    relative_diff_from_baseline_three_level = (count_intermediate_translations_three_level - count_intermediate_translations_one_level) / (count_intermediate_translations_three_level) * 100
                    relative_diff_from_baseline_four_level = (count_intermediate_translations_four_level - count_intermediate_translations_one_level) / (count_intermediate_translations_four_level) * 100
                else:
                    relative_diff_from_baseline_one_level = float("inf")
                    relative_diff_from_baseline_two_level = float("inf")
                    relative_diff_from_baseline_three_level = float("inf")
                    relative_diff_from_baseline_four_level = float("inf")

                index_tuples.append((evaluation_type, model_name, verify_status))
                data_values.append([ca_intermediates_one_level, ca_intermediates_two_level, ca_intermediates_three_level, ca_intermediate_translations_four_level, diff_from_baseline_two_level, diff_from_baseline_three_level, diff_from_baseline_four_level, relative_diff_from_baseline_two_level, relative_diff_from_baseline_three_level, relative_diff_from_baseline_four_level])

    # Create a multi-index from the tuples
    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Evaluation Dataset", "Model Name", "Verify Status"])

    # Create the dataframe
    df_multi = pd.DataFrame(data_values, index=multi_index, columns=["InterTrans 1 Intermediate (Direct) (CA)", "InterTrans 2 Intermediates (CA)", "InterTrans 3 Intermediates (CA)", "InterTrans 4 Intermediates (CA)", "Absolute Diff 2 Intermediate (CA)", "Absolute Diff 3 Intermediate (CA)", "Absolute Diff 4 Intermediate (CA)", "Relative Diff 2 Intermediates (CA)", "Relative Diff 3 Intermediates (CA)", "Relative Diff 4 Intermediates (CA)"])

    # Transpose the dataframe
    df_transposed = df_multi.transpose()

    return df_transposed

In [5]:
#Source: Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd ed).
def determine_cramer_category(df, cramer_v):
    # Define the table of Cramer's V thresholds
    cramer_table = {
        1: {'small': 0.10, 'medium': 0.30, 'large': 0.50}
    }
    
    if df not in cramer_table:
        raise ValueError(f"Degrees of freedom {df} not supported by the table")
    
    thresholds = cramer_table[df]
    
    if cramer_v <= thresholds['small']:
        return '*'
    elif cramer_v <= thresholds['medium']:
        return '**'
    elif cramer_v <= thresholds['large']:
        return '***'
    else:
        return '****'

In [6]:
def verify_significance_chisquare(df_metrics):
    # models = ['codellama_13b', 'magicoder','starcoder2']
    datasets = ['codenet', 'humanevalx', 'transcoder']
    models = ['codellama_13b', 'magicoder', 'starcoder2']
    depth = ['1','2','3','4']

    results = []

    for dataset in datasets:
        for model in models:
            p_values = []
            dfs = []

            combinations = itertools.combinations(depth, 2)

            #Bonferroni
            adj_alpha = 0.05 / 6

            for combination in combinations:
                depth1_succ = df_metrics.loc[combination[0], ([dataset], [model], ['noverify'])].item()
                depth2_succ = df_metrics.loc[combination[1], ([dataset], [model], ['noverify'])].item()

                total = df_metrics.loc['Total', ([dataset], [model], ['noverify'])].item()

                observed = np.array([[depth1_succ, total - depth1_succ], [depth2_succ, total - depth2_succ]])

                chi2, p, dof, expected = chi2_contingency(observed)

                # Calculate Cramer's V
                n = observed.sum()
                min_dim = min(observed.shape)
                cramer_v = np.sqrt(chi2 / (n * (min_dim - 1)))

                obj = {}

                obj["dataset"] = dataset
                obj["model"] = model
                obj["combination_1"] = combination[0]
                obj["combination_2"] = combination[1]
                obj["cramer_statistic"] = cramer_v
                obj["cramer_effect"] = determine_cramer_category(dof, cramer_v)
                obj["dof"] = dof
                obj["pvalue"] = p
                obj["adj_alpha"] = adj_alpha
                obj["reject"] = p < adj_alpha
                obj["combination_1_succ_count"] = depth1_succ
                obj["combination_2_succ_count"] = depth2_succ
                obj["samples"] = total

                results.append(obj)

    return pd.DataFrame(results)
           


In [7]:
data_dict = {
    'humanevalx': {
        'magicoder': {
            'noverify': df_magicoder_humanevalx_noverify,
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_humanevalx_noverify,
        },
        'starcoder2': {
            'noverify': df_starcoder2_humanevalx_noverify,
        }
    },
    'codenet': {
        'magicoder': {
            'noverify': df_magicoder_codenet_noverify,
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_codenet_noverify,
        },
        'starcoder2': {
            'noverify': df_starcoder2_codenet_noverify,
        }
    },
    'transcoder': {
        'magicoder': {
            'noverify': df_magicoder_transcoder_noverify,
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_transcoder_noverify,
        },
        'starcoder2': {
            'noverify': df_starcoder2_transcoder_noverify,
        }
    }
}

# RQ2: How would varying the maximum number of intermediate translations affect the performance of InterTrans? 

### CA including one intermediate (direct translation) up to the maximum indicated

In [8]:
metrics = get_table_metrics_per_depth(data_dict)

In [9]:
metrics

Evaluation Dataset,humanevalx,humanevalx,humanevalx,codenet,codenet,codenet,transcoder,transcoder,transcoder
Model Name,magicoder,codellama_13b,starcoder2,magicoder,codellama_13b,starcoder2,magicoder,codellama_13b,starcoder2
Verify Status,noverify,noverify,noverify,noverify,noverify,noverify,noverify,noverify,noverify
InterTrans 1 Intermediate (Direct) (CA),69.0,56.2,49.0,47.2,27.1,44.2,60.7,75.2,74.6
InterTrans 2 Intermediates (CA),89.7,79.9,68.9,72.1,44.1,67.1,77.0,87.9,90.9
InterTrans 3 Intermediates (CA),93.4,86.5,77.2,81.8,54.0,78.1,86.4,91.4,93.0
InterTrans 4 Intermediates (CA),95.4,89.7,82.5,87.3,60.8,84.4,90.8,93.2,93.8
Absolute Diff 2 Intermediate (CA),20.7,23.7,19.9,24.9,17.0,23.0,16.4,12.7,16.3
Absolute Diff 3 Intermediate (CA),24.4,30.3,28.3,34.6,26.9,33.9,25.7,16.3,18.3
Absolute Diff 4 Intermediate (CA),26.4,33.5,33.5,40.1,33.6,40.2,30.1,18.0,19.2
Relative Diff 2 Intermediates (CA),23.0,29.7,28.9,34.5,38.4,34.2,21.3,14.5,17.9
Relative Diff 3 Intermediates (CA),26.1,35.0,36.6,42.3,49.7,43.4,29.8,17.8,19.7
Relative Diff 4 Intermediates (CA),27.6,37.4,40.6,45.9,55.3,47.6,33.2,19.4,20.5
