In [24]:
import pandas as pd
from intertrans.data import read_engine_output
import numpy as np

In [None]:
# Engine outputs for the TransCoder benchmark, one result file per model
# (CodeLlama-13B, Magicoder, StarCoder2). File names carry an "all_depth4" tag.
# NOTE(review): the `_noverify` suffix presumably marks runs made without the
# verification step -- confirm against the engine configuration.
df_codellama_13b_transcoder_noverify = read_engine_output('../data/raw_outputs/engine/codellama_13b_transcoder_results_all_depth4.json')
df_magicoder_transcoder_noverify = read_engine_output('../data/raw_outputs/engine/magicoder_transcoder_results_all_depth4.json')
df_starcoder2_transcoder_noverify = read_engine_output('../data/raw_outputs/engine/starcoder2_transcoder_results_all_depth4.json')

In [None]:
# Engine outputs for the HumanEval-X benchmark, one result file per model.
# File names carry a "sub_depth4" tag (the TransCoder files use "all" instead).
# NOTE(review): "sub" presumably means a sampled subset of the benchmark -- confirm.
df_codellama_13b_humanevalx_noverify = read_engine_output('../data/raw_outputs/engine/codellama_13b_humanevalx_results_sub_depth4.json')
df_magicoder_humanevalx_noverify = read_engine_output('../data/raw_outputs/engine/magicoder_humanevalx_results_sub_depth4.json')
df_starcoder2_humanevalx_noverify = read_engine_output('../data/raw_outputs/engine/starcoder2_humanevalx_results_sub_depth4.json')

In [None]:
# Engine outputs for the CodeNet benchmark, one result file per model.
# File names carry a "sub_depth4" tag, like the HumanEval-X files.
# NOTE(review): "sub" presumably means a sampled subset of the benchmark -- confirm.
df_codellama_13b_codenet_noverify = read_engine_output('../data/raw_outputs/engine/codellama_13b_codenet_results_sub_depth4.json')
df_magicoder_codenet_noverify = read_engine_output('../data/raw_outputs/engine/magicoder_codenet_results_sub_depth4.json')
df_starcoder2_codenet_noverify = read_engine_output('../data/raw_outputs/engine/starcoder2_codenet_results_sub_depth4.json')

In [None]:
# Nested lookup of every loaded engine output, keyed as
#   benchmark name -> model name -> run setting ('noverify' is the only one here).
# The helper functions below walk all three levels and read the
# 'translation_responses' entry of each innermost value.
data_dict = {
    'humanevalx': {
        'magicoder': {
            'noverify': df_magicoder_humanevalx_noverify
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_humanevalx_noverify
        },
        'starcoder2': {
            'noverify': df_starcoder2_humanevalx_noverify
        }
    },
    'codenet': {
        'magicoder': {
            'noverify': df_magicoder_codenet_noverify
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_codenet_noverify
        },
        'starcoder2': {
            'noverify': df_starcoder2_codenet_noverify
        }
    },
    'transcoder': {
        'magicoder': {
            'noverify': df_magicoder_transcoder_noverify,
        },
        'codellama_13b': {
            'noverify': df_codellama_13b_transcoder_noverify,
        },
        'starcoder2': {
            'noverify': df_starcoder2_transcoder_noverify,
        }
    }
}

In [None]:
def get_count_k(data_dict):
    """Return the 1-based position of the first successful path of each response.

    Args:
        data_dict: Three-level nested mapping (in this notebook:
            benchmark -> model -> run setting). Every innermost value must
            support ``value['translation_responses']`` and yield responses that
            each hold a ``'paths'`` sequence; every path holds a
            ``'translation_edges'`` sequence of edges with a ``'status'`` entry.

    Returns:
        A flat list with one integer per response in which at least one path
        contains an edge whose status is ``"TRANSLATION_FOUND"``. The integer
        is the 1-based index of the first such path. Responses without any
        successful path contribute nothing.
    """
    first_success_ranks = []

    for per_model in data_dict.values():
        for per_setting in per_model.values():
            for raw_output in per_setting.values():
                for response in raw_output['translation_responses']:
                    # There is one candidate per path, so the 1-based path
                    # position equals the number of candidates looked at.
                    for rank, path in enumerate(response['paths'], start=1):
                        path_succeeded = any(
                            edge["status"] == "TRANSLATION_FOUND"
                            for edge in path["translation_edges"]
                        )
                        if path_succeeded:
                            first_success_ranks.append(rank)
                            break

    return first_success_ranks

In [None]:
# One entry per response that has a successful path, pooled over every
# benchmark / model / run setting in data_dict.
found_ks = get_count_k(data_dict)

In [None]:
# Number of responses for which at least one path reached TRANSLATION_FOUND.
len(found_ks)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Descriptive statistics of the Found@K sample: extremes, spread and percentiles.
min_value, max_value = np.min(found_ks), np.max(found_ks)
mean_value, std_dev = np.mean(found_ks), np.std(found_ks)
# NOTE: despite its name, `quartiles` also holds the 99th and 99.9th percentiles.
quartiles = np.percentile(found_ks, [25, 50, 75, 99, 99.9])

print("\n".join([
    "Statistics:",
    f"Min: {min_value}",
    f"Max: {max_value}",
    f"Mean: {mean_value:.2f}",
    f"Standard Deviation: {std_dev:.2f}",
    f"25th Percentile: {quartiles[0]}",
    f"Median (50th Percentile): {quartiles[1]}",
    f"75th Percentile: {quartiles[2]}",
    f"99th Percentile: {quartiles[3]}",
    f"99.9th Percentile: {quartiles[4]}",
]))

# Tally how often each distinct Found@K value occurs.
unique, frequency = np.unique(found_ks, return_counts=True)

# Draw one bar per distinct value on the current axes and label the figure.
ax = plt.gca()
ax.bar(unique, frequency, color='blue')
ax.set_xlabel('Found@K')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Number of Candidates Evaluated For Successful Translations')

# Render the figure.
plt.show()

In [None]:
# Largest number of candidates that had to be evaluated for any successful
# translation. Stored as `max_k`: this value used to be assigned to `mean_k`,
# which mislabelled the maximum as a mean until the next cell overwrote it.
max_k = np.max(found_ks)

In [None]:
# Average 1-based position of the first successful path over all successful responses.
mean_k = np.mean(found_ks)

In [None]:
# get_count_k starts counting at 1, so this mean includes the successful
# candidate itself, not only the candidates tried before it.
print(f"The mean number of candidates evaluated before finding a translation in InterTrans is: {mean_k:.1f}")

In [None]:
def get_count_inferences(data_dict):
    """Count the inferences spent on each response up to its first success.

    For every response the paths are walked in order and a running total is
    kept across paths. An edge adds one to the total unless its index is listed
    in the path's ``'edge_index_memoized'`` or its status contains
    ``"SKIPPED"``. The walk stops at the first counted edge whose status is
    ``"TRANSLATION_FOUND"``, and the running total is recorded.

    Args:
        data_dict: Three-level nested mapping (in this notebook:
            benchmark -> model -> run setting) whose innermost values support
            ``value['translation_responses']``.

    Returns:
        A flat list with one running total per successful response. Responses
        that never reach a counted ``"TRANSLATION_FOUND"`` edge are left out.

    NOTE(review): a ``"TRANSLATION_FOUND"`` edge that is memoized is not
    treated as a success here, unlike in ``get_count_k`` -- confirm that the
    engine never emits such an edge.
    """
    totals_at_success = []

    for per_model in data_dict.values():
        for per_setting in per_model.values():
            for raw_output in per_setting.values():
                for response in raw_output['translation_responses']:
                    # Running total for this response; it is NOT reset between paths.
                    performed = 0

                    for path in response['paths']:
                        succeeded = False

                        for position, edge in enumerate(path["translation_edges"]):
                            # Memoized or skipped edges cost no inference.
                            if position in path['edge_index_memoized'] or "SKIPPED" in edge['status']:
                                continue

                            performed += 1

                            if edge["status"] == "TRANSLATION_FOUND":
                                succeeded = True
                                break

                        # Only successful responses are recorded.
                        if succeeded:
                            totals_at_success.append(performed)
                            break

    return totals_at_success

In [None]:
# One cumulative inference count per successful response, pooled over every
# benchmark / model / run setting in data_dict.
found_inference = get_count_inferences(data_dict)

In [None]:
import seaborn as sns

# Descriptive statistics of the inference counts (successful responses only).
min_value, max_value = np.min(found_inference), np.max(found_inference)
mean_value, std_dev = np.mean(found_inference), np.std(found_inference)
percentiles = [25, 50, 75, 90, 95, 99, 99.9]
# NOTE: despite its name, `quartiles` holds one value per entry of `percentiles`.
quartiles = np.percentile(found_inference, percentiles)

print("\n".join([
    "Statistics:",
    f"Min: {min_value}",
    f"Max: {max_value}",
    f"Mean: {mean_value:.2f}",
    f"Standard Deviation: {std_dev:.2f}",
    f"25th Percentile: {quartiles[0]}",
    f"Median (50th Percentile): {quartiles[1]}",
    f"75th Percentile: {quartiles[2]}",
    f"90th Percentile: {quartiles[3]}",
    f"95th Percentile: {quartiles[4]}",
    f"99th Percentile: {quartiles[5]}",
    f"99.9th Percentile: {quartiles[6]}",
]))

# histplot returns the Axes it drew on; reuse it for the annotations below.
ax = sns.histplot(found_inference, bins=30, kde=False, color='blue')

# Mark every percentile with a dashed red line and a rotated label near the top.
for percentile, value in zip(percentiles, quartiles):
    ax.axvline(x=value, color='red', linestyle='--', linewidth=1)
    ax.text(value, ax.get_ylim()[1] * 0.9, f'{percentile}%', rotation=90, verticalalignment='center', color='red')

# Axis labels and title.
ax.set_xlabel('Total Inferences')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Total Number of Inferences Done For Successful Translations (Not all inferences)')

# Render the figure.
plt.show()
