In [51]:
import pandas as pd
from intertrans.data import read_engine_output
import grpc
import intertrans.protos_pb2_grpc as ptgrpc
import intertrans.protos_pb2 as ptpb

## RQ4: How do semantic errors propagate in INTERTRANS?

In [52]:
import random
import numpy as np

# Seed Python's `random` module so that the random.sample() call in
# sample_fail_paths draws the same 'fail' paths on every run.
random.seed(1)

# Seed NumPy's legacy global RNG as well (no NumPy random call is visible in
# this part of the notebook).
np.random.seed(1)

In [53]:
# Load the raw engine output of each model (one JSON file per model).
# The loaded objects are later indexed with ['translation_responses'] in
# get_paths_list.
df_codellama_13b_codenet_noverify = read_engine_output('../data/raw_outputs/engine/codellama_13b_codenet_results_sub_depth4.json')
df_magicoder_codenet_noverify = read_engine_output('../data/raw_outputs/engine/magicoder_codenet_results_sub_depth4.json')
df_starcoder2_codenet_noverify = read_engine_output('../data/raw_outputs/engine/starcoder2_codenet_results_sub_depth4.json')

In [54]:
# CodeNet subset stored as JSON Lines (one record per line). process_paths
# looks rows up in this frame by their `input_code` column.
codenet_dataset = pd.read_json('../../datasets/codenet_dataset_subset.jsonl', orient='records', lines=True)

In [55]:
def get_paths_list(raw_output):
    """
    Split the multi-edge translation paths of an engine run into successes and failures.

    Paths made of a single edge (direct translations) are ignored. Each
    remaining path is walked edge by edge until an edge with status
    "TRANSLATION_FOUND" is met:

    * if such an edge exists, the path is recorded as a success and the
      remaining paths of the same response are not inspected;
    * otherwise the path is recorded as a failure, but only when none of its
      edges has a status containing "SKIP".

    Parameters:
    raw_output (mapping): Object whose 'translation_responses' entry is an
        iterable of responses; each response has a 'paths' list and each path
        a 'translation_edges' list of dicts with a 'status' key.

    Returns:
    dict: {'success': [(response, path), ...],
           'fail': [(response, path), ...],
           'edges_count': number of edges visited while walking the paths}.
    """
    successful = []
    failed = []
    visited_edges = 0

    for response in raw_output['translation_responses']:
        for path in response['paths']:
            edges = path["translation_edges"]

            # Direct (single-edge) translations are out of scope.
            if len(edges) == 1:
                continue

            attempted = 0
            reached_target = False

            for edge in edges:
                visited_edges += 1
                status = edge["status"]

                if status == "TRANSLATION_FOUND":
                    # The walk stops at the first edge that found a translation.
                    attempted += 1
                    reached_target = True
                    break

                if "SKIP" not in status:
                    attempted += 1

            if reached_target:
                # One successful path is enough for this request.
                successful.append((response, path))
                break

            # Keep a failing path only when every one of its edges was attempted.
            if attempted == len(edges):
                failed.append((response, path))

    return {
        'success': successful,
        'fail': failed,
        'edges_count': visited_edges
    }

In [56]:
# Split each model's multi-edge paths into 'success' / 'fail' lists and count
# the edges visited while doing so.
codellama_paths_codenet = get_paths_list(df_codellama_13b_codenet_noverify)
magicoder_paths_codenet = get_paths_list(df_magicoder_codenet_noverify)
starcoder2_paths_codenet = get_paths_list(df_starcoder2_codenet_noverify)

In [57]:
# Total number of edges visited by get_paths_list across the three models.
print(codellama_paths_codenet['edges_count'] + magicoder_paths_codenet['edges_count'] + starcoder2_paths_codenet['edges_count']) 

283262


In [58]:
def sample_fail_paths(paths):
    """
    Draw as many 'fail' paths at random as there are 'success' paths.

    The draw uses the module-level `random` generator, so it is reproducible
    only if that generator has been seeded beforehand.

    Parameters:
    paths (dict): Dictionary with a 'fail' list and a 'success' list.

    Returns:
    list: Randomly chosen 'fail' entries, len(paths['success']) of them.

    Raises:
    AssertionError: If there are not strictly more 'fail' than 'success' paths.
    """
    fail_pool = paths['fail']
    target_size = len(paths['success'])

    # Report both sizes before checking that the pool is large enough.
    print(f"Number of 'fail' paths: {len(fail_pool)}")
    print(f"Number of 'success' paths: {target_size}")
    assert len(fail_pool) > target_size, "Number of 'fail' paths should be greater than 'success' paths."

    return random.sample(fail_pool, target_size)

In [59]:
# Balanced sample per model: every successful path plus an equally sized
# random draw of failed paths.
combined_codellama_codenet = codellama_paths_codenet['success'] + sample_fail_paths(codellama_paths_codenet)
combined_magicoder_codenet = magicoder_paths_codenet['success'] + sample_fail_paths(magicoder_paths_codenet)
combined_starcoder2_codenet = starcoder2_paths_codenet['success'] + sample_fail_paths(starcoder2_paths_codenet)

Number of 'fail' paths: 41025
Number of 'success' paths: 627
Number of 'fail' paths: 16142
Number of 'success' paths: 914
Number of 'fail' paths: 19805
Number of 'success' paths: 875


In [60]:
from tqdm import tqdm

def _lookup_test_fields(dataset_all, seed_code, seed_language, target_language, is_codenet):
    """Return the test-related fields shared by every edge of one translation request."""
    if is_codenet:
        # The lookup uses the seed code only: test cases do not depend on the
        # programming language.
        retrieved = dataset_all[(dataset_all['input_code'] == seed_code)]
        assert len(retrieved) > 0, "No dataset row matches the seed code of the request."

        # Any matching row is used (the first one); per the original author's
        # note they all carry the same test cases.
        r = retrieved.iloc[0]

        # Key order matters: it becomes the column order of the final frame.
        return {
            f"{prefix}_{number}": r.get(f"{prefix}_{number}", None)
            for prefix in ("stdin_input", "expected_output")
            for number in (1, 2, 3)
        }

    retrieved = dataset_all[
        (dataset_all.input_code.str.strip() == seed_code.strip()) &
        (dataset_all.target_lang == target_language) &
        (dataset_all.source_lang == seed_language)
    ]
    if len(retrieved) == 0:
        # Same exception type as the bare `iloc[0]` would raise, with a usable message.
        raise IndexError(
            "No dataset row matches the seed code, source language and target language of the request."
        )

    return {'test_code': retrieved.iloc[0]['test_code']}


def process_paths(paths, dataset_all, is_codenet):
    """
    Flatten (response, path) pairs into one row per translation edge.

    Parameters:
    paths (iterable): (response, path) tuples; `response` must expose
        response['translation_request'] with 'seed_language',
        'target_language', 'id' and 'seed_code', and `path` must expose
        path['translation_edges'].
    dataset_all (pd.DataFrame): Dataset in which the test material of each
        request is looked up (by `input_code`, plus `source_lang` /
        `target_lang` when `is_codenet` is false).
    is_codenet (bool): True to attach the stdin/expected-output test cases
        (`stdin_input_1..3`, `expected_output_1..3`); False to attach
        `test_code`.

    Returns:
    pd.DataFrame: One row per edge with `request_id`, `source_lang`,
        `target_lang`, `input_code`, `inference_output` and the test fields.
        `request_id` is "<path counter>-<edge position>-<path length>-<request id>".

    Raises:
    AssertionError: CodeNet mode and no dataset row matches the seed code.
    IndexError: Non-CodeNet mode and no dataset row matches the request.
    """
    all_edges = []
    # The counter is incremented before use, so the first path gets the number 2.
    id_counter = 1

    for response, path in tqdm(paths):
        id_counter += 1
        edges = path["translation_edges"]
        path_length = len(edges)

        request = response['translation_request']
        seed_language = request['seed_language']
        target_language = request['target_language']
        id_request = request['id']
        seed_code = request['seed_code']

        if path_length == 0:
            # No edge, no row: skip the dataset lookup as well.
            continue

        # The test material depends on the request only, so it is looked up
        # once per path instead of once per edge (each lookup scans the dataset).
        test_fields = _lookup_test_fields(
            dataset_all, seed_code, seed_language, target_language, is_codenet
        )

        # Prepare edges for execution
        for index, edge in enumerate(edges):
            obj = {
                "request_id": str(id_counter) + "-" + str(index + 1) + "-" + str(path_length) + "-" + id_request,
                "source_lang": edge["input_language"],
                "target_lang": edge["target_language"],
                "input_code": edge["source_code"],
                "inference_output": edge.get('inference_output', ""),
            }
            obj.update(test_fields)
            all_edges.append(obj)

    return pd.DataFrame(all_edges)


In [61]:
# Flatten the sampled paths into one row per translation edge, attaching the
# CodeNet stdin / expected-output test cases (is_codenet=True).
all_edges_codellama_codenet = process_paths(combined_codellama_codenet, codenet_dataset, True)
all_edges_magicoder_codenet = process_paths(combined_magicoder_codenet, codenet_dataset, True)
all_edges_starcoder2_codenet = process_paths(combined_starcoder2_codenet, codenet_dataset, True)

100%|██████████| 1254/1254 [00:01<00:00, 1186.31it/s]
100%|██████████| 1828/1828 [00:01<00:00, 1293.29it/s]
100%|██████████| 1750/1750 [00:01<00:00, 1270.03it/s]


In [62]:
def send_request_fuzzy(df):
    """
    Send every row of `df` to the verification service as one batch request.

    Each row becomes a VerificationRequest carrying three fuzzy test cases
    (stdin input / expected output pairs 1 to 3).

    Parameters:
    df (pd.DataFrame): One row per edge with the columns `inference_output`,
        `target_lang`, `source_lang`, `request_id`, `stdin_input_1..3` and
        `expected_output_1..3`.

    Returns:
    The response returned by `TranslationService.BatchRunVerification`.
    """
    # Build the batch once. The previous version re-created the message right
    # after setting its id, which silently discarded the id.
    batch_request = ptpb.BatchVerificationRequest()
    batch_request.id = "1"

    for _, row in df.iterrows():
        verification_request = ptpb.VerificationRequest()
        verification_request.inferenceOutput = row['inference_output']
        verification_request.targetLanguage = row['target_lang']
        verification_request.sourceLanguage = row['source_lang']
        verification_request.id = str(row['request_id'])

        # Three stdin/expected-output pairs per request, appended in order 1, 2, 3.
        for case_number in (1, 2, 3):
            fuzzy_test = ptpb.FuzzyTestCase()
            fuzzy_test.stdin_input = row[f'stdin_input_{case_number}']
            fuzzy_test.expected_output = row[f'expected_output_{case_number}']
            verification_request.test_suite.fuzzy_suite.append(fuzzy_test)

        batch_request.verification_requests.append(verification_request)

    # Raise the gRPC message size limits: the whole batch travels in one message.
    options = [
        ('grpc.max_send_message_length', 1000 * 1024 * 1024 * 2),
        ('grpc.max_receive_message_length', 1000 * 1024 * 1024 * 2)
    ]

    # Unencrypted channel to the service on localhost; closed when the block exits.
    with grpc.insecure_channel('localhost:50051', options=options) as channel:
        stub = ptgrpc.TranslationServiceStub(channel)
        response = stub.BatchRunVerification(batch_request)

    return response


In [63]:
# Run the verification of every edge, one batch per model. send_request_fuzzy
# connects to the service at localhost:50051, which must be running.
exec_responses_codellama_codenet = send_request_fuzzy(all_edges_codellama_codenet)
exec_responses_magicoder_codenet = send_request_fuzzy(all_edges_magicoder_codenet)
exec_responses_starcoder2_codenet = send_request_fuzzy(all_edges_starcoder2_codenet)

In [64]:
import pandas as pd

def parse_verification_responses(exec_responses, is_codenet):
    """
    Turn a batch verification response into a DataFrame with one row per verified request.

    Parameters:
    exec_responses: Object exposing `verification_responses`; each entry has
        `verification_request` (with `id`, `sourceLanguage`, `targetLanguage`),
        `status`, `unit_tests` and `fuzzy_tests`.
    is_codenet (bool): When true, the stdin input, expected output, actual
        output and executed code of every test are stored in
        `test_<i>_input`, `test_<i>_output`, `test_<i>_actual_output` and
        `test_<i>_executed_code` columns.

    Returns:
    pd.DataFrame: Columns `request_id`, `input_languages`, `target_languages`,
        `status`, the optional per-test columns, `number_passed`,
        `failed_execution`, `total_tests` and `failed_ids` (positions of the
        tests that did not pass).
    """
    rows = []

    for resp in exec_responses.verification_responses:
        request = resp.verification_request

        # Unit-test results take precedence; fuzzy results are the fallback
        # when the unit-test list is empty.
        tests = resp.unit_tests if resp.unit_tests else resp.fuzzy_tests

        row = {
            "request_id": request.id,
            "input_languages": request.sourceLanguage,
            "target_languages": request.targetLanguage,
            "status": resp.status,
        }

        outcomes = [bool(test.passed) for test in tests]

        if is_codenet:
            for position, test in enumerate(tests):
                row[f"test_{position}_input"] = test.stdin_input
                row[f"test_{position}_output"] = test.expected_output
                row[f"test_{position}_actual_output"] = test.actual_output
                row[f"test_{position}_executed_code"] = test.executed_code

        row["number_passed"] = sum(outcomes)
        row["failed_execution"] = resp.status == "FAILED_EXECUTION"
        row["total_tests"] = len(tests)
        row["failed_ids"] = [position for position, ok in enumerate(outcomes) if not ok]

        rows.append(row)

    return pd.DataFrame(rows)

In [65]:
# One DataFrame of parsed verification results per model; is_codenet=True
# keeps the per-test input/output/executed-code columns.
df_parsed_codellama_codenet = parse_verification_responses(exec_responses_codellama_codenet, True)
df_parsed_magicoder_codenet = parse_verification_responses(exec_responses_magicoder_codenet, True)
df_parsed_starcoder2_codenet = parse_verification_responses(exec_responses_starcoder2_codenet, True)

In [98]:
# Number of executions: total count of parsed verification responses across the three models
df_parsed_codellama_codenet.shape[0] + df_parsed_magicoder_codenet.shape[0] + df_parsed_starcoder2_codenet.shape[0]

14270

In [99]:
def any_propagating_ones(arr):
    """
    Tell whether `arr` contains at least two consecutive elements equal to 1.

    Parameters:
    arr (iterable): Sequence of flags (1 = the error is present at that step).

    Returns:
    int: 1 if some run of consecutive 1s has length greater than one, else 0.
    """
    previous_was_one = False

    for value in arr:
        current_is_one = value == 1
        # Two 1s in a row are enough: the error carried over to the next step.
        if current_is_one and previous_was_one:
            return 1
        previous_was_one = current_is_one

    return 0

In [100]:
import pandas as pd

def find_patterns_propagation(df, model_name):
    """
    Build, for every path, the per-edge failure flags of each test case.

    Rows are grouped by the path counter, i.e. the first '-'-separated field
    of `request_id`. For each path a dict is built with four lists, one entry
    per edge (in the row order of `df`):

    * keys 0, 1, 2: 1 where the test case with that position is listed in
      `failed_ids` of an edge whose execution did not fail;
    * key 3: 1 where `failed_execution` is true for the edge.

    Side effect: the columns `path_id` and `level` (first and second field of
    `request_id`) are written onto `df` itself.

    Parameters:
    df (pd.DataFrame): Rows with `request_id`, `failed_execution` and `failed_ids`.
    model_name (str): Not used by the function.

    Returns:
    tuple: (propagations, assertion_fail_distribution, propagation_examples)
        propagations: one flag dict per path, as described above.
        assertion_fail_distribution: per path, the number of flags set at each edge.
        propagation_examples: the row groups of paths without execution errors
            in which exactly one test case fails on consecutive edges.
    """
    n_test_cases = 3
    execution_error_key = 3

    df['path_id'] = df['request_id'].apply(lambda x: x.split('-')[0])
    df['level'] = df['request_id'].apply(lambda x: x.split('-')[1])

    propagation_examples = []
    propagations = []
    assertion_fail_distribution = []

    # Groups come out in groupby's default order, i.e. sorted by the string path_id.
    for _, group in df.groupby('path_id'):
        n_edges = len(group)
        error_happened = {key: [0] * n_edges for key in range(n_test_cases + 1)}

        for index, row in group.reset_index(drop=True).iterrows():
            if row['failed_execution']:
                error_happened[execution_error_key][index] = 1
            else:
                for failed in row['failed_ids']:
                    error_happened[failed][index] = 1

        # Number of flags (failed test cases or execution error) set at each edge.
        fail_dist = [
            sum(error_happened[key][edge_index] for key in error_happened)
            for edge_index in range(n_edges)
        ]

        # Count the test cases whose failure shows up on consecutive edges.
        # This must iterate over the test-case keys: the previous version
        # indexed the dict with the edge position, which skipped test cases on
        # short paths and raised KeyError on paths longer than four edges.
        assert_sequence_propagations = sum(
            any_propagating_ones(error_happened[key]) for key in range(n_test_cases)
        )

        # Keep examples of propagation that involve no execution error.
        if assert_sequence_propagations == 1 and sum(error_happened[execution_error_key]) == 0:
            propagation_examples.append(group)

        assertion_fail_distribution.append(fail_dist)
        propagations.append(error_happened)

    return propagations, assertion_fail_distribution, propagation_examples

In [101]:
# Total number of analysed paths across the three models.
# NOTE(review): errors_magicoder / errors_codellama / errors_starcoder2 are
# only assigned in the following cell, so this cell raises NameError on a
# fresh "Restart Kernel -> Run All"; it should be moved below that cell.
len(errors_magicoder) + len(errors_codellama) + len(errors_starcoder2)

4832

In [102]:
# Per model: the per-path failure flags, the per-edge failure counts and the
# example groups returned by find_patterns_propagation. The call also adds
# the `path_id` and `level` columns to each parsed frame.
errors_magicoder, fd_magicoder, examples_magicoder = find_patterns_propagation(df_parsed_magicoder_codenet, "magicoder")
errors_codellama, fd_codellama, examples_codellama = find_patterns_propagation(df_parsed_codellama_codenet, "codellama")
errors_starcoder2, fd_starcoder, examples_starcoder = find_patterns_propagation(df_parsed_starcoder2_codenet, "starcoder2")

# Merge the per-model lists (order: magicoder, codellama, starcoder2)
errors = errors_magicoder + errors_codellama + errors_starcoder2
fd = fd_magicoder + fd_codellama + fd_starcoder

In [137]:
def does_path_fail(error):
    """
    Tell whether any of the three test cases fails on the last edge of a path.

    Parameters:
    error (mapping): Per-path flags; keys 0, 1 and 2 map to non-empty
        per-edge lists of 0/1 values.

    Returns:
    bool: True if the last flag of test case 0, 1 or 2 equals 1.
    """
    return any(error[test_case][-1] == 1 for test_case in range(3))

In [138]:
def is_reappearing_error(arr):
    """
    Tell whether a failure comes back after having disappeared.

    A reappearance is a 0 immediately followed by a 1, at a position that is
    preceded by at least one earlier 1.

    Parameters:
    arr (sequence): Per-edge flags (1 = failure present, 0 = absent).

    Returns:
    int: 1 if the failure reappears at least once, else 0.
    """
    seen_failure = False

    for current, following in zip(arr, arr[1:]):
        if current == 1:
            seen_failure = True
        elif current == 0 and seen_failure and following == 1:
            # Failure was present earlier, absent here, and present again next.
            return 1

    return 0

In [139]:
def is_failure_due_to_propagation(error):
    """
    Tell whether a test case fails on both of the last two edges of a path.

    Parameters:
    error (mapping): Per-path flags; keys 0, 1 and 2 map to per-edge lists of
        0/1 values holding at least two entries each.

    Returns:
    bool: True if, for test case 0, 1 or 2, the last two flags both equal 1.
    """
    return any(
        error[test_case][-2] == 1 and error[test_case][-1] == 1
        for test_case in range(3)
    )

In [140]:
# Aggregate the propagation statistics over every path in `errors`.
# Each `error` is a dict of per-edge 0/1 lists: keys 0-2 are the three test
# cases ("assertion sequences"), key 3 flags edges whose execution failed.
all_reapearing = 0
propagates_til_end = 0
propagates_at_least_once = 0
propagates_at_least_once_evaluated = 0
total_processed_paths = 0
skipped_paths = 0
total_paths = len(errors)
assertions_skipped = 0
total_assertions = 0
processed_assertions = 0

# NOTE(review): total_appearances is never updated in the visible code.
total_appearances = 0
total_eval_propagations = 0

num_failing_paths = 0
skipped_assertions_compilationerr = 0

macro_failures_due_propagation = 0
total_appearances_evaluations = 0

for error in errors:
    # Three assertion sequences (test cases) per path.
    total_assertions += 3
    # We exclude paths where any edge has an execution failure (key 3)
    if sum(error[3]) != 0:
        skipped_paths += 1
        skipped_assertions_compilationerr += 3
        continue

    total_processed_paths += 1
    
    # Path-level statistics: does the path end in a failing test case, and if
    # so, was that test case already failing on the previous edge?
    if does_path_fail(error):
        num_failing_paths += 1
        if is_failure_due_to_propagation(error):
            macro_failures_due_propagation += 1

    for key in range(3):
        processed_assertions += 1
        # NOTE(review): the next three variables are assigned but not read in
        # the visible code.
        times_reapearing = 0
        last_observed = -1
        len_path = len(error[key])

        # We exclude sequences in which the test case never fails
        if sum(error[key]) == 0:
            assertions_skipped += 1
            continue

        # The three "*_evaluated" / "total_*" counters below are incremented
        # together, so they all count the sequences with at least one failure.
        propagates_at_least_once += any_propagating_ones(error[key])
        propagates_at_least_once_evaluated += 1

        # Failure present on both of the last two edges of the path.
        if error[key][-2] == 1 and error[key][-1] == 1:
            propagates_til_end += 1

        total_eval_propagations += 1

        total_appearances_evaluations += 1

        all_reapearing += is_reappearing_error(error[key])
        

In [146]:
import numpy as np

# Report the aggregated statistics.
# NOTE(review): the ratios are printed as fractions in [0, 1] (no
# multiplication by 100) although the labels say "Percentage". Each division
# raises ZeroDivisionError if its denominator is 0.
times_propagates_all = propagates_at_least_once / propagates_at_least_once_evaluated
print(f"Percentage of Assertions sequences that have error propagation: {times_propagates_all:.2f}")


perc_propagates_til_end = propagates_til_end / total_eval_propagations
print(f"Percentage of Assertion sequences that have error propagation and in the end: {perc_propagates_til_end:.2f}")

average_reapearing = all_reapearing / total_appearances_evaluations
print(f"Percentage of Assertion failures (individual or propagated) Reappearing After Being Fixed (At least once): {average_reapearing:.2f}")

failed_propagation_perc = macro_failures_due_propagation / num_failing_paths
print(f"Percentage of Failed Paths that Failed Due to an Assertion Error Propagation: {failed_propagation_perc:.2f}")




skipped_paths_perc = skipped_paths / total_paths
#path_balance = num_failing_paths / total_processed_paths

print(f"Percentage of Assertions Skipped Due to No Errors at All: {assertions_skipped / processed_assertions:.2f}")
# The two counts below are integers; the `.2f` format prints them with two decimals.
print(f"Total Number of Assertion Sequences: {total_assertions:.2f}")
print(f"Total Number of Processed Assertions Sequences : {processed_assertions:.2f}")
# NOTE(review): despite the "Total Number" label, this prints the fraction
# skipped_paths / total_paths.
print(f"Total Number of Skipped Paths due to Compilation Error: {skipped_paths_perc:.2f}")
#print(f"Proportion failing paths: {path_balance}")

Percentage of Assertions sequences that have error propagation: 0.31
Percentage of Assertion sequences that have error propagation and in the end: 0.27
Percentage of Assertion failures (individual or propagated) Reappearing After Being Fixed (At least once): 0.04
Percentage of Failed Paths that Failed Due to an Assertion Error Propagation: 0.48
Percentage of Assertions Skipped Due to No Errors at All: 0.76
Total Number of Assertion Sequences: 14496.00
Total Number of Processed Assertions Sequences : 4770.00
Total Number of Skipped Paths due to Compilation Error: 0.67


In [147]:
# Number of assertion sequences with at least one failure, i.e. the
# denominator used for the reappearing-error ratio.
total_appearances_evaluations

1140