In [1]:
import pandas as pd
import pickle
import os
import hashlib
from collections import defaultdict
from tqdm import tqdm
import xxhash
import re

In [2]:
def re_encode(df):
    """Return a copy of ``df`` whose ``input_code`` column has been re-decoded.

    Every value of ``input_code`` is cast to ``str``, encoded as Latin-1 and
    decoded again as UTF-8, which undoes text that was UTF-8 but had been read
    as Latin-1 (e.g. ``'cafÃ©'`` becomes ``'café'``).

    The input frame is left untouched; all changes are made on the copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``input_code`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the repaired ``input_code`` column.

    Raises
    ------
    UnicodeEncodeError, UnicodeDecodeError
        If a value cannot be encoded as Latin-1 or the resulting bytes are not
        valid UTF-8.
    """
    # Copy first so the caller's frame is never modified as a side effect.
    fixed = df.copy()
    fixed['input_code'] = (
        fixed['input_code']
        .astype(str)
        .apply(lambda text: text.encode('latin1').decode('utf-8'))
    )
    return fixed

In [3]:
# Load the three RQ3 inference outputs (vre / cre / vde) and repair the
# encoding of their `input_code` column.
df_vre, df_cre, df_vde = (
    re_encode(pd.read_csv(csv_path))
    for csv_path in (
        'forge24-code-translation/RQ3/inference_output_11_models_vre.csv',
        'forge24-code-translation/RQ3/inference_output_11_models_cre.csv',
        'forge24-code-translation/RQ3/inference_output_11_models_vde.csv',
    )
)

In [26]:
df_vre.shape

(42020, 14)

In [4]:
def hash_string(input_string):
    """Return the xxHash64 hex digest of ``input_string``.

    Leading and trailing whitespace is stripped and the remainder is encoded
    as UTF-8 before hashing.
    """
    payload = input_string.strip().encode('utf-8')
    return xxhash.xxh64(payload).hexdigest()

In [5]:
def hash_file(file_path):
    """Read ``file_path`` as text and return the hash of its stripped content."""
    with open(file_path, "r") as source:
        contents = source.read()
    return hash_string(contents)
        

def explore_and_hash(root_dir):
    """Walk ``root_dir`` and index its source files by content hash.

    Only files ending in .c, .java, .cpp, .py or .go are hashed. The result
    maps each hash to the list of file paths that share that content.
    """
    source_suffixes = ('.c', '.java', '.cpp', '.py', '.go')
    paths_by_hash = defaultdict(list)

    # A first pass counts every file (hashed or not) to size the progress bar.
    file_count = 0
    for _, _, names in os.walk(root_dir):
        file_count += len(names)

    with tqdm(total=file_count, desc="Hashing files") as progress:
        for folder, _, names in os.walk(root_dir):
            for name in names:
                if name.endswith(source_suffixes):
                    full_path = os.path.join(folder, name)
                    paths_by_hash[hash_file(full_path)].append(full_path)
                # The bar advances for skipped files too, matching the count above.
                progress.update(1)

    return paths_by_hash

In [6]:
def save_to_pickle(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``."""
    with open(filename, 'wb') as sink:
        sink.write(pickle.dumps(data))

In [7]:
def read_from_pickle(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [8]:
# Load the cached {content hash -> [file paths]} index. When the cache file is
# missing, rebuild it from the dataset and save it, so the notebook also runs
# from a fresh kernel without a pre-existing pickle.
pickle_file = "file_hashes_strip.pkl"
if os.path.exists(pickle_file):
    hashes_dict = read_from_pickle(pickle_file)
else:
    # NOTE(review): the data root is inferred from the paths opened in get_gt
    # ('./Project_CodeNet/data/<problem>/<lang>/...') -- confirm before relying on it.
    # Hashing the whole dataset can take a long time.
    hashes_dict = explore_and_hash('./Project_CodeNet/data')
    save_to_pickle(hashes_dict, pickle_file)

In [9]:
def extract_problem_and_extension(path):
    """Extract the problem id and the file extension from a source-file path.

    The problem id is the first path component that consists of ``p`` followed
    by digits only (e.g. ``p02817``); the extension is the text after the last
    dot. Both ``/`` and ``\\`` are accepted as separators.

    Parameters
    ----------
    path : str
        Path such as ``./Project_CodeNet/data/p02817/Python/s123.py``.

    Returns
    -------
    tuple[str, str]
        ``(problem_id, extension)``, e.g. ``('p02817', 'py')``.

    Raises
    ------
    ValueError
        If the path has no problem directory or no file extension.
    """
    # Capture the id from the same directory component that anchors the match.
    # A separate unanchored search for 'p<digits>' would pick up unrelated
    # substrings earlier in the path (e.g. the 'p2' in 'exp2/...').
    match = re.search(r'[\\/](p\d+)[\\/].*\.(\w+)$', path)
    if match is None:
        raise ValueError(
            f'Cannot extract a problem id and file extension from path: {path!r}'
        )
    return match.group(1), match.group(2)

In [12]:
def match_problems(df, hashes_dict):
    """Attach a content hash and the matching problem id to every row of ``df``.

    For each row the ``input_code`` text is hashed with ``hash_string`` and
    looked up in ``hashes_dict``. On a hit, the problem id is taken from the
    first file path stored for that hash.

    Two columns are written to ``df`` in place (the same frame is returned):

    * ``input_code_hash`` -- hex digest of the stripped code;
    * ``problem_id`` -- matched problem id, or a missing value when the hash
      is unknown.

    Both columns are always created, even when ``df`` is empty or nothing
    matches, so later checks on ``problem_id`` never hit a missing column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``input_code`` column of strings.
    hashes_dict : Mapping[str, list[str]]
        Content hash -> list of file paths with that content.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the two columns added.
    """
    code_hashes = []
    problem_ids = []

    for prog in tqdm(df['input_code'], total=df.shape[0]):
        # hash_string already strips surrounding whitespace before hashing.
        hash_val = hash_string(prog)
        code_hashes.append(str(hash_val))

        # .get() does not insert keys, even when hashes_dict is a defaultdict.
        paths = hashes_dict.get(hash_val)
        if paths:
            problem_id, _ = extract_problem_and_extension(paths[0])
            problem_ids.append(problem_id)
        else:
            problem_ids.append(None)

    # Assign whole columns at once instead of one df.at write per row.
    df['input_code_hash'] = code_hashes
    df['problem_id'] = pd.Series(problem_ids, index=df.index, dtype=object)
    return df

In [13]:
# Annotate each inference frame with its content hash and problem id
# (processed in the order vre, vde, cre).
df_vre_matched, df_vde_matched, df_cre_matched = (
    match_problems(frame, hashes_dict) for frame in (df_vre, df_vde, df_cre)
)

100%|██████████| 42020/42020 [00:03<00:00, 11753.30it/s]
100%|██████████| 42020/42020 [00:03<00:00, 11960.71it/s]
100%|██████████| 42020/42020 [00:03<00:00, 12429.00it/s]


In [14]:
def assert_all_problems_found(df):
    """Raise AssertionError if any row of ``df`` has a missing ``problem_id``."""
    unmatched_rows = df.loc[df.problem_id.isna()]
    assert len(unmatched_rows) == 0

In [15]:
# Every row of every frame must have been matched to a problem.
for matched_df in (df_vre_matched, df_vde_matched, df_cre_matched):
    assert_all_problems_found(matched_df)

In [16]:
# Source-file extension for each language, keyed by the language name.
extension_map = dict([
    ('Python', 'py'),
    ('Java', 'java'),
    ('C++', 'cpp'),
    ('Go', 'go'),
    ('C', 'c'),
])

# Languages to collect ground truth for, in the same order as extension_map.
languages_codenet = list(extension_map)

In [17]:
def get_gt(df, codenet_root='./Project_CodeNet', max_code_size=None):
    """Collect one accepted reference solution per language for each problem.

    For every unique ``problem_id`` in ``df`` the problem's metadata CSV is
    read and restricted to rows whose ``status`` is ``'Accepted'``. For each
    language in ``languages_codenet`` one accepted submission is sampled
    (``random_state=1``, so the choice is reproducible) and its source file is
    read. A problem is kept only if every language has at least one accepted
    submission.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``problem_id`` column.
    codenet_root : str, optional
        Directory containing the ``metadata/`` and ``data/`` folders.
    max_code_size : int or None, optional
        When given, only submissions whose metadata ``code_size`` is strictly
        below this value are candidates. ``None`` (default) applies no limit.

    Returns
    -------
    pandas.DataFrame
        One row per kept problem with columns ``problem_id`` and
        ``<language>_gt`` for each language. The columns are present even when
        no problem qualifies, so the result can always be merged on
        ``problem_id``.
    """
    gt_columns = ['problem_id'] + [f'{lang}_gt' for lang in languages_codenet]
    entries = []

    for problemid in tqdm(df.problem_id.unique()):
        # Read the metadata for this problem and keep accepted submissions only.
        metadata = pd.read_csv(f'{codenet_root}/metadata/{problemid}.csv')
        accepted = metadata[metadata.status == 'Accepted']
        if accepted.empty:
            continue

        obj = {'problem_id': problemid}

        for lang in languages_codenet:
            dflang = accepted[accepted.language == lang]
            if max_code_size is not None:
                dflang = dflang[dflang.code_size < max_code_size]

            if dflang.empty:
                # No parallel data in this language: drop the whole problem.
                break

            # Sample one solution among all accepted ones for this language.
            submission_id = dflang.sample(n=1, random_state=1).iloc[0]['submission_id']
            source_path = (
                f'{codenet_root}/data/{problemid}/{lang}/'
                f'{submission_id}.{extension_map[lang]}'
            )
            with open(source_path) as f:
                obj[f'{lang}_gt'] = f.read()
        else:
            # Reached only when no language was missing (loop did not break).
            entries.append(obj)

    return pd.DataFrame(entries, columns=gt_columns)

In [18]:
# Build the ground-truth table for each matched frame (order: vre, cre, vde).
gt_vre, gt_cre, gt_vde = (
    get_gt(frame) for frame in (df_vre_matched, df_cre_matched, df_vde_matched)
)

100%|██████████| 586/586 [00:12<00:00, 47.81it/s]
100%|██████████| 586/586 [00:12<00:00, 47.88it/s]
100%|██████████| 586/586 [00:12<00:00, 47.98it/s]


In [23]:
# Number of distinct problems that have ground truth in every language.
gt_vre['problem_id'].nunique(dropna=False)

546

In [24]:
# Number of distinct problems present in the matched inference data.
df_vre_matched['problem_id'].nunique(dropna=False)

586

In [19]:
# Share (in %) of the matched problems that have ground truth in all languages.
vre_perc, vde_perc, cre_perc = (
    gt['problem_id'].nunique(dropna=False)
    / matched['problem_id'].nunique(dropna=False)
    * 100
    for gt, matched in (
        (gt_vre, df_vre_matched),
        (gt_vde, df_vde_matched),
        (gt_cre, df_cre_matched),
    )
)

In [20]:
#Verify percentage of problems that have paralell data in all PLs
print(vre_perc)
print(vde_perc)
print(cre_perc)

93.1740614334471
93.1740614334471
93.1740614334471


In [28]:
gt_vre

Unnamed: 0,problem_id,Python_gt,Java_gt,C++_gt,Go_gt,C_gt
0,p02817,str = input()\nstr = str.split()\nstr = str[1]...,import java.util.Scanner;\n\npublic class Main...,#include<stdio.h>\n\nint main (){\n\tchar x[10...,// Package main provides ...\npackage main\n\n...,#include <stdio.h>\n#include <string.h>\nint m...
1,p03635,"A,B=map(int, input().split())\nprint((A-1)*(B-1))",import java.util.*;\nimport java.lang.*;\n\npu...,#include<bits/stdc++.h>\nusing namespace std;\...,"package main\n\nimport ""fmt""\n\nfunc main(){\n...",#include <stdio.h>\n#include <stdlib.h>\n#incl...
2,p03284,"n, k = map(int, input().split())\nif k == 1:\n...",import java.util.Scanner;\n\npublic class Main...,#include <iostream>\n#include <vector>\n#inclu...,"package main\n\nimport (\n\t""fmt""\n)\n\nfunc m...",#include <stdio.h>\n\n\nint main(){\n int n...
3,p03023,n = int(input())\nprint(180*(n-2)),import java.util.Scanner;\nclass Main{\npublic...,#include <iostream>\n#include <algorithm>\n#in...,"package main\n\nimport (\n ""fmt""\n)\n\nfunc m...",#include <stdio.h>\nint main(void){\n int N...
4,p02957,"def main():\n a, b = map(int, input().split...",import java.util.*;\n\nclass Main {\n publi...,#include<bits/stdc++.h> //ABC 135_a \nusing na...,"package main\n\nimport (\n ""fmt""\n)\n\nfunc...","#include <stdio.h>\n\nint main()\n{\n int i,t..."
...,...,...,...,...,...,...
541,p03386,"a,b,k = map(int, input().split())\nfor i in ra...",import java.util.*;\nimport java.io.*;\nimport...,#include<bits/stdc++.h>\nusing namespace std;\...,// Package main provides\n//\n// File: b.go\n...,#include <stdio.h>\n\nint main(void){\n long ...
542,p03148,"#!/usr/bin/env python3\nN, K = map(int, input(...",import java.io.BufferedReader;\nimport java.io...,#include <bits/stdc++.h>\nusing namespace std;...,"package main\n\nimport (\n\t""bufio""\n\t""fmt""\n...",#include <stdio.h>\n#include <string.h>\n#incl...
543,p03567,"s = input()\nif ""AC"" in s:\n print(""Yes"")\nel...",import java.util.Scanner;\n\npublic class Main...,#include <iostream>\n#include <string>\nusing ...,"package main\n\nimport (\n\t""bufio""\n\t""fmt""\n...",#include <stdio.h>\n#include <string.h>\n#incl...
544,p03096,N = int(input())\nC = [int(input()) for _ in r...,import java.io.BufferedReader;\nimport java.io...,#include <iostream>\n#include <string>\n#inclu...,"package main\n\nimport (\n\t""bufio""\n\t""fmt""\n...",#include<stdio.h>\n#define MOD 1000000007\n#de...


In [21]:
# Inner-join each matched frame with its ground truth; rows whose problem has
# no ground truth in every language are dropped by the inner join.
df_vre_extended = df_vre_matched.merge(gt_vre, how='inner', on='problem_id')
df_vde_extended = df_vde_matched.merge(gt_vde, how='inner', on='problem_id')
df_cre_extended = df_cre_matched.merge(gt_cre, how='inner', on='problem_id')

In [29]:
df_vre_extended.shape

(40227, 19)

In [31]:
df_vde_extended.shape

(40227, 19)

In [32]:
df_cre_extended.shape

(40227, 19)

In [30]:
# Row count of the extended frame divided by 11, the number of models named in
# the input file names. Computed from the frame rather than a hand-copied row
# count so it stays correct when the data changes.
# NOTE(review): presumably every model contributes the same number of rows -- confirm.
df_vre_extended.shape[0] / 11

3657.0

In [22]:
# Write each extended frame as JSON Lines (one record per line) for RQ4.
for extended_df, out_path in (
    (df_vre_extended, 'forge24-code-translation/RQ4/inference_output_11_models_vre_extended.json'),
    (df_vde_extended, 'forge24-code-translation/RQ4/inference_output_11_models_vde_extended.json'),
    (df_cre_extended, 'forge24-code-translation/RQ4/inference_output_11_models_cre_extended.json'),
):
    extended_df.to_json(out_path, orient='records', lines=True)