# Process the Datasets
This notebook takes the HumanEval-X, CodeNet, and curated TransCoder (UniTrans) datasets as input and creates the translation-pair subsets that are used throughout the paper.

In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Working directory of the kernel; the HumanEval-X input paths below are resolved against it
cwd = os.getcwd()

In [11]:
# Languages that take part in the translation pairs of each benchmark
# (both benchmarks currently use the same six, kept as two independent lists).
languages_humanevalx = ['Python', 'Java', 'C++', 'Go', 'Rust', 'JavaScript']
languages_codenet = list(languages_humanevalx)

# Source-file extension for every language name.
extension_map = {
    'Python': 'py', 'Java': 'java', 'C++': 'cpp', 'Go': 'go',
    'JavaScript': 'js', 'Rust': 'rs', 'C#': 'cs',
}

## HumanEval-X Subset

In [12]:
def _read_humanevalx(folder, lang):
    """Read one HumanEval-X JSON-lines file and tag every row with its language.

    `folder` is the per-language directory/file suffix used by the dataset
    (e.g. 'rust', 'js'); `lang` is the value stored in the new 'lang' column.
    """
    path = os.path.join(cwd, f"datasets/codegeex/humaneval-x/{folder}/data/humaneval_{folder}.jsonl")
    frame = pd.read_json(path, orient='records', lines=True)
    frame['lang'] = lang
    return frame


rust_df = _read_humanevalx('rust', 'Rust')
js_df = _read_humanevalx('js', 'JavaScript')
cpp_df = _read_humanevalx('cpp', 'C++')
go_df = _read_humanevalx('go', 'Go')
java_df = _read_humanevalx('java', 'Java')
pt_df = _read_humanevalx('python', 'Python')

In [13]:
# Stack the six per-language frames, then derive the language-independent
# problem number from task_id (the part after the slash, e.g. 'Rust/0' -> '0').
humanevalx = pd.concat([rust_df, js_df, cpp_df, go_df, java_df, pt_df])
humanevalx['id'] = humanevalx['task_id'].map(lambda task: task.split('/')[1])

In [14]:
humanevalx.head()

Unnamed: 0,task_id,prompt,declaration,canonical_solution,test,example_test,lang,import,docstring,test_setup,text,id
0,Rust/0,"\n/*\n Check if in given list of numbers, are ...","\nuse std::{slice::Iter, cmp::{max, self}, mem...",\n for i in 0..numbers.len(){\n for ...,\n#[cfg(test)]\nmod tests {\n use super::*;...,,Rust,,,,,0
1,Rust/1,\n/*\n Input to this function is a string cont...,"\nuse std::{slice::Iter, cmp::{max, self}, mem...",\n let mut result:Vec<String> = vec![];\n ...,\n#[cfg(test)]\nmod tests {\n use super::*;...,,Rust,,,,,1
2,Rust/2,"\n/*\n Given a positive floating point number,...","\nuse std::{slice::Iter, cmp::{max, self}, mem...",\n return number % 1.0;\n}\n,\n#[cfg(test)]\nmod tests {\n use super::*;...,,Rust,,,,,2
3,Rust/3,\n/*\n You're given a list of deposit and with...,"\nuse std::{slice::Iter, cmp::{max, self}, mem...",\n\nlet mut balance:i32 = 0;\nfor op in operat...,\n#[cfg(test)]\nmod tests {\n use super::*;...,,Rust,,,,,3
4,Rust/4,"\n/*\n For a given list of input numbers, calc...","\nuse std::{slice::Iter, cmp::{max, self}, mem...","\n let mean:f32 = numbers.iter().fold(0.0,|...",\n#[cfg(test)]\nmod tests {\n use super::*;...,,Rust,,,,,4


In [15]:
len(humanevalx.groupby('task_id'))

984

In [16]:
translation_pairs = []

# One record per problem and per ordered (source, target) pair of distinct languages.
for task_id, group in humanevalx.groupby('id'):
    lang_lower = group['lang'].str.lower()

    for source in languages_humanevalx:
        for target in languages_humanevalx:
            if source == target:
                continue

            # First row of this problem written in the source / target language
            input_row = group[lang_lower == source.lower()].iloc[0]
            target_row = group[lang_lower == target.lower()].iloc[0]

            translation_pairs.append({
                'id': task_id,
                'source_lang': source,
                'input_code': input_row['declaration'] + input_row['canonical_solution'],
                'target_lang': target,
                'ground_truth': target_row['declaration'] + target_row['canonical_solution'],
                'target_signature': target_row['declaration'],
                'test_code': target_row['test'],
            })

translation_df = pd.DataFrame(translation_pairs)

In [17]:
translation_df.shape

(4920, 7)

In [18]:
translation_df.head()

Unnamed: 0,id,source_lang,input_code,target_lang,ground_truth,target_signature,test_code
0,0,Python,from typing import List\n\n\ndef has_close_ele...,Java,import java.util.*;\nimport java.lang.*;\n\ncl...,import java.util.*;\nimport java.lang.*;\n\ncl...,public class Main {\n public static void ma...
1,0,Python,from typing import List\n\n\ndef has_close_ele...,C++,#include<stdio.h>\n#include<vector>\n#include<...,#include<stdio.h>\n#include<vector>\n#include<...,#undef NDEBUG\n#include<assert.h>\nint main(){...
2,0,Python,from typing import List\n\n\ndef has_close_ele...,Go,"\nfunc HasCloseElements(numbers []float64, thr...","\nfunc HasCloseElements(numbers []float64, thr...",func TestHasCloseElements(t *testing.T) {\n ...
3,0,Python,from typing import List\n\n\ndef has_close_ele...,Rust,"\nuse std::{slice::Iter, cmp::{max, self}, mem...","\nuse std::{slice::Iter, cmp::{max, self}, mem...",\n#[cfg(test)]\nmod tests {\n use super::*;...
4,0,Python,from typing import List\n\n\ndef has_close_ele...,JavaScript,"\nconst hasCloseElements = (numbers, threshold...","\nconst hasCloseElements = (numbers, threshold...",const testHasCloseElements = () => {\n consol...


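#### 888 is the minimum sample size for a population of 4920 translation pairs at a 99.9% confidence level and a 5% margin of error; sampling 35 pairs for each of the 30 (source, target) language combinations gives 1050 pairs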

In [19]:
# Stratify by (source, target) language combination and draw 35 pairs from each
# stratum; every draw uses the same fixed seed so the subset is reproducible.
stratified = translation_df.groupby(['source_lang', 'target_lang'])
dfs = [pairs.sample(n=35, random_state=1) for _, pairs in stratified]

humaneval_x_pairs_subset = pd.concat(dfs).reset_index(drop=True)

In [20]:
humaneval_x_pairs_subset.shape

(1050, 7)

In [21]:
# Persist the stratified sample as JSON lines (one translation pair per line)
humaneval_x_pairs_subset.to_json('./datasets/humanevalx_dataset_subset.jsonl', orient='records', lines=True)

In [22]:
# Persist every HumanEval-X translation pair as JSON lines (one pair per line)
translation_df.to_json('./datasets/humanevalx_dataset_all.jsonl', orient='records', lines=True)

## CodeNet

In [None]:
# Directory holding the CodeNet problem description files (parsed as HTML below)
problem_descriptions_path = './datasets/Project_CodeNet/problem_descriptions'

In [None]:
# Set the seed for reproducibility (`os` and `random` come from the import cell
# at the top of the notebook). Nothing in this cell draws from the RNG.
seed = 1
random.seed(seed)

# Get a list of all regular files in the directory. os.listdir() returns names
# in arbitrary order, so they are sorted to make the processing order the same
# on every machine and run.
all_files = [
    os.path.join(problem_descriptions_path, entry)
    for entry in sorted(os.listdir(problem_descriptions_path))
    if os.path.isfile(os.path.join(problem_descriptions_path, entry))
]

print(len(all_files))

# Every problem description is processed; no sub-sampling is applied here.
selected_files = all_files

In [None]:
def _sample_block_text(soup, heading):
    """Return the stripped text of the first <pre> after the <h3> titled `heading`, or None if either is absent."""
    heading_tag = soup.find('h3', string=heading)
    if heading_tag is None:
        return None
    pre_tag = heading_tag.find_next('pre')
    if pre_tag is None:
        # A heading without a following <pre> is treated like a missing sample
        # instead of raising AttributeError on None.
        return None
    return pre_tag.get_text(strip=True)


# Function to extract the sample input/output tests from HTML content
def extract_input_output(file_path):
    """Extract the first three sample input/output pairs from a problem description.

    The file is read as UTF-8 and parsed as HTML. For N in 1..3 the text of the
    <pre> element following the headings 'Sample Input N' and 'Sample Output N'
    is collected, with surrounding whitespace stripped.

    Args:
        file_path: path of the problem description file.

    Returns:
        A 6-tuple (input_1, output_1, input_2, output_2, input_3, output_3);
        an element is None when its heading (or the <pre> after it) is missing.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    values = []
    for sample_number in (1, 2, 3):
        values.append(_sample_block_text(soup, f'Sample Input {sample_number}'))
        values.append(_sample_block_text(soup, f'Sample Output {sample_number}'))
    return tuple(values)

# Dictionary mapping a problem id (file name up to the first dot) to its
# (input_1, output_1, input_2, output_2, input_3, output_3) tuple
input_output_dict = {}

# Process each selected file
for file_path in selected_files:
    samples = extract_input_output(file_path)
    # Keep only problems for which all three sample input/output pairs were found
    if all(value is not None for value in samples):
        filename = os.path.basename(file_path).split('.')[0]
        input_output_dict[filename] = samples

In [None]:
#Number of problems with three input-output fuzzy tests
len(input_output_dict)

In [None]:
#CodeNet programs read their input from stdin, so the data must end with a newline for the program's reader to receive it
def add_newline_if_missing(s):
    """Return `s` unchanged when it already ends with a newline, otherwise `s` with one appended."""
    return s if s.endswith('\n') else s + '\n'

In [None]:
entries = []

for problemid, inputoutput in input_output_dict.items():
    # Load the submission metadata of this problem and keep accepted submissions only
    metadata_path = f'./datasets/Project_CodeNet/metadata/{problemid}.csv'
    df = pd.read_csv(metadata_path)
    df = df[df.status == 'Accepted']
    if df.empty:
        continue

    local_entries = []
    for lang in languages_codenet:
        # Accepted submissions in this language that are smaller than 1 KB
        # (size is a proxy for token length, to stay within context limits)
        dflang = df[(df.language == lang) & (df.code_size < 1024)]
        if dflang.empty:
            # No parallel data in this language: the whole problem is dropped
            break

        # Pick one solution at random (fixed seed) and read its source file
        row = dflang.sample(n=1, random_state=1).iloc[0]
        submission_id = row['submission_id']
        with open(f'./datasets/Project_CodeNet/data/{problemid}/{lang}/{submission_id}.{extension_map[lang]}') as f:
            content = f.read()

        local_entries.append({
            'problem': problemid,
            'stdin_input_1': add_newline_if_missing(inputoutput[0]),
            'expected_output_1': inputoutput[1],
            'stdin_input_2': add_newline_if_missing(inputoutput[2]),
            'expected_output_2': inputoutput[3],
            'stdin_input_3': add_newline_if_missing(inputoutput[4]),
            'expected_output_3': inputoutput[5],
            'size_bytes': row['code_size'],
            'language': lang,
            'code': content,
        })
    else:
        # Reached only when the loop did not break, i.e. every language had a solution
        entries.extend(local_entries)

In [None]:
# One row per (problem, language) solution collected above
codenet_subset = pd.DataFrame(entries)

In [None]:
codenet_subset.shape

In [None]:
codenet_subset.head()

In [None]:
codenet_translation_pairs = []

#Build every ordered (source, target) pair of distinct languages for each problem found previously
for problem, group in codenet_subset.groupby('problem'):
    lang_lower = group['language'].str.lower()

    for source in languages_codenet:
        for target in languages_codenet:
            if source == target:
                continue

            # First solution of this problem in the source / target language
            input_row = group[lang_lower == source.lower()].iloc[0]
            target_row = group[lang_lower == target.lower()].iloc[0]

            #Sanity check for alignment: both rows must carry the same expected outputs
            for column in ('expected_output_1', 'expected_output_2', 'expected_output_3'):
                assert target_row[column] == input_row[column]

            codenet_translation_pairs.append({
                'task_id': problem,
                'source_lang': source,
                'input_code': input_row['code'],
                'target_lang': target,
                'ground_truth': target_row['code'],
                'stdin_input_1': input_row['stdin_input_1'],
                'expected_output_1': input_row['expected_output_1'],
                'stdin_input_2': input_row['stdin_input_2'],
                'expected_output_2': input_row['expected_output_2'],
                'stdin_input_3': input_row['stdin_input_3'],
                'expected_output_3': input_row['expected_output_3'],
            })

codenet_translation_df = pd.DataFrame(codenet_translation_pairs)

In [None]:
codenet_translation_df.shape

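#### We need at least 1013 samples from a population of 15660 translation pairs at a 99.9% confidence level and a 5% margin of error; the cell below draws 35 pairs for each (source, target) language combination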

In [None]:
# Stratify by (source, target) language combination and draw 35 pairs from each
# stratum; every draw uses the same fixed seed so the subset is reproducible.
codenet_stratified = codenet_translation_df.groupby(['source_lang', 'target_lang'])
codenet_dfs = [pairs.sample(n=35, random_state=1) for _, pairs in codenet_stratified]

codenet_pairs_subset = pd.concat(codenet_dfs).reset_index(drop=True)

In [None]:
codenet_pairs_subset.shape

In [None]:
codenet_pairs_subset.head()

In [None]:
# Persist every CodeNet translation pair as JSON lines (one pair per line)
codenet_translation_df.to_json('./datasets/codenet_pairs_all.jsonl', orient='records', lines=True)

In [None]:
# Persist the stratified CodeNet sample as JSON lines (one pair per line)
codenet_pairs_subset.to_json('./datasets/codenet_pairs_subset.jsonl', orient='records', lines=True)

## TransCoder Test Set (curated)

In [5]:
# Load the testable samples (JSON lines); the preview below shows one row per problem id
# with 'cpp', 'java' and 'python' columns
unitrans_df = pd.read_json('./datasets/FSE-24-UniTrans/cleaned_data/testable_samples.jsonl', orient='records', lines=True)

In [6]:
unitrans_df.head()

Unnamed: 0,id,cpp,java,python
0,ADD_1_TO_A_GIVEN_NUMBER,int addOne ( int x ) {\n int m = 1;\n while ...,static int addOne ( int x ) {\n int m = 1 ;\n...,def addOne ( x ) :\n m = 1\n while ( x &...
1,ADD_1_TO_A_GIVEN_NUMBER_1,int addOne ( int x ) {\n return ( - ( ~ x ) )...,static int addOne ( int x ) {\n return ( - ( ...,def addOne ( x ) :\n return ( - ( ~ x ) )
2,ANALYSIS_OF_ALGORITHMS_SET_2_ASYMPTOTIC_ANALYSIS,"int search ( int arr [ ], int n, int x ) {\n ...","static int search ( int arr [ ] , int n , int ...","def search ( arr , n , x ) :\n i = 0\n f..."
3,AREA_OF_THE_CIRCLE_THAT_HAS_A_SQUARE_AND_A_CIR...,float getArea ( int a ) {\n float area = ( M_...,static float getArea ( int a ) {\n float area...,def getArea ( a ) :\n area = ( math.pi * a ...
4,AREA_SQUARE_CIRCUMSCRIBED_CIRCLE,int find_Area ( int r ) {\n return ( 2 * r * ...,static int find_Area ( int r ) {\n return ( 2...,def find_Area ( r ) :\n return ( 2 * r * r )


In [21]:
unitrans_df.shape

(568, 4)

In [7]:
# Reshape from wide to long: one row per (id, language) with the code in 'value'
df_melted = unitrans_df.melt(id_vars=['id'], var_name='language', value_name='value')

In [8]:
df_melted.head()

Unnamed: 0,id,language,value
0,ADD_1_TO_A_GIVEN_NUMBER,cpp,int addOne ( int x ) {\n int m = 1;\n while ...
1,ADD_1_TO_A_GIVEN_NUMBER_1,cpp,int addOne ( int x ) {\n return ( - ( ~ x ) )...
2,ANALYSIS_OF_ALGORITHMS_SET_2_ASYMPTOTIC_ANALYSIS,cpp,"int search ( int arr [ ], int n, int x ) {\n ..."
3,AREA_OF_THE_CIRCLE_THAT_HAS_A_SQUARE_AND_A_CIR...,cpp,float getArea ( int a ) {\n float area = ( M_...
4,AREA_SQUARE_CIRCUMSCRIBED_CIRCLE,cpp,int find_Area ( int r ) {\n return ( 2 * r * ...


In [9]:
df_melted["id"].nunique()

568

In [10]:
translation_pairs = []
languages = ["python", "java", "cpp"]

# Dataset language key -> language name used in the output records
map_unitrans = {
    "python" : "Python",
    "java" : "Java",
    "cpp" : "C++"
}

# Dataset language key -> extension of the test script for that language
test_file_extension = {
    "python" : "py",
    "java" : "java",
    "cpp" : "cpp"
}

# Set the seed for reproducibility (`random` comes from the import cell at the
# top of the notebook). Nothing in this cell draws from the RNG.
seed = 1
random.seed(seed)

# One record per problem and per ordered (source, target) pair of distinct
# languages, provided that a test script exists for the target language.
for problem, group in df_melted.groupby("id"):
    language_lower = group['language'].str.lower()

    for _, row in group.iterrows():
        source = row['language']

        for target in languages:
            if source == target:
                continue

            test_path = f"./datasets/FSE-24-UniTrans/cleaned_data/transcoder_evaluation_gfg/{target}/{problem}.{test_file_extension[target]}"
            try:
                with open(test_path, "r") as f:
                    test_code = f.read()
            except FileNotFoundError:
                #Parallel data is missing for this problem. This is expected
                continue

            # Row of the same problem written in the target language
            target_row = group[language_lower == target.lower()].iloc[0]

            translation_pairs.append({
                'id': problem,
                'test_code': test_code,
                'source_lang': map_unitrans[source],
                'input_code': row['value'],
                'target_lang': map_unitrans[target],
                'ground_truth': target_row['value'],
                'target_signature': "f_filled",
            })

In [11]:
translation_df = pd.DataFrame(translation_pairs)

In [17]:
translation_df.shape

(2826, 7)

In [18]:
len(translation_df.groupby("id"))

568

In [19]:
# Stratify by (source, target) language combination and draw 175 pairs from each
# stratum with a fixed seed; the original row index is kept on purpose.
stratified = translation_df.groupby(['source_lang', 'target_lang'])
dfs = [pairs.sample(n=175, random_state=1) for _, pairs in stratified]

unitrans_pairs_subset = pd.concat(dfs)

In [20]:
unitrans_pairs_subset.head()

Unnamed: 0,id,test_code,source_lang,input_code,target_lang,ground_truth,target_signature
186,CHECK_LARGE_NUMBER_DIVISIBLE_11_NOT,"// Copyright (c) 2019-present, Facebook, Inc.\...",C++,int check ( string str ) {\n int n = str . le...,Java,static boolean check ( String str ) {\n int n...,f_filled
1457,LONGEST_REPEATING_SUBSEQUENCE,"// Copyright (c) 2019-present, Facebook, Inc.\...",C++,int findLongestRepeatingSubSeq ( string str ) ...,Java,static int findLongestRepeatingSubSeq ( String...,f_filled
1087,FIND_SUM_UNIQUE_SUB_ARRAY_SUM_GIVEN_ARRAY,"// Copyright (c) 2019-present, Facebook, Inc.\...",C++,"long long int findSubarraySum ( int arr [ ], i...",Java,"static int findSubarraySum ( int [ ] arr , int...",f_filled
631,C_PROGRAM_FACTORIAL_NUMBER,"// Copyright (c) 2019-present, Facebook, Inc.\...",C++,unsigned int factorial ( unsigned int n ) {\n ...,Java,static int factorial ( int n ) {\n if ( n == ...,f_filled
515,COUNT_PALINDROMIC_SUBSEQUENCE_GIVEN_STRING,"// Copyright (c) 2019-present, Facebook, Inc.\...",C++,int countPS(string str)\n{\n int N = str.leng...,Java,static int countPS ( String str ) {\n int N =...,f_filled


In [None]:
unitrans_pairs_subset.shape

In [None]:
# Persist the stratified sample and the full set of TransCoder pairs as JSON lines
unitrans_pairs_subset.to_json('./datasets/unitrans_dataset_subset.jsonl', orient='records', lines=True)
translation_df.to_json('./datasets/unitrans_dataset_all.jsonl', orient='records', lines=True)