# Data Pre-processing

In [1]:
import os

def all_paths_exist(paths: list[str])->bool:
    """Report whether every entry of *paths* exists on the filesystem.

    Args:
        paths: File or directory paths to check.

    Returns:
        True when all paths exist (also for an empty list), False as soon
        as one missing path is encountered.
    """
    return all(os.path.exists(candidate) for candidate in paths)

In [2]:
# Names of the input directories expected directly under the project directory.
CODE_DIR_NAME = 'CPP_Files'
LABEL_DIR_NAME = 'Labels'

# NOTE: '__file__' is a string literal here, not the module attribute, so
# realpath() resolves it against the current working directory and file_dir
# ends up being the working directory itself.
# NOTE(review): presumably done because notebooks do not define __file__ — confirm.
file_dir = os.path.dirname(os.path.realpath('__file__'))
# The project directory is taken to be the parent of file_dir.
proj_dir = os.path.dirname(file_dir)

code_dir = f'{proj_dir}/{CODE_DIR_NAME}'
label_dir = f'{proj_dir}/{LABEL_DIR_NAME}'

# Fail early if either input directory is missing.
if not all_paths_exist([code_dir, label_dir]):
    raise Exception(f'Could not find "{CODE_DIR_NAME}" and "{LABEL_DIR_NAME}" under "{proj_dir}"')

In [3]:
# Base names (extension removed) of every '.cpp' file found under code_dir.
file_names = []

# NOTE: os.walk also descends into sub-directories, but only the bare file
# name is kept, so a file found in a sub-directory will later be looked up
# directly under code_dir.
for _, _, files in os.walk(code_dir):
    for file in files:
        # str.endswith is safe for names of any length (slicing a short
        # string never raises either), so no exception handling is needed.
        if file.endswith('.cpp'):
            file_names.append(file[:-4])

print(file_names)

['additional_functions', 'function_data']


In [4]:
def set_dataset_dir(new_dir: str, base_path: str = proj_dir)->str:
    """Return the path ``{base_path}/{new_dir}``, creating the directory if needed.

    Args:
        new_dir: Name of the directory to place under *base_path*.
        base_path: Parent location; defaults to the project directory.

    Returns:
        The combined path as a string.
    """
    target = f'{base_path}/{new_dir}'

    # Nothing to create when something already exists at the target path.
    if os.path.exists(target):
        return target

    os.mkdir(target)
    return target

In [5]:
import pandas as pd
import tree_sitter_cpp as tscpp

from tree_sitter import Language, Parser

# Identifiers of the dataset variants that create_data_file can build; each
# one is also used as the name of the directory the dataset is written to.
LINE_DATASET = 'line_dataset'
CUTOFF_DATASET = 'cutoff_dataset'
AST_DATASET = 'ast_dataset'

# Tree-sitter parser configured with the C++ grammar; used to build the
# syntax trees stored in the AST dataset.
parser = Parser(Language(tscpp.language()))

def create_data_file(dataset_name: str)->None:
    """Build one dataset variant and write it to ``<dataset_name>/data.csv``.

    For every name in ``file_names`` the C++ source ``{code_dir}/{name}.cpp``
    and its label file ``{label_dir}/{name}.txt`` are read; the label file
    must contain one integer label per source line. One row is produced per
    source line, with the columns ``line``, ``code`` and ``label``:

    * ``LINE_DATASET``: ``line`` is the raw source line, ``code`` is the
      whole file (stripped lines joined by tabs).
    * ``CUTOFF_DATASET``: ``line`` is empty, ``code`` is the file up to and
      including the current line (stripped lines joined by tabs).
    * ``AST_DATASET``: ``line`` is the string form of the tree-sitter root
      node parsed from the file up to and including the current line,
      ``code`` is that same prefix joined by tabs.

    Args:
        dataset_name: One of ``LINE_DATASET``, ``CUTOFF_DATASET`` or
            ``AST_DATASET``.

    Raises:
        Exception: If *dataset_name* is unknown, or a source/label file is
            missing.
        AssertionError: If a source file and its label file differ in their
            number of lines.
        ValueError: If a label line cannot be converted to ``int``.
    """
    # Validate up front so an unknown name never creates a directory or an
    # empty CSV, and is rejected even when there are no files to process.
    if dataset_name not in (LINE_DATASET, CUTOFF_DATASET, AST_DATASET):
        raise Exception(f'Dataset name "{dataset_name}" unknown.')

    dataset_dir = set_dataset_dir(dataset_name)
    data_file = f'{dataset_dir}/data.csv'
    container = []

    for name in file_names:
        cpp_file = f'{code_dir}/{name}.cpp'
        txt_file = f'{label_dir}/{name}.txt'

        if not all_paths_exist([cpp_file, txt_file]):
            raise Exception(f'Could not find {cpp_file} and {txt_file}')

        # Only the inputs are opened here; the output file is written once
        # at the end, so a failure part-way through does not leave a
        # truncated data.csv behind.
        with open(cpp_file, 'r') as code, open(txt_file, 'r') as labels:
            code_lines = code.read().splitlines()
            label_lines = labels.readlines()

        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if len(code_lines) != len(label_lines):
            raise AssertionError(
                f'{cpp_file} has {len(code_lines)} lines but {txt_file} '
                f'has {len(label_lines)} labels'
            )

        # Loop-invariant work: strip every line once and join the full file
        # once instead of repeating both for each row.
        stripped_lines = [line.strip() for line in code_lines]
        full_code_block = '\t'.join(stripped_lines)

        for i, (raw_line, raw_label) in enumerate(zip(code_lines, label_lines)):
            curr_label = int(raw_label)

            if dataset_name == LINE_DATASET:
                curr_line = raw_line
                curr_code_block = full_code_block
            elif dataset_name == CUTOFF_DATASET:
                curr_line = ''
                curr_code_block = '\t'.join(stripped_lines[:i + 1])
            else:  # AST_DATASET (the name was validated above)
                reduced_code_block = stripped_lines[:i + 1]
                tree = parser.parse(bytes('\n'.join(reduced_code_block), encoding='utf-8'))

                curr_line = str(tree.root_node)
                curr_code_block = '\t'.join(reduced_code_block)

            container.append([curr_line, curr_code_block, curr_label])

    df = pd.DataFrame(container, columns=['line', 'code', 'label'])
    df.to_csv(data_file)

In [6]:
# Generate every dataset variant, in the same order as before.
for _dataset in (LINE_DATASET, CUTOFF_DATASET, AST_DATASET):
    create_data_file(_dataset)