In [1]:
import sys  
sys.path.insert(0, './utils')

In [2]:
import seaborn as sns
import pandas as pd
import json, os
from tqdm import tqdm
import queue
import code_tokenize as ctok
import math
import random

In [None]:
def group_consecutive_repeated_ranges(lst):
    """Collapse runs of equal neighbouring items into inclusive index ranges.

    Args:
        lst: Sequence whose adjacent elements are compared with ``!=``.

    Returns:
        A list of ``(first_index, last_index)`` tuples, one per run of
        consecutive equal elements, in order. An empty input yields ``[]``.
    """
    total = len(lst)
    if total == 0:
        return []

    # A run starts at position 0 and at every position that differs from its predecessor.
    run_starts = [0] + [pos for pos in range(1, total) if lst[pos] != lst[pos - 1]]
    # A run ends right before the next one starts; the final run ends at the last index.
    run_ends = [start - 1 for start in run_starts[1:]] + [total - 1]

    return list(zip(run_starts, run_ends))

In [None]:
def map_tokens_to_hf_idx(code_tokens):
    """Map each code token to the positions of its HuggingFace sub-tokens.

    The tokens are joined with single spaces and tokenized with the global
    ``hf_tokenizer`` (it is not created in this function and must already
    exist in the notebook namespace).

    Args:
        code_tokens: List of code-token strings.

    Returns:
        A pair ``(hf_mapping, hf_tokens)``. ``hf_mapping`` is a list of
        ``[token, [position, ...]]`` entries whose positions are 1-based
        (position 0 is left free for a leading special token), and
        ``hf_tokens`` is the sub-token list of the joined string.
        ``([], [])`` is returned when the mapping cannot be built or when a
        token cannot be recovered from its sub-tokens, so the caller can
        discard the sample.
    """
    joined_code = ' '.join(code_tokens)
    hf_tokens = hf_tokenizer.tokenize(joined_code)
    enc_plus = hf_tokenizer.encode_plus(joined_code)
    # Drop the first and last word ids. The original note says these are None;
    # presumably they belong to the special tokens added by encode_plus -
    # TODO confirm for the tokenizer in use.
    sub_tokenized_ids = enc_plus.word_ids()[1:-1]
    grouped = group_consecutive_repeated_ranges(sub_tokenized_ids)

    # Fewer word groups than code tokens would raise IndexError below; such a
    # sample cannot be aligned, so it is discarded like any other failure.
    if len(grouped) < len(code_tokens):
        return [], []

    hf_mapping = []
    idx = 0  # Positions start at 1 because [CLS] is added at the beginning later on
    for tok, (lower, upper) in zip(code_tokens, grouped):
        width = (upper - lower) + 1
        hf_mapping.append([tok, list(range(idx + 1, idx + width + 1))])
        idx += width

    # Ref: https://github.com/huggingface/tokenizers/issues/266
    # With CodeBERT's tokenizer, a token containing string quotes such as "foo bar"
    # cannot have its sub-tokenization reverted. The sample is therefore discarded
    # when the detokenization check below fails.

    # Sanity check: decode the sub-tokens assigned to every code token and make
    # sure that, with spaces removed, they reproduce the token exactly.
    for tok, (_, positions) in zip(code_tokens, hf_mapping):
        l_bound = positions[0] - 1  # Back to 0-based indexing into hf_tokens ([CLS] offset)
        u_bound = positions[-1]
        decoded_token = hf_tokenizer.convert_tokens_to_string(hf_tokens[l_bound:u_bound]).replace(' ', '')
        if decoded_token != tok:
            return [], []

    return hf_mapping, hf_tokens

In [3]:
def save_jsonl(filename, data_list):
    """
    Write dictionaries to a file in JSON Lines (JSONL) format.

    Each dictionary is serialized on its own line; an existing file is
    overwritten.

    Args:
        filename (str): Path of the JSONL file to create.
        data_list (list): Dictionaries to serialize, one per output line.
    """
    with open(filename, 'w') as out_handle:
        out_handle.writelines(json.dumps(record) + '\n' for record in data_list)

In [4]:
def read_jsonl(file_path, split=''):
    """
    Load a JSON Lines (JSONL) file into a list of parsed objects.

    Args:
        file_path (str): Path of the JSONL file to read.
        split (str): Unused; kept so existing callers keep working.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    content = []
    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        # Iterate lazily instead of loading every line with readlines().
        for line in jsonl_file:
            # Blank lines (e.g. a trailing newline) are not JSON documents; skip them.
            if not line.strip():
                continue
            content.append(json.loads(line))

    return content

In [5]:
data_path = os.path.join("..", "raw_data", "csn_java_65k.jsonl")

In [6]:
data = read_jsonl(data_path)

In [7]:
len(data)

65000

In [8]:
data[0].keys()

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])

In [9]:
print(data[3]['whole_func_string'])

public static void configureFiles(Iterable<File> files)
	{
		for (File file : files)
		{
			if (file != null && file.exists() && file.canRead())
			{
				setup(file);
				return;
			}
		}

		System.out.println("(No suitable log config file found)");
	}


In [10]:
# Types of java tokens:
# 1) Keywords
# 2) Identifiers
# 3) Literals
# 4) Operators
# 5) Separators
#
# The three lists below are used for membership tests against `tok.type`
# when the tokens of each sample are assigned a category later in the notebook.
keywords = ['abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extends', 'final', 'finally', 'float', 'for', 'goto', 'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long', 'native', 'new', 'package', 'private', 'protected', 'public', 'return', 'short', 'static', 'strictfp', 'super', 'switch', 'synchronized', 'this', 'throw', 'throws', 'transient', 'try', 'void', 'volatile', 'while']
# NOTE: '+' and '-' are listed under both the arithmetic and the unary heading;
# the duplicates do not affect membership tests.
operators = [
# Arithmetic Operators
'+', '-', '*', '/', '%',
# Unary Operators
'+', '-', '++', '--',
# Assignment Operators
'=', '+=', '-=', '*=', '/=', '%=', '<<=', '>>=', '>>>=', '&=', '^=', '|=',
# Relational Operators
'==', '!=', '>', '<', '>=', '<=',
# Logical Operators
'&&', '||', '!',
# Bitwise Operators
'&', '|', '^', '~', '<<', '>>', '>>>',
# Ternary Operator
'?', ':',
# Other operators
'.', '...', '::'
]
# Separators and other special symbols (this list also holds the quote characters, '@' and '->').
symbols = [";", ',', "{", "}", "(", ")", "[", "]", "'", '"', "@", "->"]

In [11]:
len(keywords)

50

In [12]:
diffs = []

In [13]:
def class_wrapper(code):
    """Wrap a Java snippet in a dummy class declaration.

    Used so that ctok stops complaining when it is given a bare method.

    Args:
        code (str): Java source text to place inside the class body.

    Returns:
        str: ``code`` surrounded by ``class Dummy {`` and ``}``, each on its own line.
    """
    return "class Dummy {\n" + code + "\n}"


In [None]:
# Compare the number of tokens produced by code_tokenize with the dataset's own
# `func_code_tokens` for every function. `diffs` receives one entry per sample:
# the absolute difference in token counts, or -1 when tokenization failed or
# produced no tokens.
for code in tqdm(data):
    wrapped_code = class_wrapper(code['whole_func_string'])
    try:
        # [3:-1] drops the tokens contributed by the class_wrapper scaffold
        # (presumably `class`, `Dummy`, `{` in front and the closing `}`).
        ctok_tokens = ctok.tokenize(wrapped_code, lang="java")[3:-1]
    except Exception:
        # Only tokenizer errors are treated as "no tokens"; KeyboardInterrupt
        # and SystemExit are no longer swallowed, so the loop can be interrupted.
        ctok_tokens = []
    original_tokens = code['func_code_tokens']
    diff = abs(len(ctok_tokens) - len(original_tokens))
    if not len(ctok_tokens):
        diffs.append(-1)
    else:
        diffs.append(diff)

 99%|████████████████████████████████████████████████████████████████████████████████████████████████████ | 64420/65000 [03:09<00:00, 1033.25it/s]

In [None]:
idx = []

In [16]:
# Record the position of every sample whose token counts disagree (or that failed to tokenize).
idx.extend(position for position, gap in enumerate(diffs) if gap != 0)

In [17]:
len(idx)

5035

In [18]:
filtered_data = []

In [19]:
# `idx` is a list, so `i not in idx` rescans it for every sample; a set makes
# each membership test constant-time and the whole pass linear.
excluded_positions = set(idx)
for i, data_sample in enumerate(data):
    if i not in excluded_positions:
        filtered_data.append(data_sample)

In [20]:
len(filtered_data)

59965

In [21]:
new_diffs = []

In [None]:
# Re-run the token-count comparison on the filtered samples. `new_diffs`
# receives one entry per sample: the absolute difference in token counts, or
# -1 when tokenization failed or produced no tokens.
for code in tqdm(filtered_data):
    wrapped_code = class_wrapper(code['whole_func_string'])
    try:
        # [3:-1] drops the tokens contributed by the class_wrapper scaffold
        # (presumably `class`, `Dummy`, `{` in front and the closing `}`).
        ctok_tokens = ctok.tokenize(wrapped_code, lang="java")[3:-1]
    except Exception:
        # Only tokenizer errors are treated as "no tokens"; KeyboardInterrupt
        # and SystemExit are no longer swallowed, so the loop can be interrupted.
        ctok_tokens = []
    original_tokens = code['func_code_tokens']
    diff = abs(len(ctok_tokens) - len(original_tokens))
    if not len(ctok_tokens):
        new_diffs.append(-1)
    else:
        new_diffs.append(diff)

 45%|█████████████████████████████████████████████▎                                                       | 26903/59965 [00:25<00:29, 1114.30it/s]

In [25]:
new_idx = []

In [26]:
# Collect the positions of any filtered samples that still show a token-count mismatch.
new_idx.extend(position for position, gap in enumerate(new_diffs) if gap != 0)

In [27]:
assert len(new_idx) == 0

In [28]:
samples_with_no_comments = []

In [29]:
# Keep only the samples whose token stream contains no comment token
for code in tqdm(filtered_data):
    wrapped_code = class_wrapper(code['whole_func_string'])
    ctok_tokens = ctok.tokenize(wrapped_code, lang="java")[3:-1]
    ctok_types = {tok.type for tok in ctok_tokens}
    # A sample is rejected as soon as any of the comment token types is present
    has_comment = not ctok_types.isdisjoint(('line_comment', 'block_comment', 'comment'))
    if not has_comment:
        samples_with_no_comments.append(code)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 59965/59965 [00:59<00:00, 1004.10it/s]


In [30]:
len(samples_with_no_comments)

46787

In [31]:
# Integer id of each token category; these are the values stored in every
# sample's `cat_ids` list by the categorisation loop.
type_to_id = {
    'Keyword': -1,
    'Operator': -2,
    'SpecialSymbol': -3,
    'Literal': -4,
    'Identifier': -5
}

In [15]:
temp_types = set()

In [32]:
# Assign every token of every comment-free sample to exactly one category and
# store the result on the sample as `tokens` ([text, category] pairs) and
# `cat_ids` (the matching ids from `type_to_id`).
for sample in tqdm(samples_with_no_comments):
    wrapped_code = class_wrapper(sample['whole_func_string'])
    ctok_tokens = ctok.tokenize(wrapped_code, lang="java")[3:-1]
    token2cat = []
    cat_ids = []
    for tok in ctok_tokens:
        tok_type = tok.type
        # An if/elif chain guarantees that one token yields at most one entry, so
        # a double match can no longer compensate for a missed token in the
        # length check below.
        if tok_type in keywords or ('boolean_type' in tok_type) or ('void_type' in tok_type):
            # boolean type (i.e. boolean k = ...) and void (i.e. private void ..) count as keywords
            category = "Keyword"
        elif tok_type in operators:
            category = "Operator"
        elif tok_type in symbols:
            category = "SpecialSymbol"
        elif '_literal' in tok_type or tok_type in ('true', 'false'):
            category = "Literal"
        elif 'identifier' in tok_type:
            category = "Identifier"
        else:
            # No category matched: leave the token out so the assertion below fails
            continue
        token2cat.append([str(tok), category])
        cat_ids.append(type_to_id[category])
    # Validate before touching the sample so a failing sample is not half-updated
    assert len(token2cat) == len(ctok_tokens), "some tokens matched no category"
    sample["tokens"] = token2cat
    sample["cat_ids"] = cat_ids

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 46787/46787 [00:57<00:00, 817.95it/s]


In [None]:
detokenizable_samples = []

In [None]:
# Attach the sub-token mapping to every sample for which it could be built;
# samples that come back with an empty mapping or empty token list are dropped.
for sample in tqdm(samples_with_no_comments):
    hf_mapping, hf_tokens = map_tokens_to_hf_idx(sample['func_code_tokens'])
    if len(hf_mapping) and len(hf_tokens):
        sample['hf_mapping'] = hf_mapping
        sample['hf_tokens'] = hf_tokens
        detokenizable_samples.append(sample)

In [None]:
# Keep the samples that have at most 512 sub-tokens.
# NOTE(review): `hf_tokens` is the sub-token list stored on each sample; if special
# tokens such as [CLS] are added later, the encoded length may exceed 512 - confirm
# the intended limit.
under_512 = [sample for sample in detokenizable_samples if len(sample['hf_tokens']) <= 512]

In [None]:
# Draw the 5000-sample study subset from a dedicated, seeded generator so that a
# re-run of the notebook selects the same samples (the global `random` state is
# left untouched). Raises ValueError if fewer than 5000 samples are available.
study_sample = random.Random(42).sample(under_512, 5000)

In [None]:
# Write the sampled subset to ../data/5k_csn_java.jsonl using the save_jsonl helper.
save_jsonl(os.path.join("..", "data", "5k_csn_java.jsonl"), study_sample)