In [1]:
import sys  
sys.path.insert(0, './utils')

In [2]:
import seaborn as sns
import pandas as pd
import json, os
from tqdm import tqdm
import queue
import code_tokenize as ctok
from transformers import AutoTokenizer
import math
import torch
from itertools import chain
import random
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identifier of the pretrained checkpoint whose tokenizer is used for sub-tokenization.
model_tag = "microsoft/codebert-base"
# Module-level tokenizer; map_tokens_to_hf_idx reads it as a global.
hf_tokenizer = AutoTokenizer.from_pretrained(model_tag)

In [4]:
def save_jsonl(filename, data_list):
    """
    Write dictionaries to disk in JSON Lines (JSONL) format.

    Every dictionary is serialized as one JSON document followed by a newline.
    The file is opened in write mode, so existing content is replaced.

    Args:
        filename (str): Path of the JSONL file to write.
        data_list (list): Dictionaries to serialize, in output order.
    """
    with open(filename, 'w') as sink:
        for record in data_list:
            # One record per line: the JSON document first, then the line terminator.
            json.dump(record, sink)
            sink.write('\n')

In [5]:
def read_jsonl(file_path, split=''):
    """
    Load a JSON Lines (JSONL) file into a list of parsed records.

    Args:
        file_path (str): Path of the JSONL file to read.
        split (str): Currently unused; kept so that callers passing it keep working.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    content = []
    # Read as UTF-8 explicitly instead of relying on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        # Iterate over the file object so the lines are streamed rather than
        # all loaded into memory first.
        for line in jsonl_file:
            # Blank lines (e.g. an extra empty line at the end of the file) carry
            # no record and would make json.loads fail, so they are skipped.
            if not line.strip():
                continue
            content.append(json.loads(line))
    return content

In [6]:
def group_consecutive_repeated_ranges(lst):
    """
    Split a sequence into runs of equal neighbouring elements.

    Args:
        lst: Indexable sequence with a length.

    Returns:
        list: `(first_index, last_index)` tuples (both inclusive), one per run of
        consecutive equal elements, in order. Empty input gives an empty list.
    """
    total = len(lst)
    if total == 0:
        return []

    # Positions at which an element differs from its predecessor, i.e. run starts.
    boundaries = [pos for pos in range(1, total) if lst[pos] != lst[pos - 1]]

    run_starts = [0] + boundaries
    # Each run ends right before the next one starts; the last run ends at the end.
    run_ends = [pos - 1 for pos in boundaries] + [total - 1]
    return list(zip(run_starts, run_ends))

In [7]:
def group_indices(tokens, raw_tokens):
    """
    Adapted from: Wan et al. (https://github.com/CGCL-codes/naturalcc)

    Assign to every sub-token the index of the raw token it belongs to.

    Sub-tokens are concatenated, with any leading 'Ġ' characters removed, until
    the accumulated text equals the current raw token; the next raw token is
    then matched against the following sub-tokens.

    Args:
        tokens: Sub-tokens (strings), in order.
        raw_tokens: Original tokens (strings) that the sub-tokens spell out.

    Returns:
        torch.Tensor: One raw-token index per entry of `tokens`.

    Raises:
        Exception: If not every raw token was matched once `tokens` is consumed.
    """
    marker = 'Ġ'
    owner_ids = []
    raw_pos = 0
    pending = ''

    for piece in tokens:
        # Drop every leading marker character before accumulating the text.
        pending += piece.lstrip(marker)
        owner_ids.append(raw_pos)
        if pending == raw_tokens[raw_pos]:
            # Current raw token fully reconstructed: move on to the next one.
            raw_pos += 1
            pending = ''

    if raw_pos != len(raw_tokens):
        raise Exception(f'Token mismatch: \n{tokens}\n{raw_tokens}')
    return torch.tensor(owner_ids)

In [8]:
def map_tokens_to_hf_idx(code_tokens):
    """
    Map every source token to the positions of its Huggingface sub-tokens.

    The tokens are joined with single spaces and sub-tokenized with the
    module-level `hf_tokenizer`. Each original token is then paired with the
    consecutive positions of the sub-tokens it was split into. Positions are
    1-based because a [CLS] token is added at the beginning later on.

    Args:
        code_tokens (list): Source-code tokens (strings) of one sample.

    Returns:
        tuple: `(hf_mapping, hf_tokens)`. `hf_mapping` is a list with one
        `[token, [position, ...]]` entry per element of `code_tokens`, and
        `hf_tokens` is the list of sub-tokens. `([], [])` is returned when some
        token cannot be reproduced from its sub-tokens.

    Raises:
        Exception: Propagated from `group_indices` when the sub-tokens cannot be
            aligned with `code_tokens`.
    """
    # Tokenize the joined text once. A former `encode_plus` call on the same text
    # was removed: its result was never read, so it only repeated the work.
    hf_tokens = hf_tokenizer.tokenize(' '.join(code_tokens))
    sub_tokenized_ids = group_indices(hf_tokens, code_tokens)
    grouped = group_consecutive_repeated_ranges(sub_tokenized_ids)

    hf_mapping = []
    idx = 0  # Last position handed out; starts at 0 because [CLS] will be added at the beginning later on
    for i, tok in enumerate(code_tokens):
        lower, upper = grouped[i]
        width = (upper - lower) + 1
        # The next `width` consecutive positions belong to this token.
        hf_idx = list(range(idx + 1, idx + width + 1))
        idx += width
        hf_mapping.append([tok, hf_idx])

    # Ref: https://github.com/huggingface/tokenizers/issues/266
    # With CodeBERT's tokenizer, if a token contains string quotes such as "foo bar", it is not
    # possible to revert its sub-tokenization. Hence the sample is discarded when the
    # detokenization process fails.

    # Sanity check: for every original token, detokenize the sub-tokens assigned to it
    # and verify that the result (with spaces removed) equals the token itself.
    for i, tok in enumerate(code_tokens):
        positions = hf_mapping[i][1]
        l_bound = positions[0] - 1  # Undo the [CLS] offset to index into hf_tokens
        u_bound = positions[-1]
        decoded_token = hf_tokenizer.convert_tokens_to_string(hf_tokens[l_bound:u_bound]).replace(' ', '')
        if decoded_token != tok:
            return [], []

    return hf_mapping, hf_tokens

In [9]:
# JSONL files to load; each entry is read with read_jsonl in the next cell.
data_paths = [os.path.join("..", "raw_data", "train.json")]

In [10]:
# One list of parsed records per input file (a list of lists).
data = list(map(read_jsonl, data_paths))

In [11]:
# Flatten the per-file lists into a single list of records.
# NOTE(review): this rebinds `data`, so executing this cell a second time without
# re-running the loading cell flattens the already-flat list again.
data = list(chain(*data))

In [12]:
data[0].keys()

dict_keys(['length', 'code_tokens', 'code', 'feature_map'])

In [13]:
len(data)

50876

In [14]:
# Valid samples are those for which ctok yields as many tokens as there are in the original code_tokens
# (only the token counts are compared). This matters because ctok is used for token type tagging.
valid_samples = []

In [15]:
# ctok tokens whose string form is one of these markers are dropped before counting
# (#NEWLINE#, #DEDENT# and #INDENT#). Defined once instead of on every iteration.
removable_tokens = ["#NEWLINE#", "#DEDENT#", "#INDENT#"]

# Start from an empty list so that re-running this cell does not append duplicates.
valid_samples = []

for code in tqdm(data):
    src_code = code['code']
    try:
        ctok_tokens = [
            tok for tok in ctok.tokenize(src_code, lang="python")
            if str(tok) not in removable_tokens
        ]
    except Exception:
        # A sample that ctok fails to tokenize counts as having no tokens.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit through, so the loop can still be interrupted.
        ctok_tokens = []
    original_tokens = code['code_tokens']
    # Keep the sample only when both tokenizations have the same number of tokens.
    if len(ctok_tokens) == len(original_tokens):
        valid_samples.append(code)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 50876/50876 [00:23<00:00, 2194.55it/s]


In [16]:
len(valid_samples)

49380

In [17]:
detokenizable_samples = []

In [18]:
# Attach the sub-token mapping to every sample for which it could be computed.
for sample in tqdm(valid_samples):
    hf_mapping, hf_tokens = map_tokens_to_hf_idx(sample['code_tokens'])
    # Both results are empty when the mapping failed; such samples are left out.
    if len(hf_mapping) and len(hf_tokens):
        sample['hf_mapping'] = hf_mapping
        sample['hf_tokens'] = hf_tokens
        detokenizable_samples.append(sample)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 49380/49380 [00:33<00:00, 1456.16it/s]


In [19]:
len(detokenizable_samples)

49380

In [20]:
# The reason is that strings with quotation marks cannot be reverted from the set of Huggingface tokens back to
# the original set of tokens
samples_with_no_comments = []

In [21]:
# Keep only the samples whose source code contains no comment tokens
for code in tqdm(detokenizable_samples):
    src_code = code['code']
    ctok_tokens = ctok.tokenize(src_code, lang="python")
    ctok_types = {tok.type for tok in ctok_tokens}
    # A sample qualifies when none of the comment token types occurs in it.
    if ctok_types.isdisjoint(('line_comment', 'block_comment', 'comment')):
        samples_with_no_comments.append(code)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 49380/49380 [00:21<00:00, 2278.97it/s]


In [22]:
len(samples_with_no_comments)

49380

In [23]:
# Categories of Python tokens:
# 1) Keywords
# 2) Identifiers
# 3) Literals
# 4) Operators
# 5) Symbols
# The entries of the lists below are compared against the `type` of ctok tokens.
keywords = [
    'False', 'await', 'else', 'import', 'pass',
    'None', 'break', 'except', 'in', 'raise',
    'True', 'class', 'finally', 'is', 'return',
    'and', 'continue', 'for', 'lambda', 'try',
    'as', 'def', 'from', 'nonlocal', 'while',
    'assert', 'del', 'global', 'not', 'with',
    'async', 'elif', 'if', 'or', 'yield',
    'not in', 'is not', 'none',
    'print',  # caveat: print is no longer a keyword in Python 3
]
operators = [
    '+', '-', '*', '**', '/', '//', '%', '@',
    '<<', '>>', '&', '|', '^', '~', ':=',
    '<', '>', '<=', '>=', '==', '!=',
    'unary_operator',
]
symbols = [
    "(", ")", "[", "]", "{", "}",
    ",", ":", ".", ";", "=", "->",
    "+=", "-=", "*=", "/=", "//=", "%=", "@=",
    "&=", "|=", "^=", ">>=", "<<=", "**=",
    "line_continuation", "ellipsis",
]
literals = ["float", "integer", "true", "false"]

In [24]:
# Numeric id of each token category: -1 for the first name, down to -5 for the last.
type_to_id = {
    category: -rank
    for rank, category in enumerate(
        ('Keyword', 'Operator', 'SpecialSymbol', 'Literal', 'Identifier'),
        start=1,
    )
}

In [None]:
# ctok tokens whose string form is one of these markers are not tagged.
removable_tokens = ["#NEWLINE#", "#DEDENT#", "#INDENT#"]

# Lookup from ctok token type to category name, built once ahead of the loop.
# `setdefault` keeps the first category should a type ever be listed twice.
type_to_category = {}
for category, type_names in (
    ("Keyword", keywords),
    ("Operator", operators),
    ("SpecialSymbol", symbols),
    ("Literal", literals),
    ("Identifier", ["identifier"]),
):
    for type_name in type_names:
        type_to_category.setdefault(type_name, category)

# Tag every token of every sample with its category name and numeric id.
# Results are stored on the sample under "tokens" and "cat_ids".
for sample in tqdm(samples_with_no_comments):
    src_code = sample['code']
    ctok_tokens = [
        tok for tok in ctok.tokenize(src_code, lang="python")
        if str(tok) not in removable_tokens
    ]
    token2cat = []
    cat_ids = []
    for tok in ctok_tokens:
        category = type_to_category.get(tok.type)
        if category is None:
            # Fail loudly instead of silently skipping the token and stopping the
            # loop: otherwise all remaining samples stay untagged while the
            # following cells still run and save them.
            raise ValueError(
                f"Unsupported token type {tok.type!r} for token {str(tok)!r}; "
                "add it to keywords, operators, symbols or literals"
            )
        token2cat.append([str(tok), category])
        # Ids come from type_to_id so that names and ids cannot drift apart.
        cat_ids.append(type_to_id[category])

    sample["tokens"] = token2cat
    sample["cat_ids"] = cat_ids
    # Invariant: exactly one tag per token.
    assert len(token2cat) == len(ctok_tokens)

 57%|█████████████████████████████████████████████████████████▍                                           | 28056/49380 [00:18<00:13, 1531.83it/s]

In [None]:
# Keep only the samples that have at most 512 Huggingface sub-tokens.
# NOTE(review): `hf_tokens` does not appear to include special tokens; if the 512 limit
# of the downstream model counts them as well, this bound may need to be lower — confirm.
# NOTE(review): `sample` is also the loop variable (a single dict) of earlier cells,
# while here it is rebound to a list of samples.
sample = list(filter(lambda x: len(x['hf_tokens']) <= 512, samples_with_no_comments))

In [None]:
len(sample)

In [29]:
# Draw the 5000 study samples from a dedicated generator seeded with 0 (the same
# seed as the global one set next to the imports). The selection then no longer
# depends on how much of the global `random` state was consumed before this cell
# ran, so re-executing the cell yields the same samples.
# Raises ValueError when fewer than 5000 samples are available.
study_sample = random.Random(0).sample(sample, 5000)

In [None]:
# Write the study sample as JSON Lines. The output directory is created first,
# because opening the file for writing fails when the directory is missing.
output_dir = os.path.join("..", "data")
os.makedirs(output_dir, exist_ok=True)
save_jsonl(os.path.join(output_dir, "5k_csn_python.jsonl"), study_sample)