# tokenization methods

In [1]:

from nltk.tokenize import MWETokenizer, TweetTokenizer, WordPunctTokenizer 
from nltk.tokenize import word_tokenize
import re

# Java operator / punctuation tables.
# Every backslash is written as an explicit "\\" escape: the previous '\*'
# spelling relied on an invalid escape sequence, which newer Python versions
# report as a SyntaxWarning. The runtime value (backslash + '*') is unchanged.
# NOTE(review): VALUES is not referenced in the visible part of this file.
VALUES = ['>>>=', '>>=', '<<=',  '%=', '^=', '|=', '&=', '/=',
                  '*=', '-=', '+=', '<<', '--', '++', '||', '&&', '!=',
                  '>=', '<=', '==', '%', '^', '|', '&', '/', '*', '-',
                  '+', ':', '?', '~', '!', '<', '>', '=', '...', '->', '::', '\\', '\\\\', '\\*', '*\\', '\\\\\\']
# Binary (infix) operators.
INFIX = ['||', '&&', '|', '^', '&', '==', '!=', '<', '>', '<=', '>=', 
             '<<', '>>', '>>>', '+', '-', '*', '/', '%']
# Unary operators written before / after their operand.
PREFIX = ['++', '--', '!', '~', '+', '-']
POSTFIX = ['++', '--']
# Plain and compound assignment operators.
ASSIGNMENT = ['=', '+=', '-=', '*=', '/=', '&=', '|=', '^=', '%=', '<<=', '>>=', '>>>=']
LAMBDA = ['->']
# Comment delimiters.
COMMENT = ['//', '/*', '*/']
METHOD_REFERENCE = ['::']

# Maps runs of whitespace to placeholder tokens. Longer runs are listed before
# shorter ones; white_space_tokenize() applies the replacements in this order.
WHITESPACE_DICT = {
    "                ": '<|16-s|>',
    "            ": '<|12-s|>',
    "        ": '<|8-s|>',
    "    ": '<|4-s|>',
    "  ": '<|2-s|>',
    " ": '<|s|>',
    "\t\t\t\t": '<|4-t|>',
    "\t\t\t": '<|3-t|>',
    "\t\t": '<|2-t|>',
    "\t": '<|t|>',
    "\n": '<|nl|>',
}

# Placeholder tokens only, in the same order as the table above.
WHITESPACES = list(WHITESPACE_DICT.values())

# Inverse table: placeholder token -> the whitespace it stands for.
REVERSE_WHITESPACE_DICT = {
    placeholder: whitespace for whitespace, placeholder in WHITESPACE_DICT.items()
}

# Structural marker tokens that must survive tokenization as single tokens.
CUSTOM_TOKEN = [
    "<|startcode|>", "<|endcode|>",
    "<|startfocus|>", "<|endfocus|>",
    "<|startcomment|>", "<|endcomment|>",
    "<|stringliteral|>", "<|singlelinecomment|>", "<|multilinecomment|>",
    "<|del|>",
]

# De-duplicated list of every operator spelling from the tables above.
values = list(set(INFIX + PREFIX + POSTFIX + ASSIGNMENT + LAMBDA + COMMENT + METHOD_REFERENCE))

word_punct_tokenizer = WordPunctTokenizer()
tweet_tokenizer = TweetTokenizer()

# Each marker / whitespace placeholder is registered as the sequence of pieces
# WordPunctTokenizer breaks it into, so mwe_tokenizer can glue it back together.
token_phrases = [
    tuple(word_punct_tokenizer.tokenize(tok)) for tok in CUSTOM_TOKEN + WHITESPACES
]

# A MWETokenizer with no registered phrases is used to break each operator
# string into pieces (presumably single characters -- confirm against NLTK);
# only operators that yield more than one piece need to be re-merged.
_operator_splitter = MWETokenizer()
for w in values:
    pieces = tuple(_operator_splitter.tokenize(w))
    if len(pieces) > 1:
        token_phrases.append(pieces)

# Re-joins the registered phrases with an empty separator.
mwe_tokenizer = MWETokenizer(token_phrases, separator="")

def state(c):
    """Classify a single character into a numeric character class.

    Returns:
        1 for ASCII lower-case letters, 2 for ASCII upper-case letters,
        3 for ASCII digits, 4 for whitespace (as reported by ``str.isspace``),
        5 for ``_`` and ``$``, 6 for any other code point below 128,
        7 for everything else.
    """
    code = ord(c)
    if 97 <= code <= 122:  # 'a'..'z'
        return 1
    if 65 <= code <= 90:  # 'A'..'Z'
        return 2
    if 48 <= code <= 57:  # '0'..'9'
        return 3
    # Checked before the ASCII test, so non-ASCII whitespace is also class 4.
    if c.isspace():
        return 4
    if c in ('_', '$'):
        return 5
    return 6 if code < 128 else 7

def space_up(s):
    """Insert a space at identifier sub-word boundaries.

    A space is added between two adjacent characters when their ``state``
    classes form one of these transitions: letter -> digit, digit -> letter,
    lower-case -> upper-case, letter/digit -> ``_``/``$`` and
    ``_``/``$`` -> letter/digit. All other characters are copied unchanged.

    Returns "" for ``None`` or an empty string.
    """
    if s is None or s == "":
        return ""
    # (class of previous char, class of current char) pairs that get a space.
    split_pairs = {
        (1, 3), (2, 3),          # letter -> digit
        (1, 2),                  # lower -> upper (camelCase hump)
        (3, 1), (3, 2),          # digit -> letter
        (1, 5), (2, 5), (3, 5),  # letter/digit -> '_' or '$'
        (5, 1), (5, 2), (5, 3),  # '_' or '$' -> letter/digit
    }
    pieces = [s[0]]
    for before, after in zip(s, s[1:]):
        if (state(before), state(after)) in split_pairs:
            pieces.append(" ")
        pieces.append(after)
    return "".join(pieces)

def white_space_tokenize(s):
    """Replace whitespace in ``s`` with placeholder tokens padded by spaces.

    First every whitespace run listed in WHITESPACE_DICT is swapped for its
    placeholder (in the table's order), then every placeholder is surrounded
    with a single space on each side.
    """
    for whitespace, placeholder in WHITESPACE_DICT.items():
        s = s.replace(whitespace, placeholder)
    for placeholder in REVERSE_WHITESPACE_DICT:
        padded = " " + placeholder + " "
        s = s.replace(placeholder, padded)
    return s

def extreme_tokenization(comment):
    """Tokenize a piece of text/code into a flat list of ASCII tokens.

    Pipeline: whitespace placeholders -> sub-word spacing -> TweetTokenizer
    -> WordPunctTokenizer -> MWE re-merging -> removal of every non-ASCII
    character -> split on whitespace.
    """
    prepared = space_up(white_space_tokenize(comment))
    tweet_tokens = tweet_tokenizer.tokenize(prepared)
    punct_tokens = word_punct_tokenizer.tokenize(' '.join(tweet_tokens))
    merged_text = ' '.join(mwe_tokenizer.tokenize(punct_tokens))
    # Strip anything outside the 7-bit ASCII range before the final split.
    ascii_text = re.sub(r'[^\x00-\x7f]', r'', merged_text)
    return ascii_text.split()

def extreme_detokenization(tokens):
    """Concatenate ``tokens`` back into a string.

    Whitespace placeholder tokens are turned back into the whitespace they
    stand for; every other token is appended verbatim with no separator.
    """
    return "".join(REVERSE_WHITESPACE_DICT.get(token, token) for token in tokens)


# main function for data extraction from java files with additional information 

In [2]:
from os import path
import json
import re
import subprocess
import nltk
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
nltk.download('punkt')

def getJsonData(JsonFile):
    """Parse the UTF-8 encoded JSON file at ``JsonFile`` and return its content."""
    with open(JsonFile, encoding="utf8") as json_handle:
        return json.load(json_handle)
def read(path):
    """Return the full text of the UTF-8 encoded file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, instead of being left for the garbage collector.
    """
    with open(path, 'r', encoding='utf-8') as text_file:
        return text_file.read()


# Root directory of the Java sources; files are looked up as
# BASE + <file_name> + '/' + <patch number> + '.java'.
BASE="E:/codes/"
# All review-comment records, loaded once at module level.
unique_data = getJsonData('unique_data_with_date_and_index.json')
# Load a single record (index 1020) and the text of its base-patch file.
# NOTE(review): these module-level names are not read again in the visible
# code (process() builds its own locals) -- looks like a manual spot check.
data = unique_data[1020]
code_file_name = data['file_name']
base_patch_number = data['base_patch_number']
main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
main_code = read(main_code_path)

# Runs the jar that localizes the focus: it reports whether the change is an insert, delete or update, and returns the source and target with the special focus tokens added
def get_source_target(file1, file2, line_number, change_window_size=5):
    """Run ``ChangedLine_status.jar`` on two files and split its output.

    The jar is invoked with the two file paths, the line number and the
    change window size. Its stdout is decoded as UTF-8 and must consist of
    exactly three parts separated by ``<|sep|>``.

    Returns:
        The tuple ``(status, source, target)``.

    Raises:
        subprocess.CalledProcessError: if the java process exits non-zero.
        ValueError: if the output does not split into exactly three parts.
    """
    command = [
        'java', '-jar', 'ChangedLine_status.jar',
        file1, file2, str(line_number), str(change_window_size),
    ]
    raw_output = subprocess.check_output(command)
    status, source, target = raw_output.decode("utf-8").split("<|sep|>")
    return status, source, target

# for getting the end of function scope
def end_scope(start_index, lines):
    """Find the line on which the brace scope opened at ``start_index`` closes.

    Scans ``lines`` from ``start_index`` character by character, keeping a
    running balance of ``{`` (+1) and ``}`` (-1).

    Returns:
        The index of the first line on which the balance is zero after at
        least one brace has been seen, or -1 if that never happens.
    """
    depth = 0
    seen_brace = False
    for line_index in range(start_index, len(lines)):
        for ch in lines[line_index]:
            if ch == '{':
                depth += 1
                seen_brace = True
            elif ch == '}':
                depth -= 1
                seen_brace = True
            # Balanced again (and at least one brace was seen): scope ends here.
            if seen_brace and depth == 0:
                return line_index
    return -1

# Accumulates the records built by process(); dumped to JSON at the end.
unique_data_processed = []
# Upper bound on the number of tokens in a code snippet.
MAX_NUM_TOKEN = 400
# Token budget for the lines above the commented line (half of the total).
UP_TOKEN = MAX_NUM_TOKEN//2
# NOTE(review): c, ec and errors are not updated anywhere in the visible
# code; they look like leftovers from an earlier loop-based version.
c = 0
ec = 0
errors = []
#for idx, data in enumerate(unique_data):#[2428:2429]#[1000:1001]
# for creating the metadata for future training 
def process(data):    
    """Build one training record from a review-comment entry.

    Reads the base and changed ``.java`` files of ``data``, asks
    ``get_source_target`` for the change status and the source/target text,
    collects class names, method declaration lines and class-level variable
    names from the source, picks a code snippet around
    ``data['line_number']`` and tokenizes the snippet, the target and the
    review message.

    Args:
        data: dict providing the keys 'file_name', 'base_patch_number',
            'changed_patch_number', 'line_number', 'message', 'comment_id',
            'global_index', 'line_change', 'written_on' and 'project_name'.

    Returns:
        None. The record is appended to the module-level list
        ``unique_data_processed`` only when the snippet contains both
        ``<|startfocus|>`` and ``<|endfocus|>``. The function returns early,
        recording nothing, when ``get_source_target`` raises.
    """
    code_file_name = data['file_name']
    base_patch_number = data['base_patch_number']
    changed_patch_number = data['changed_patch_number']
    main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
    changed_code_path = BASE+code_file_name+'/'+str(changed_patch_number)+'.java'
    line_no = data['line_number']
    # Both files are read here; a missing file raises before the try below.
    main_code = read(main_code_path)
    changed_code = read(changed_code_path)
    sample = main_code
    message = data['message']
    comment_id = data['comment_id']
    source = ""
    target=''
    # NOTE(review): the bare except silently skips the record on ANY failure
    # of the external jar or of unpacking its output (it would also swallow
    # KeyboardInterrupt).
    try:
        status, source, target = get_source_target(main_code_path, changed_code_path,line_no)
    except:
        return
    
    # From here on the analysis works on the source text returned by the jar,
    # not on the raw base file.
    sample = source
    data_dic = {}
    data_dic['status'] = status
    data_dic['message'] = message
    
    data_dic['comment_id'] = comment_id
    # Drops the last two characters of target (presumably trailing
    # separator/newline characters emitted by the jar -- TODO confirm).
    data_dic['target'] = target[:-2]
    #data_dic['source'] = source
    #data_dic['idx'] = c
    func_list = []
    data_dic['code_snippet'] = ""
    data_dic['prime_var_dic'] = {}
    #print(sample)
    # Each match is a 3-tuple: (optional "public ", "class", class name).
    classes = re.findall(r"(?:(public\s))?(class)\s([^\n\s]*)", sample)
    lines = sample.split('\n')
    class_list = []
    # Line range of the method that contains line_no; the defaults mean
    # "no enclosing method found" (-1) and "end of the sample".
    sp_start_func= -1
    sp_end_func= len(sample.split('\n'))   
    for cls_tpl in classes:
        s = "".join(cls_tpl[-1])
        s = s.strip()
        class_list.append(s) ## class list
        cls = s
        #print(s)
        # First line that contains the class name as a substring; stays -1
        # when no line matches.
        start_index = -1
        for i in range(len(lines)):
            if cls in lines[i]:
                start_index = i
                break
        #print(start_index)
        end_index = end_scope(start_index, lines)
        #print(end_index)
        # True for every line of the class body; lines that belong to a
        # method are switched to False further down.
        class_scope_dic = {}
        for i in range(start_index, end_index+1):
            class_scope_dic[i] = True
        # Groups: (modifier or whitespace, "type name", name, what follows the
        # parameter list).
        funcs = re.findall(r"(public|protected|private|static|\s) +([\w\<\>\[\]]+\s+(\w+)) *\([^\)]*\) *(\{?|[^;])", "\n".join(lines[start_index:end_index+1]))
        func_scopes = []
        #print(funcs)
        #continue
        visited_func = set()
        for func in funcs:
            #print(func)
            # Keep only matches whose parameter list is followed by "{".
            if(func[-1]!= "{"):
                continue
            fc = list(func)
            # Second group: the "type name" text of the declaration.
            fc = fc[-3]
            s = "".join(fc)
            s = s.strip()
            #func_list.append(s) ## function list
            #print(func_list)
            
            # Every line of the class that contains the "type name" text as a
            # substring is treated as the start of that method.
            for index in range(start_index, end_index+1, 1):
                if s in lines[index]:
                    start_func = index
                    end_func = end_scope(index, lines)
                    
                    func_scopes.append((start_func,end_func))
                    if s not in visited_func:
                        visited_func.add(s)
                        func_list.append(lines[index].replace("{", "").strip())

                    # Remember the method whose line range covers line_no.
                    # NOTE(review): assumes line_no indexes into the lines of
                    # the jar's source output -- confirm.
                    if(start_func<=line_no<=end_func):
                        special_func = "\n".join(lines[start_func:end_func+1])
                        #data_dic['code_snippet'] = special_func
                        #nonlocal sp_start_func
                        sp_start_func= start_func
                        #nonlocal sp_end_func
                        sp_end_func= end_func

        # Lines inside any method are not class-level lines.
        for func_sc in func_scopes:
            for i in range(func_sc[0], func_sc[1]+1):
                class_scope_dic[i] = False
        prime_var_list = []
        dic_vars = {}
        for i in range(start_index, end_index+1):
            if class_scope_dic[i]==True:
                #prime_vars = re.findall(r""""[^"]*"|((?=_[a-z_0-9]|[a-z])[a-z_0-9]+((?=\s*=)))""",lines[i])
                if '(' not in lines[i] and "return" not in lines[i] and "extends" not in lines[i]:
                    # findall yields (word + whitespace, identifier) pairs;
                    # only lines with exactly two such pairs are counted, and
                    # the identifier of the second pair is used as the name.
                    prime_vars = re.findall(r"""(\w+\s+)([a-zA-Z_][a-zA-Z0-9_]*)""", lines[i])
                    if len(prime_vars)==2:
                        if(len(prime_vars[1])==2 and prime_vars[1][1] in dic_vars.keys()):
                            dic_vars[prime_vars[1][1]]+=1
                        elif len(prime_vars[1])==2:
                            dic_vars[prime_vars[1][1]] =1
        #if(len(list(dic_vars.keys()))>0):
            #print(dic_vars)
        # Assigned once per class, so the dict of the last class wins.
        data_dic['prime_var_dic'] = dic_vars
    data_dic['class_list'] = class_list
    data_dic['func_list'] = func_list
    
    #print("specials   ###", sp_start_func, sp_end_func)
    
    # data_dic['code_snippet'] is still "" at this point.
    # NOTE(review): splitted, up_count and down_count are bound only inside
    # this branch but are read unconditionally below.
    if not (0 < len(extreme_tokenization(data_dic['code_snippet']))<MAX_NUM_TOKEN): 
        up_count = 0
        up_done = False
        down_count = 0
        down_done = False
        splitted = sample.split("\n")
        # Grow the window upwards one line at a time while the lines from the
        # window start to line_no stay under UP_TOKEN tokens; the start is
        # clamped to 0 and to sp_start_func.
        while(1):
            #print(max(0, line_no-up_count),line_no+1)
            if len(extreme_tokenization("\n".join(splitted[max(0,sp_start_func, line_no-up_count):line_no+1])))<UP_TOKEN:
                if(up_count==line_no):
                    break
                up_count+=1
            else:
                break
                #print("something is wrong")

        # Grow the window downwards while the whole window stays under
        # MAX_NUM_TOKEN tokens; the end is clamped to sp_end_func and to the
        # number of lines.
        while(1):
            if len(extreme_tokenization("\n".join(splitted[(line_no-up_count):min(line_no+down_count+1,sp_end_func, len(splitted))])))<MAX_NUM_TOKEN:
                if(down_count==len(splitted)):
                    break
                down_count+=1
            else:
                break
    if sp_start_func != -1:
        # line_no fell inside a detected method: the snippet is that method,
        # widened to take in focus-marker lines directly next to it.
        #data_dic['code_snippet'] = "\n".join(splitted[sp_start_func:sp_end_func+1])
        
        # NOTE(review): when sp_start_func is 0 the index -1 reads the last line.
        if("startfocus" in splitted[sp_start_func-1]):
            sp_start_func-=1
        if sp_end_func <=len(splitted)-2 and "endfocus" in splitted[sp_end_func+1]:
            sp_end_func+=2
        if sp_end_func <=len(splitted)-1 and "endfocus" in splitted[sp_end_func]:
            sp_end_func+=1
        #print(" ".join(splitted[sp_start_func:sp_end_func+1]))
        #print(sp_start_func)
        #print(sp_end_func)
        code_snippet = "\n".join(splitted[sp_start_func:sp_end_func+1])
        #print("in function")
        
    else:
        # No enclosing method: use the token-bounded window around line_no,
        # widened the same way for adjacent focus-marker lines.
        #data_dic['code_snippet'] = "\n".join(splitted[(line_no-up_count):(line_no+down_count+1)])
        #print("not in function")
        if("startfocus" in splitted[(line_no-up_count-1)]):
            up_count+=1
        if line_no+down_count <=len(splitted)-1 and "endfocus" in splitted[line_no+down_count]:
            down_count+=1
        code_snippet = "\n".join(splitted[(line_no-up_count):(line_no+down_count+1)])
        #print(line_no-up_count)
        #print(line_no+down_count+1)
        #print(code_snippet)
    #print(code_snippet)
    #print(data_dic['func_list'])
    # If only one of the two focus markers made it into the snippet, add the
    # missing one at the corresponding end.
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" not in code_snippet:
        code_snippet =code_snippet+  "\n<|endfocus|>"
    if "<|startfocus|>" not in code_snippet and "<|endfocus|>" in code_snippet:
        code_snippet = "<|startfocus|>\n"+code_snippet
        
    data_dic['tokenized_code_snippet'] = extreme_tokenization(code_snippet)#data_dic['code_snippet'])
    data_dic['tokenized_target'] = extreme_tokenization(data_dic['target'])
    data_dic['tokenized_comment'] = extreme_tokenization(data_dic['message'])
    data_dic['code_snippet'] = code_snippet
    #print("specials   ###", sp_start_func, sp_end_func)

    
    #print(code_snippet)
    #print("#################")
    #print(data_dic)
    #print(source)
    #data_dic['actual_line_number'] = data['line_number']
    #data_dic['code_snippet'] = data['code_snippet']
    
    # Copy the identifying metadata of the input record into the output.
    data_dic['global_index'] = data['global_index']
    data_dic['base_code_line_number'] = data['line_number']
    data_dic['base_patch_number'] = data['base_patch_number']
    data_dic['changed_patch_number'] = data['changed_patch_number']
    data_dic['code_file_name'] = data['file_name']
    data_dic['comment_id'] = data['comment_id']
    data_dic['message'] = data['message']
    data_dic['line_change'] = data['line_change']
    data_dic['written_on'] = data['written_on']
    data_dic['project_name'] = data['project_name']
    # Only snippets that contain both focus markers are kept.
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" in code_snippet:
        unique_data_processed.append(data_dic)
    # Progress message; it is evaluated on every call, so it also prints when
    # this call appended nothing and the size is a multiple of 100.
    if(len(unique_data_processed)%100==0):
        print("data size = ", len(unique_data_processed))
    '''
    print(idx)
    if(idx%1000==0):
        print(idx)
        print(c)
        pprint(data_dic)
    '''
'''
{'actual_line_number': 71, 'base_code_line_number': 73, 'base_patch_number': 3, 'changed_code': '    /*', 'changed_patch_number': 7, 'code_file_name': 'android_3478', 'comment_id': '3745e284_1e49cdaa', 'line_change': 2, 'message': '{@link android.icu.impl.OlsonTimeZone} ?', 'previous_code': '     /*'}
'''

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


"\n{'actual_line_number': 71, 'base_code_line_number': 73, 'base_patch_number': 3, 'changed_code': '    /*', 'changed_patch_number': 7, 'code_file_name': 'android_3478', 'comment_id': '3745e284_1e49cdaa', 'line_change': 2, 'message': '{@link android.icu.impl.OlsonTimeZone} ?', 'previous_code': '     /*'}\n"

# Helper functions needed to run the data creation in multiple threads

In [3]:
import sys,json
import csv
import _thread
import threading
import time
from time import sleep
import pandas as pd

import json
def getJsonData(JsonFile):
    """Parse the UTF-8 encoded JSON file at ``JsonFile`` and return its content."""
    with open(JsonFile, encoding="utf8") as json_handle:
        return json.load(json_handle)
def read(path):
    """Return the full text of the UTF-8 encoded file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read; this function is called for every record from
    several worker threads, so unclosed handles would pile up.
    """
    with open(path, 'r', encoding='utf-8') as text_file:
        return text_file.read()


# Root directory of the Java sources (same value as in the previous cell).
BASE="E:/codes/"

# Total number of records, and how many of them each worker thread handles.
file_size = len(unique_data)
batch_size = 1000
#
def translate(file_no):
    """Run ``process`` on every record of batch number ``file_no``.

    Batch ``file_no`` covers the record indices
    ``file_no * batch_size`` up to (not including)
    ``min(file_size, (file_no + 1) * batch_size)``.
    """
    first = file_no * batch_size
    stop = min(file_size, (file_no + 1) * batch_size)
    for record_index in range(first, stop):
        process(unique_data[record_index])
### Multithread Run

In [4]:
# Start one worker thread per batch of `batch_size` records, then wait for
# all of them to finish.
thrds = []
for i in range(0,(file_size//batch_size)+1):
    print("processing file = ", str(i))
    td = threading.Thread(target=translate, args=(i,))
    try:
        td.start()
    except RuntimeError:
        # Thread.start() signals "cannot start" with RuntimeError. Catching
        # only that keeps KeyboardInterrupt and genuine programming errors
        # visible instead of hiding them behind a bare except.
        print("Error: unable to start thread")
    else:
        # Only threads that actually started are joined below.
        thrds.append(td)

for td in thrds:
    td.join()

processing file =  0
processing file =  1
processing file =  2
processing file =  3
processing file =  4
processing file =  5
processing file =  6
processing file =  7
processing file =  8
processing file =  9
processing file =  10
processing file =  11
processing file =  12
processing file =  13
processing file =  14
processing file =  15
processing file =  16
processing file =  17
processing file =  18
processing file =  19
processing file =  20
processing file =  21
processing file =  22
processing file =  23
processing file =  24
processing file =  25
processing file =  26
processing file =  27
processing file =  28data size = 
 processing file = 100 29

processing file =  30
processing file =  31
processing file =  32
processing file =  33
processing file =  34
processing file =  35
processing file =  36
processing file =  data size =  37
200
processing file =  38
processing file =  39
processing file =  40
processing file =  41
processing file =  42
processing file =  43
processi

data size =  74700
data size =  74800
data size =  74900
data size =  75000
data size =  75100
data size =  75200
data size =  75300
data size =  75400
data size =  75500
data size =  75600
data size =  75700
data size =  75800
data size =  75900
data size =  76000
data size =  76100
data size =  76200
data size =  76300
data size =  76400
data size =  76500
data size =  76600
data size =  76700
data size =  76800
data size =  76900
data size =  77000
data size =  77100
data size =  77200
data size =  77300
data size =  77400
data size =  77500
data size =  77600
data size =  77700
data size =  77800
data size =  77900
data size =  78000
data size =  78100
data size =  78200
data size =  78300
data size =  78400
data size =  78500
data size =  78600
data size =  78700
data size =  78800
data size =  78900
data size =  79000
data size =  79100
data size =  79200
data size =  79300
data size =  79400
data size =  79500
data size =  79600
data size =  79700
data size =  79800
data size = 

In [5]:
# Persist every record collected by the worker threads as one JSON array.
with open('unique_data_processed_with_Date_multithread_idx_400_feb_03.json', 'w', encoding="utf8") as f:  # writing JSON object
    json.dump(unique_data_processed, f)

In [7]:
# Notebook inspection only: the last expression (the final record) is what
# the cell displays; the len() result on the first line is discarded.
len(unique_data_processed)
unique_data_processed[-1]

{'status': 'unchanged',
 'message': 'how about:\n\nassertEquals(Collections.singletonList("foo=bar", cookieLists);\n\n? That way, the test will also fail if the list (unexpectedly) contains additional elements.',
 'comment_id': 'be0279f5_3c6451b3',
 'target': '\n',
 'code_snippet': '        List<String> cookieList = cookies.values().iterator().next();\n        assertEquals(Collections.singletonList("foo=bar"), cookieList);\n    }\n\n    // http://b/31039416. openJdk cookies implementation is more\n    // strict about the "expires" field parsing and requires\n    // the "GMT" prefix before timezone value.\n    public void testLenientExpiresParsing() throws Exception {\n        CookieManager cm = new CookieManager(createCookieStore(), null);\n\n        URI uri = URI.create("https://test.com");\n        Map<String, List<String>> header = new HashMap<>();\n        List<String> value = new ArrayList<>();\n<|startfocus|>\n\n<|endfocus|>\n        value.add("cookie=1234567890test; domain=.test