## Accessory functions

In [2]:
import json
def getJsonData(JsonFile):
    """Parse the UTF-8 encoded JSON file at ``JsonFile`` and return its contents."""
    with open(JsonFile, encoding="utf8") as handle:
        return json.load(handle)

In [3]:
## Keep only the datapoints whose focus span and target are both shorter than a given threshold

def limit_focus(data, limit):
    """Return the samples of ``data`` whose focus and target are both below ``limit``.

    The focus length is measured as the index distance between the
    '<|startfocus|>' and '<|endfocus|>' markers in
    ``sample['soft_tokenized_code_snippet']``; the target length is
    ``len(sample['soft_tokenized_target'])``.

    Samples that are missing a key or a marker (or whose fields are not
    sequences) are dropped instead of being compared against a stale or
    undefined target length.

    Args:
        data: sequence of sample dicts.
        limit: exclusive upper bound for both lengths.

    Returns:
        A new list holding the retained samples, in their original order.
    """
    data_new = []
    for sample in data:
        try:
            snippet = sample['soft_tokenized_code_snippet']
            focus_len = snippet.index('<|endfocus|>') - snippet.index('<|startfocus|>')
            target_len = len(sample['soft_tokenized_target'])
        except (KeyError, ValueError, TypeError, AttributeError):
            # Malformed sample: there is no measurable focus/target, so skip it.
            continue
        if focus_len < limit and target_len < limit:
            data_new.append(sample)
    return data_new

In [4]:
def dump_dict(data, name):
    """Serialize ``data`` as JSON into the file ``name`` (UTF-8, overwriting it)."""
    with open(name, 'w', encoding="utf8") as sink:
        json.dump(data, fp=sink)

### Soft Detokenization

We create the soft-tokenized code from the hard-tokenized code.

In [4]:
# Whitespace marker tokens; soft_detokenize drops these from its output.
# Despite the name this is a list, not a dict. Presumably the markers encode
# runs of spaces ("-s"), tabs ("-t") and a newline ("nl") -- confirm against the tokenizer.
REVERSE_WHITESPACE_DICT = ['<|16-s|>', '<|12-s|>', '<|8-s|>', '<|4-s|>', '<|2-s|>', '<|s|>', '<|4-t|>', '<|3-t|>', '<|2-t|>', '<|t|>', '<|nl|>']

def state(c):
    """Classify the single character ``c`` into a numeric category.

    Returns:
        1 for ASCII lowercase letters, 2 for ASCII uppercase letters,
        3 for ASCII digits, 4 for whitespace, 5 for '_' or '$',
        6 for any other ASCII character and 7 for everything else.
    """
    code = ord(c)
    if 97 <= code <= 122:  # a-z
        return 1
    if 65 <= code <= 90:  # A-Z
        return 2
    if 48 <= code <= 57:  # 0-9
        return 3
    if c.isspace():
        return 4
    if c in ('_', '$'):
        return 5
    return 6 if code < 128 else 7

def soft_detokenize(tokens):
    """Merge adjacent word-like tokens and drop whitespace markers.

    A token is appended to the previous output token when its first character
    and the previous token's last character are both word-like (state 1, 2, 3
    or 5: letters, digits, '_' or '$') and no whitespace marker from
    REVERSE_WHITESPACE_DICT occurred in between. Whitespace markers themselves
    never appear in the result.

    Args:
        tokens: sequence of non-empty token strings.

    Returns:
        The list of merged tokens.
    """
    word_states = (1, 2, 3, 5)
    # '#' is a non-word sentinel so the first real token always has a
    # predecessor to compare against; it is stripped before returning.
    merged = ['#']
    after_whitespace = False
    for token in tokens:
        if (state(token[0]) in word_states
                and state(merged[-1][-1]) in word_states
                and not after_whitespace):
            merged[-1] += token
            continue
        # Remember whether this token was a whitespace marker; if it was,
        # it is skipped and blocks merging for the next token.
        after_whitespace = token in REVERSE_WHITESPACE_DICT
        if not after_whitespace:
            merged.append(token)

    return merged[1:]

## Load Dataset

In [7]:
# Load the train and test splits from their JSON files.
train = getJsonData('jsons/train_soft.json')
test = getJsonData('jsons/test_soft.json')

### Renaming Mild to Soft
Renaming the `mild_tokenized_*` keys of every sample to `soft_tokenized_*`.

In [13]:
# Move each 'mild_tokenized_*' entry of every training sample to the matching
# 'soft_tokenized_*' key.
for sample in train:
    sample['soft_tokenized_code_snippet'] = sample['mild_tokenized_code_snippet']
    sample['soft_tokenized_comment'] = sample['mild_tokenized_comment']
    sample['soft_tokenized_target'] = sample['mild_tokenized_target']
    for old_key in ('mild_tokenized_code_snippet',
                    'mild_tokenized_comment',
                    'mild_tokenized_target'):
        del sample[old_key]

In [14]:
# Move each 'mild_tokenized_*' entry of every test sample to the matching
# 'soft_tokenized_*' key.
for sample in test:
    sample['soft_tokenized_code_snippet'] = sample['mild_tokenized_code_snippet']
    sample['soft_tokenized_comment'] = sample['mild_tokenized_comment']
    sample['soft_tokenized_target'] = sample['mild_tokenized_target']
    for old_key in ('mild_tokenized_code_snippet',
                    'mild_tokenized_comment',
                    'mild_tokenized_target'):
        del sample[old_key]

### Add Soft Tokenized

In [None]:
# Derive the soft-tokenized fields of every training sample from its
# 'tokenized_*' fields.
for sample in train:
    for soft_key, hard_key in (
        ('soft_tokenized_code_snippet', 'tokenized_code_snippet'),
        ('soft_tokenized_comment', 'tokenized_comment'),
        ('soft_tokenized_target', 'tokenized_target'),
    ):
        sample[soft_key] = soft_detokenize(sample[hard_key])

In [None]:
# Derive the soft-tokenized fields of every test sample from its
# 'tokenized_*' fields.
for sample in test:
    for soft_key, hard_key in (
        ('soft_tokenized_code_snippet', 'tokenized_code_snippet'),
        ('soft_tokenized_comment', 'tokenized_comment'),
        ('soft_tokenized_target', 'tokenized_target'),
    ):
        sample[soft_key] = soft_detokenize(sample[hard_key])

In [None]:
# Persist both splits, including the soft-tokenized fields, as JSON.
dump_dict(train, "jsons/train.json")
dump_dict(test, "jsons/test.json")

### Limit Focus and Target to 100 Tokens

In [15]:
# Keep only samples whose focus and target are both below 100 tokens.
train_100 = limit_focus(train, 100)
test_100 = limit_focus(test, 100)

In [17]:
# Save the length-limited splits so later sections can reload them.
dump_dict(train_100, "jsons/train_soft_100.json")
dump_dict(test_100, "jsons/test_soft_100.json")

# Creating Vocab

In [5]:
# Reload the length-limited splits from disk.
train_100 = getJsonData('jsons/train_soft_100.json')
test_100 = getJsonData('jsons/test_soft_100.json')

In [10]:
# Token -> occurrence count, filled by the counting loop over train_100.
# NOTE: re-run this cell before re-running the counting loop, otherwise counts accumulate.
code_freq = {}
comment_freq = {}

In [18]:
# Count how often each token occurs in the training split: code tokens
# (snippet + target) go into code_freq, comment tokens into comment_freq.
for sample in train_100:
    snippet_toks = sample['soft_tokenized_code_snippet']
    target_toks = sample['soft_tokenized_target']
    comment_toks = sample['soft_tokenized_comment']

    for tok in snippet_toks + target_toks:
        code_freq[tok] = code_freq.get(tok, 0) + 1

    for tok in comment_toks:
        comment_freq[tok] = comment_freq.get(tok, 0) + 1

In [19]:
# Code tokens ordered from most to least frequent (ties keep insertion order).
sorted_code_freq = dict(sorted(code_freq.items(), key=lambda pair: pair[1], reverse=True))

In [20]:
# Comment tokens ordered from most to least frequent (ties keep insertion order).
sorted_comment_freq = dict(sorted(comment_freq.items(), key=lambda pair: pair[1], reverse=True))

**Dump the file for future reference**

In [21]:
# Save the frequency-sorted code token counts.
dump_dict(sorted_code_freq, 'jsons/sorted_code_freq_soft.json')

In [22]:
# Save the frequency-sorted comment token counts.
dump_dict(sorted_comment_freq, 'jsons/sorted_comment_freq_soft.json')

**Load from dumped data**

In [8]:
# Reload the frequency tables written by the dump cells above.
sorted_code_freq = getJsonData('jsons/sorted_code_freq_soft.json')
sorted_comment_freq = getJsonData('jsons/sorted_comment_freq_soft.json')

**Saving vocab text file**

In [None]:
!mkdir vocab

In [125]:
# Open the two vocabulary files for writing; the next cell fills and closes them.
target_out = open("vocab/target_vocab_soft_2000.txt", "w", encoding='utf-8')
source_out = open("vocab/source_vocab_soft_8000.txt", "w", encoding='utf-8')

In [126]:
## Target vocab: the `num_code_token` most frequent code tokens.
## Source vocab: those same code tokens, topped up with comment tokens
## (most frequent first, duplicates skipped) until it holds `num_total_token` entries.

num_code_token = 2000
num_total_token = 10000

# Relies on sorted_code_freq / sorted_comment_freq iterating most-frequent first.
top_code_tokens = list(sorted_code_freq)[:num_code_token]

# Using the file as a context manager closes it even if a write fails.
with target_out:
    target_out.writelines(tok + "\n" for tok in top_code_tokens)

src_vocab = list(top_code_tokens)
seen = set(src_vocab)  # O(1) membership instead of scanning the list for every token
for tok in sorted_comment_freq:
    if len(src_vocab) >= num_total_token:
        break  # vocabulary is full; no need to look at rarer tokens
    if tok not in seen:
        seen.add(tok)
        src_vocab.append(tok)

with source_out:
    source_out.writelines(tok + "\n" for tok in src_vocab)

# Creating Training Data

In [2]:
# Output directories for the training text files: c/ (code only) and cc/ (code + comment).
!mkdir -p training_data
!mkdir -p training_data/c
!mkdir -p training_data/cc

In [1]:
# Reload the length-limited splits. Requires the cell defining getJsonData to have run first.
train_100 = getJsonData('jsons/train_soft_100.json')
test_100 = getJsonData('jsons/test_soft_100.json')

NameError: name 'getJsonData' is not defined

## Code+Comment

Write `train_100` and `test_100` out as OpenNMT source/target text files: first the code+comment (CC) variant, then the code-only (C) variant.

In [43]:
# Open the code+comment output files. The encoding is pinned to UTF-8 (matching the
# JSON and vocab files) so non-ASCII tokens do not depend on the platform default.
# The writer cells below close these handles.
src_train = open("training_data/cc/soft-src-train.txt", 'w', encoding='utf-8')
src_test = open("training_data/cc/soft-src-test.txt", 'w', encoding='utf-8')
tgt_train = open("training_data/cc/soft-tgt-train.txt", 'w', encoding='utf-8')
tgt_test = open("training_data/cc/soft-tgt-test.txt", 'w', encoding='utf-8')

In [None]:
# One line per training sample: the source line is the comment (at most 200
# tokens, wrapped in start/end markers) followed by the code snippet; the
# target line is the target tokens.
for sample in train_100:
    comment_part = '<|startcomment|> ' + ' '.join(sample['soft_tokenized_comment'][:200]) + ' <|endcomment|> '
    code_part = ' '.join(sample['soft_tokenized_code_snippet']) + '\n'
    target_line = ' '.join(sample['soft_tokenized_target']) + '\n'
    src_train.write(comment_part + code_part)
    tgt_train.write(target_line)
src_train.close()
tgt_train.close()

In [44]:
# One line per test sample: the source line is the comment (at most 200
# tokens, wrapped in start/end markers) followed by the code snippet; the
# target line is the target tokens.
for sample in test_100:
    comment_part = '<|startcomment|> ' + ' '.join(sample['soft_tokenized_comment'][:200]) + ' <|endcomment|> '
    code_part = ' '.join(sample['soft_tokenized_code_snippet']) + '\n'
    target_line = ' '.join(sample['soft_tokenized_target']) + '\n'
    src_test.write(comment_part + code_part)
    tgt_test.write(target_line)
src_test.close()
tgt_test.close()

## Dataset With Code only

In [38]:
# Open the code-only output files. The encoding is pinned to UTF-8 (matching the
# JSON and vocab files) so non-ASCII tokens do not depend on the platform default.
# The writer cells below close these handles.
src_train = open("training_data/c/soft-src-train.txt", 'w', encoding='utf-8')
src_test = open("training_data/c/soft-src-test.txt", 'w', encoding='utf-8')
tgt_train = open("training_data/c/soft-tgt-train.txt", 'w', encoding='utf-8')
tgt_test = open("training_data/c/soft-tgt-test.txt", 'w', encoding='utf-8')

In [None]:
# Code-only variant: the source line is just the code snippet (no comment prefix).
for sample in train_100:
    source_text = ' '.join(sample['soft_tokenized_code_snippet'])
    target_text = ' '.join(sample['soft_tokenized_target'])
    print(source_text, file=src_train)
    print(target_text, file=tgt_train)
src_train.close()
tgt_train.close()

In [39]:
# Code-only variant: the source line is just the code snippet (no comment prefix).
for sample in test_100:
    source_text = ' '.join(sample['soft_tokenized_code_snippet'])
    target_text = ' '.join(sample['soft_tokenized_target'])
    print(source_text, file=src_test)
    print(target_text, file=tgt_test)
src_test.close()
tgt_test.close()

# Removing Same Target and Focus

In [6]:
# Number of samples in the length-limited test split.
len(test_100)

2961

In [22]:
# Inspect which fields a sample carries.
test_100[0].keys()

dict_keys(['status', 'message', 'comment_id', 'target', 'code_snippet', 'prime_var_dic', 'class_list', 'func_list', 'tokenized_code_snippet', 'tokenized_target', 'tokenized_comment', 'global_index', 'base_code_line_number', 'base_patch_number', 'changed_patch_number', 'code_file_name', 'line_change', 'written_on', 'project_name', 'int_date', 'soft_tokenized_code_snippet', 'soft_tokenized_comment', 'soft_tokenized_target'])

In [18]:
# Index of the focus-start marker in the first test sample's snippet.
start = test_100[0]['soft_tokenized_code_snippet'].index('<|startfocus|>')

In [19]:
# Index of the focus-end marker in the first test sample's snippet.
end = test_100[0]['soft_tokenized_code_snippet'].index('<|endfocus|>')

In [24]:
# Tokens strictly between the two focus markers (markers excluded).
focus = test_100[0]['soft_tokenized_code_snippet'][start+1:end]

In [25]:
# Target token list of the same sample.
target = test_100[0]['soft_tokenized_target']

In [28]:
# Is the focus region already identical to the target for this sample?
focus == target

False

In [49]:
## Drop the datapoints whose focus span is already identical to the target

def remove_soft_unchanged(data):
    """Return the samples of ``data`` whose focus tokens differ from the target.

    The focus is the run of tokens strictly between '<|startfocus|>' and
    '<|endfocus|>' in ``sample['soft_tokenized_code_snippet']``; it is compared
    with ``sample['soft_tokenized_target']``.

    Samples whose focus cannot be extracted (missing key or marker) are
    reported with "oops" and kept, since they cannot be shown to be unchanged.

    Args:
        data: sequence of sample dicts.

    Returns:
        A new list with the retained samples, in their original order.
    """
    data_new = []
    for sample in data:
        try:
            # Read from the argument itself, not from a notebook-level global.
            snippet = sample['soft_tokenized_code_snippet']
            start = snippet.index('<|startfocus|>')
            end = snippet.index('<|endfocus|>')
            focus = snippet[start+1:end]
            target = sample['soft_tokenized_target']
        except (KeyError, ValueError, TypeError, AttributeError):
            print("oops")
            data_new.append(sample)
            continue

        if focus != target:
            data_new.append(sample)
    return data_new

In [50]:
# Apply remove_soft_unchanged to the length-limited test split.
test_unchaged_removed_soft = remove_soft_unchanged(test_100)

In [51]:
# Number of test samples left after the filter.
len(test_unchaged_removed_soft)

2722

In [52]:
# Collect the 'global_index' of every retained test sample.
s = [sample['global_index'] for sample in test_unchaged_removed_soft]

In [54]:
# Print all collected global_index values (one long list).
print(s)

[2381, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2489, 2562, 2575, 2576, 2577, 2620, 2621, 2640, 2617, 2618, 2619, 2643, 2644, 2645, 2648, 2342, 2343, 2393, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 2660, 2662, 2664, 2665, 2666, 2668, 2688, 2689, 1482, 2582, 2583, 2635, 2087, 1484, 1520, 2683, 2684, 2685, 2686, 1531, 2486, 2487, 2767, 2773, 2774, 2775, 2776, 2777, 2778, 2781, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 1496, 1522, 1523, 1524, 1525, 1527, 1528, 1529, 1530, 2670, 2671, 2672, 2673, 2757, 2759, 2760, 2761, 2762, 2763, 2779, 2780, 2783, 2784, 2807, 2808, 2817, 1546, 2623, 2692, 2693, 2694, 2805, 2806, 2818, 2819, 2820, 2821, 2822, 2824, 2825, 2826, 2827, 2044, 2584, 2691, 2772, 2803, 2811, 2814, 2830, 2834, 2870, 2872, 2465, 2473, 2474, 2475, 2476, 2513, 2604, 2770, 2836, 2869, 2464, 2471, 2524, 1539, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2873, 2875, 1498, 1499, 1502, 1503, 1504, 1514, 1515, 151