## Accessory functions

In [1]:
import json
def getJsonData(JsonFile):
    """Parse the JSON document stored at path ``JsonFile`` and return the result.

    The file is opened as UTF-8 text and is closed again before returning.
    """
    with open(JsonFile, encoding="utf8") as handle:
        return json.loads(handle.read())

In [2]:
## Keep only the data points whose focus span and target each have fewer tokens than the given threshold

def limit_focus(data, limit):
    """Keep the records whose focus span and target are both shorter than ``limit``.

    The focus length is the distance between the positions of the
    ``'<|startfocus|>'`` and ``'<|endfocus|>'`` markers inside
    ``record['tokenized_code_snippet']``; the target length is
    ``len(record['tokenized_target'])``.

    Args:
        data: sequence of dict-like records.
        limit: exclusive upper bound applied to both lengths.

    Returns:
        A new list holding the accepted records (the same objects, in their
        original order). Records that lack a required key or a focus marker
        are dropped.
    """
    data_new = []
    for record in data:
        try:
            snippet = record['tokenized_code_snippet']
            focus_len = snippet.index('<|endfocus|>') - snippet.index('<|startfocus|>')
            target_len = len(record['tokenized_target'])
        except (KeyError, ValueError, TypeError, AttributeError):
            # Malformed record (missing key/marker or a non-sequence field):
            # skip it outright instead of falling through with an undefined or
            # stale target_len from a previous iteration.
            continue
        if focus_len < limit and target_len < limit:
            data_new.append(record)
    return data_new

In [3]:
def dump_dict(data, name):
    """Serialize ``data`` as JSON into the file at path ``name``.

    The file is opened for writing as UTF-8 text, so any existing content is
    replaced.
    """
    with open(name, mode='w', encoding="utf8") as sink:
        json.dump(data, sink)

## Load Dataset

In [5]:
# Load the raw per-project train and test splits.
train = getJsonData(JsonFile='jsons/train_by_project.json')
test = getJsonData(JsonFile='jsons/test_by_project.json')

In [6]:
# Show the field names of a record, using the first test example.
for field_name in test[0]:
    print(field_name)

status
message
comment_id
target
code_snippet
prime_var_dic
class_list
func_list
tokenized_code_snippet
tokenized_target
tokenized_comment
global_index
base_code_line_number
base_patch_number
changed_patch_number
code_file_name
line_change
written_on
project_name
int_date


### Limit Focus and Target to 100 Tokens

In [7]:
# Filter both splits with a token limit of 100 on focus span and target.
train_100 = limit_focus(data=train, limit=100)
test_100 = limit_focus(data=test, limit=100)

In [8]:
# Persist the filtered splits so that later sections can reload them from disk.
for split, out_path in (
    (train_100, "jsons/train_100.json"),
    (test_100, "jsons/test_100.json"),
):
    dump_dict(split, out_path)

# Creating Vocab

In [None]:
# Reload the filtered splits from disk before building the vocabulary.
train_100 = getJsonData(JsonFile='jsons/train_100.json')
test_100 = getJsonData(JsonFile='jsons/test_100.json')

In [44]:
# token -> occurrence count, kept separately for code tokens and comment tokens
code_freq, comment_freq = {}, {}

In [45]:
# Accumulate token counts over the filtered training split.
for example in train_100:
    # Code counts cover the input snippet followed by the target.
    code_tokens = example['tokenized_code_snippet'] + example['tokenized_target']
    comment_tokens = example['tokenized_comment']

    for token in code_tokens:
        code_freq[token] = code_freq.get(token, 0) + 1

    for token in comment_tokens:
        comment_freq[token] = comment_freq.get(token, 0) + 1

In [46]:
# Code tokens ordered from most to least frequent (dicts preserve insertion order).
sorted_code_freq = dict(
    sorted(code_freq.items(), key=lambda pair: pair[1], reverse=True)
)

In [48]:
# Comment tokens ordered from most to least frequent (dicts preserve insertion order).
sorted_comment_freq = dict(
    sorted(comment_freq.items(), key=lambda pair: pair[1], reverse=True)
)

**Dump the file for future reference**

In [52]:
# Save the frequency-ordered code token table.
dump_dict(data=sorted_code_freq, name='jsons/sorted_code_freq.json')

In [53]:
# Save the frequency-ordered comment token table.
dump_dict(data=sorted_comment_freq, name='jsons/sorted_comment_freq.json')

**Load from dumped data**

In [4]:
# Reload the frequency tables using the same file names that the dump cells
# write ('jsons/sorted_code_freq.json' and 'jsons/sorted_comment_freq.json'),
# so that a fresh Restart & Run All finds the files it just produced.
sorted_code_freq = getJsonData('jsons/sorted_code_freq.json')
sorted_comment_freq = getJsonData('jsons/sorted_comment_freq.json')

**Saving vocab text file**

In [6]:
!mkdir vocab/c/

In [7]:
# Handle for the target-side vocabulary file; it is written and closed in the next cell.
target_out = open("vocab/c/target_vocab_10000.txt", mode="w", encoding='utf-8')
#source_out = open("vocab/source_vocab_8000.txt", "w", encoding='utf-8')

In [8]:
## Take the first 'num_code_token' from the codes.
num_code_token = 2000
num_total_token = 10000

# One token per line, most frequent first; the handle is closed once everything is written.
top_code_tokens = list(sorted_code_freq)[:num_code_token]
target_out.writelines(token + "\n" for token in top_code_tokens)
target_out.close()

In [None]:
# Source vocabulary: the top `num_code_token` code tokens, then comment tokens
# in frequency order (skipping duplicates) until `num_total_token` entries exist.
src_vocab = list(sorted_code_freq)[:num_code_token]
seen_tokens = set(src_vocab)  # constant-time duplicate check instead of scanning the list
for tok in sorted_comment_freq:
    if len(src_vocab) >= num_total_token:
        break
    if tok not in seen_tokens:
        seen_tokens.add(tok)
        src_vocab.append(tok)

# The output handle is opened here (it was never created anywhere else), and
# `with` guarantees it is closed even if a write fails.
# NOTE(review): the output path is an assumption — confirm the file name and
# directory expected by the training configuration.
with open("vocab/source_vocab_8000.txt", "w", encoding='utf-8') as source_out:
    for x in src_vocab:
        source_out.write(x)
        source_out.write("\n")

# Creating Training Data

In [None]:
!mkdir -p training_data
!mkdir -p training_data/c
!mkdir -p training_data/cc

In [None]:
# Reload the filtered splits from disk before writing the training files.
train_100, test_100 = [
    getJsonData(split_path)
    for split_path in ('jsons/train_100.json', 'jsons/test_100.json')
]

## Code+Comment

Write the filtered `train_100` and `test_100` data as OpenNMT training files: first the code+comment (CC) variant, then the code-only (C) variant.

In [43]:
# Output files for the code+comment (cc) variant. The encoding is pinned to
# UTF-8 so the written text does not depend on the platform default; the JSON
# inputs are read as UTF-8 as well.
src_train = open("training_data/cc/src-train.txt", 'w', encoding='utf-8')
src_test = open("training_data/cc/src-test.txt", 'w', encoding='utf-8')
tgt_train = open("training_data/cc/tgt-train.txt", 'w', encoding='utf-8')
tgt_test = open("training_data/cc/tgt-test.txt", 'w', encoding='utf-8')

In [None]:
# One line per training example: the source line is the comment (at most 200
# tokens, wrapped in comment markers) followed by the code snippet; the target
# line is the target tokens.
for example in train_100:
    comment_text = ' '.join(example['tokenized_comment'][:200])
    source_line = ''.join([
        '<|startcomment|> ',
        comment_text,
        ' <|endcomment|> ',
        ' '.join(example['tokenized_code_snippet']),
        '\n',
    ])
    target_line = ' '.join(example['tokenized_target']) + '\n'
    src_train.write(source_line)
    tgt_train.write(target_line)
src_train.close()
tgt_train.close()

In [44]:
# Test split, written in the same layout as the training split: comment
# (at most 200 tokens, wrapped in comment markers) followed by the code snippet.
for x in test_100:
    comment = '<|startcomment|> ' + ' '.join(x['tokenized_comment'][:200]) + ' <|endcomment|> '
    # The records expose 'tokenized_code_snippet' (there is no 'tokenized_code'
    # field), which is also the key the training loop reads.
    code_snippet = ' '.join(x['tokenized_code_snippet']) + '\n'
    target = ' '.join(x['tokenized_target']) + '\n'
    src_test.write(comment)
    src_test.write(code_snippet)
    tgt_test.write(target)
src_test.close()
tgt_test.close()

## Code only

In [38]:
# Output files for the code-only (c) variant. The encoding is pinned to UTF-8
# so the written text does not depend on the platform default; the JSON inputs
# are read as UTF-8 as well.
src_train = open("training_data/c/src-train.txt", 'w', encoding='utf-8')
src_test = open("training_data/c/src-test.txt", 'w', encoding='utf-8')
tgt_train = open("training_data/c/tgt-train.txt", 'w', encoding='utf-8')
tgt_test = open("training_data/c/tgt-test.txt", 'w', encoding='utf-8')

In [None]:
# Code-only variant: the source line is just the code snippet (no comment prefix).
for example in train_100:
    source_line = ' '.join(example['tokenized_code_snippet']) + '\n'
    target_line = ' '.join(example['tokenized_target']) + '\n'
    src_train.write(source_line)
    tgt_train.write(target_line)
src_train.close()
tgt_train.close()

In [39]:
# Code-only variant for the test split: snippet tokens on the source side,
# target tokens on the target side, one example per line.
for record in test_100:
    source_tokens = record['tokenized_code_snippet']
    target_tokens = record['tokenized_target']
    source_line, target_line = ' '.join(source_tokens) + '\n', ' '.join(target_tokens) + '\n'
    src_test.write(source_line)
    tgt_test.write(target_line)
src_test.close()
tgt_test.close()