In [44]:
import utils
import lizard
from diff_match_patch import diff_match_patch
import json
import os
from difflib import SequenceMatcher
import random

In [3]:
def create_deletions(modified_func):
    """Record every completely deleted line in ``line_changes['deleted']``.

    Each entry of ``modified_func.char_changes['deleted']`` is a dict whose
    ``char_start`` / ``char_end`` keys are offsets into
    ``modified_func.func_src_before`` (``char_end`` is treated as inclusive).
    The deleted text is split into lines, and every non-empty piece that
    equals its whole source line (ignoring surrounding whitespace) is
    appended to ``modified_func.line_changes['deleted']`` as a dict with:

    * ``line_no``    - 1-indexed line number in the before-source,
    * ``char_start`` - offset of the first character of that line,
    * ``char_end``   - ``char_start`` plus the length of the line,
    * ``line``       - the source line itself (without the newline).

    Returns ``None``; ``modified_func`` is mutated in place.
    """
    before_code = modified_func.func_src_before
    before_code_lines = before_code.split('\n')

    # Offset of the first character of every source line. Computing this
    # from the line lengths (rather than from the length of a '\n'.join)
    # gives offset 0 for the first line.
    line_starts = []
    running_offset = 0
    for line in before_code_lines:
        line_starts.append(running_offset)
        running_offset += len(line) + 1  # +1 for the '\n' removed by split

    for change in modified_func.char_changes['deleted']:
        # count how many lines come before the change, then add 1 to get
        # the (1-indexed) line on which the change starts
        start_line_num = before_code[:change['char_start']].count('\n') + 1

        # retrieve the changed code and split it into one piece per line
        changed_code = before_code[change['char_start']:change['char_end'] + 1].split('\n')

        # Empty pieces are skipped *while* enumerating, not filtered out
        # beforehand: the position of a piece in the unfiltered list is its
        # line offset from start_line_num, so removing blanks first would
        # shift every following piece onto the wrong source line.
        for line_offset, changed_line in enumerate(changed_code):
            if not changed_line:
                continue

            curr_line_num = start_line_num + line_offset
            src_code_line = before_code_lines[curr_line_num - 1]

            # the deleted characters cover the entire source line, so this
            # counts as a complete line deletion
            if changed_line.strip() == src_code_line.strip():
                start_index = line_starts[curr_line_num - 1]
                end_index = start_index + len(src_code_line)

                deletion = {"line_no": curr_line_num, "char_start": start_index, "char_end": end_index, "line": src_code_line}
                modified_func.line_changes['deleted'].append(deletion)

                

In [4]:
def create_additions(modified_func):
    """Record every completely added line in ``line_changes['added']``.

    Each entry of ``modified_func.char_changes['added']`` is a dict whose
    ``char_start`` / ``char_end`` keys are offsets into
    ``modified_func.func_src_after`` (``char_end`` is treated as inclusive).
    The added text is split into lines, and every non-empty piece that
    equals its whole source line (ignoring surrounding whitespace) is
    appended to ``modified_func.line_changes['added']`` as a dict with:

    * ``line_no``    - 1-indexed line number in the after-source,
    * ``char_start`` - offset of the first character of that line,
    * ``char_end``   - ``char_start`` plus the length of the line,
    * ``line``       - the source line itself (without the newline).

    Returns ``None``; ``modified_func`` is mutated in place.
    """
    after_code = modified_func.func_src_after
    after_code_lines = after_code.split('\n')

    # Offset of the first character of every source line. Computing this
    # from the line lengths (rather than from the length of a '\n'.join)
    # gives offset 0 for the first line.
    line_starts = []
    running_offset = 0
    for line in after_code_lines:
        line_starts.append(running_offset)
        running_offset += len(line) + 1  # +1 for the '\n' removed by split

    for change in modified_func.char_changes['added']:
        # count how many lines come before the change, then add 1 to get
        # the (1-indexed) line on which the change starts
        start_line_num = after_code[:change['char_start']].count('\n') + 1

        # retrieve the changed code and split it into one piece per line
        changed_code = after_code[change['char_start']:change['char_end'] + 1].split('\n')

        # Empty pieces are skipped *while* enumerating, not filtered out
        # beforehand: the position of a piece in the unfiltered list is its
        # line offset from start_line_num, so removing blanks first would
        # shift every following piece onto the wrong source line.
        for line_offset, changed_line in enumerate(changed_code):
            if not changed_line:
                continue

            curr_line_num = start_line_num + line_offset
            src_code_line = after_code_lines[curr_line_num - 1]

            # the added characters cover the entire source line, so this
            # counts as a complete line addition
            if changed_line.strip() == src_code_line.strip():
                start_index = line_starts[curr_line_num - 1]
                end_index = start_index + len(src_code_line)

                addition = {"line_no": curr_line_num, "char_start": start_index, "char_end": end_index, "line": src_code_line}
                modified_func.line_changes['added'].append(addition)

In [5]:
def get_line_changes(modified_func):
    """Derive whole-line changes from the character-level changes.

    Runs the deletion pass and then the addition pass over
    ``modified_func``; both passes receive the same object and the
    function itself returns ``None``.
    """
    create_deletions(modified_func)
    create_additions(modified_func)


In [8]:
def read_functions(
    before_path='../custom_datasets/currated_data/numpy-main-parsed-functions.py',
    after_path='../custom_datasets/currated_data/pandas-main-parsed-functions.py',
    encoding=None,
):
    """Read the two parsed-function source files and return their text.

    Parameters
    ----------
    before_path : str
        File whose contents are returned as the "before" source.
    after_path : str
        File whose contents are returned as the "after" source.
    encoding : str or None
        Passed straight to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    tuple of (str, str)
        ``(before_src, after_src)``.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    with open(before_path, 'r', encoding=encoding) as file:
        before_src = file.read()

    with open(after_path, 'r', encoding=encoding) as file:
        after_src = file.read()

    return before_src, after_src

# Load both parsed-function source files once; later cells reuse these strings.
before_src, after_src = read_functions()

In [12]:
def get_func_src(func, src):
    """Return the text of ``func`` cut out of ``src``.

    ``func.start_line`` and ``func.end_line`` are used as 1-indexed,
    inclusive line numbers into ``src``. Trailing whitespace of the
    extracted text is stripped.
    """
    all_lines = src.split('\n')
    first = func.start_line - 1  # convert to a 0-indexed slice start
    last = func.end_line         # inclusive end == exclusive slice stop
    selected = all_lines[first:last]
    return '\n'.join(selected).rstrip()

In [70]:

# get the lizard func objects

# Number of functions kept from each parsed source file.
SAMPLE_SIZE = 100
# Fixed seed for a dedicated generator, so that re-running the notebook
# selects the same functions (an unseeded shuffle produced a different
# dataset on every run).
SAMPLE_SEED = 0
sample_rng = random.Random(SAMPLE_SEED)

before_funcs = lizard.analyze_file.analyze_source_code(
    '../custom_datasets/currated_data/numpy-main-parsed-functions.py',
    before_src).function_list

sample_rng.shuffle(before_funcs)
before_funcs = before_funcs[:SAMPLE_SIZE]

after_funcs = lizard.analyze_file.analyze_source_code(
    '../custom_datasets/currated_data/pandas-main-parsed-functions.py',
    after_src).function_list

sample_rng.shuffle(after_funcs)
after_funcs = after_funcs[:SAMPLE_SIZE]



In [71]:
# Extract the source text of every sampled function, in the same order
# as the function lists themselves.
print('getting before funcs srcs')
before_funcs_src = [get_func_src(func, before_src) for func in before_funcs]

print('getting after funcs srcs')
after_funcs_src = [get_func_src(func, after_src) for func in after_funcs]

getting before funcs srcs
getting after funcs srcs


In [73]:
# Greedily pair each "before" function with the most similar "after"
# function that has not been paired yet.
matched_funcs = []
used_indeces = set()  # set: membership is tested in the inner loop
for i, before_func_src in enumerate(before_funcs_src):

    best_similarity = 0
    best_index = None

    # get the best matching source code between before and after
    for j, curr_after_func_src in enumerate(after_funcs_src):
        if j in used_indeces:
            continue

        similarity = SequenceMatcher(None, before_func_src, curr_after_func_src).ratio()
        if similarity > best_similarity:
            best_similarity = similarity
            best_index = j

    # No candidate left (every "after" function is already used) or
    # nothing had a similarity above 0: skip this function instead of
    # indexing after_funcs with None.
    if best_index is None:
        continue

    used_indeces.add(best_index)
    matched_funcs.append((before_funcs[i], after_funcs[best_index]))



In [74]:
# build a ModifiedFunc for each of the first 50 matched pairs and attach
# its whole-line changes
modified_funcs_list = []
for before_func, after_func in matched_funcs[:50]:
    modified_func = utils.ModifiedFunc(before_func, after_func, before_src, after_src)
    get_line_changes(modified_func)
    modified_funcs_list.append(modified_func)

In [None]:
# serialise every modified function via its to_json() method
json_out = [modified.to_json() for modified in modified_funcs_list]

In [None]:
# write the training data to files

def _write_jsonl(path, records):
    """Write ``records`` to ``path`` with one ``json.dumps`` result per line.

    The parent directory is created first if it does not exist, so the
    cell also works on a fresh checkout.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'w') as outfile:
        for record in records:
            outfile.write(json.dumps(record))
            outfile.write('\n')


# the first 30 records are the training split, the next 10 the validation split
_write_jsonl("../custom_data_train_val/train/functions.jsonl", json_out[:30])
_write_jsonl("../custom_data_train_val/val/functions.jsonl", json_out[30:40])


In [None]:
# write the testing data: one example directory per remaining function,
# each holding a completed.py and a functions.py
for i, func in enumerate(modified_funcs_list[40:]):
    scenario_dir = f'../custom_data_eval/test/example{i+1}'
    os.makedirs(scenario_dir, exist_ok = True)

    # Both files start out as the full after-source. functions.py will
    # need to be manually modified to only include the function header
    # and docstring.
    for filename in ('completed.py', 'functions.py'):
        with open(f'{scenario_dir}/{filename}', 'w') as outfile:
            outfile.write(func.func_src_after)
