In [20]:
import utils
import lizard
from diff_match_patch import diff_match_patch
import json

In [21]:
def create_deletions(modified_func):
    """Record every completely deleted line of the "before" source.

    Each entry of ``modified_func.char_changes['deleted']`` is read as a dict
    with ``char_start`` / ``char_end`` offsets into
    ``modified_func.func_src_before`` (``char_end`` is treated as inclusive).
    The changed span is split into per-line fragments; whenever a non-empty
    fragment equals its whole source line (ignoring leading/trailing
    whitespace) a record is appended to
    ``modified_func.line_changes['deleted']``.

    Each record has the keys ``line_no`` (1-indexed), ``char_start`` (offset of
    the line's first character), ``char_end`` (offset just past the line's last
    character) and ``line`` (the unstripped source line).

    Returns None; the result is the mutation of ``modified_func.line_changes``.
    """
    before_code = modified_func.func_src_before
    before_code_lines = before_code.split('\n')

    # Offset of the first character of every line, computed once up front.
    # The first line starts at offset 0 (not 1).
    line_starts = []
    offset = 0
    for line in before_code_lines:
        line_starts.append(offset)
        offset += len(line) + 1  # +1 for the '\n' consumed by split

    for change in modified_func.char_changes['deleted']:
        # 0-based index of the line on which the changed span begins
        first_line_idx = before_code[:change['char_start']].count('\n')

        # One fragment per source line touched by the change. Empty fragments
        # are skipped inside the loop rather than filtered out beforehand, so
        # every fragment keeps its true line position even when the span
        # starts at, or contains, a newline.
        fragments = before_code[change['char_start']:change['char_end'] + 1].split('\n')

        for fragment_pos, fragment in enumerate(fragments):
            if not fragment:
                continue

            line_idx = first_line_idx + fragment_pos
            src_code_line = before_code_lines[line_idx]

            # The fragment covers the entire line (modulo surrounding
            # whitespace), so this counts as a complete line deletion.
            if fragment.strip() == src_code_line.strip():
                start_index = line_starts[line_idx]
                end_index = start_index + len(src_code_line)

                deletion = {
                    "line_no": line_idx + 1,  # line numbers are 1-indexed
                    "char_start": start_index,
                    "char_end": end_index,
                    "line": src_code_line,
                }
                modified_func.line_changes['deleted'].append(deletion)

                

In [22]:
def create_additions(modified_func):
    """Record every completely added line of the "after" source.

    Each entry of ``modified_func.char_changes['added']`` is read as a dict
    with ``char_start`` / ``char_end`` offsets into
    ``modified_func.func_src_after`` (``char_end`` is treated as inclusive).
    The changed span is split into per-line fragments; whenever a non-empty
    fragment equals its whole source line (ignoring leading/trailing
    whitespace) a record is appended to
    ``modified_func.line_changes['added']``.

    Each record has the keys ``line_no`` (1-indexed), ``char_start`` (offset of
    the line's first character), ``char_end`` (offset just past the line's last
    character) and ``line`` (the unstripped source line).

    Returns None; the result is the mutation of ``modified_func.line_changes``.
    """
    after_code = modified_func.func_src_after
    after_code_lines = after_code.split('\n')

    # Offset of the first character of every line, computed once up front.
    # The first line starts at offset 0 (not 1).
    line_starts = []
    offset = 0
    for line in after_code_lines:
        line_starts.append(offset)
        offset += len(line) + 1  # +1 for the '\n' consumed by split

    for change in modified_func.char_changes['added']:
        # 0-based index of the line on which the changed span begins
        first_line_idx = after_code[:change['char_start']].count('\n')

        # One fragment per source line touched by the change. Empty fragments
        # are skipped inside the loop rather than filtered out beforehand, so
        # every fragment keeps its true line position even when the span
        # starts at, or contains, a newline.
        fragments = after_code[change['char_start']:change['char_end'] + 1].split('\n')

        for fragment_pos, fragment in enumerate(fragments):
            if not fragment:
                continue

            line_idx = first_line_idx + fragment_pos
            src_code_line = after_code_lines[line_idx]

            # The fragment covers the entire line (modulo surrounding
            # whitespace), so this counts as a complete line addition.
            if fragment.strip() == src_code_line.strip():
                start_index = line_starts[line_idx]
                end_index = start_index + len(src_code_line)

                addition = {
                    "line_no": line_idx + 1,  # line numbers are 1-indexed
                    "char_start": start_index,
                    "char_end": end_index,
                    "line": src_code_line,
                }
                modified_func.line_changes['added'].append(addition)

In [23]:
def get_line_changes(modified_func):
    """Populate the line-level changes of ``modified_func``.

    Runs ``create_deletions`` and then ``create_additions`` on the same
    object, which use its character changes and function source code to
    determine which character changes constitute a complete line deletion
    or addition. Returns None.
    """
    create_deletions(modified_func)
    create_additions(modified_func)


In [25]:
# Read both parsed-function source files fully into memory.
with open('../custom_datasets/currated_data/numpy-main-parsed-functions.py', 'r') as file:
    before_src = file.read()

with open('../custom_datasets/currated_data/pandas-main-parsed-functions.py', 'r') as file:
    after_src = file.read()

# get the lizard func objects for each source file
before_funcs = lizard.analyze_file.analyze_source_code(
    '../custom_datasets/currated_data/numpy-main-parsed-functions.py',
    before_src,
).function_list

after_funcs = lizard.analyze_file.analyze_source_code(
    '../custom_datasets/currated_data/pandas-main-parsed-functions.py',
    after_src,
).function_list

# Functions are paired purely by position in the two lists; only the first
# 1000 pairs are processed.
function_pairs = list(zip(before_funcs, after_funcs))[:1000]

modified_funcs_list = []
for pair_no, (before_func, after_func) in enumerate(function_pairs, start=1):
    print(f"Analyzing function pair {pair_no}")
    modified_func = utils.ModifiedFunc(before_func, after_func, before_src, after_src)
    get_line_changes(modified_func)
    modified_funcs_list.append(modified_func)

Analyzing function pair 1
Analyzing function pair 2
Analyzing function pair 3
Analyzing function pair 4
Analyzing function pair 5
Analyzing function pair 6
Analyzing function pair 7
Analyzing function pair 8
Analyzing function pair 9
Analyzing function pair 10
Analyzing function pair 11
Analyzing function pair 12
Analyzing function pair 13
Analyzing function pair 14
Analyzing function pair 15
Analyzing function pair 16
Analyzing function pair 17
Analyzing function pair 18
Analyzing function pair 19
Analyzing function pair 20
Analyzing function pair 21
Analyzing function pair 22
Analyzing function pair 23
Analyzing function pair 24
Analyzing function pair 25
Analyzing function pair 26
Analyzing function pair 27
Analyzing function pair 28
Analyzing function pair 29
Analyzing function pair 30
Analyzing function pair 31
Analyzing function pair 32
Analyzing function pair 33
Analyzing function pair 34
Analyzing function pair 35
Analyzing function pair 36
Analyzing function pair 37
Analyzing 

In [30]:
# Serialize every analyzed function via its own to_json() method.
json_out = [func.to_json() for func in modified_funcs_list]

In [31]:
# Preview only the first few records; displaying the whole list floods the
# saved notebook output.
json_out[:3]

[{'func_name': '_get_numpy_tools',
  'func_src_before': "def _get_numpy_tools(filename):\n    filepath = pathlib.Path('tools', filename)\n    spec = importlib.util.spec_from_file_location(filename.stem, filepath)\n    module = importlib.util.module_from_spec(spec)\n    spec.loader.exec_module(module)\n    return module",
  'line_changes': {'deleted': [{'line_no': 2,
     'char_start': 32,
     'char_end': 78,
     'line': "    filepath = pathlib.Path('tools', filename)"},
    {'line_no': 4,
     'char_start': 154,
     'char_end': 204,
     'line': '    module = importlib.util.module_from_spec(spec)'},
    {'line_no': 5,
     'char_start': 205,
     'char_end': 240,
     'line': '    spec.loader.exec_module(module)'},
    {'line_no': 6,
     'char_start': 241,
     'char_end': 258,
     'line': '    return module'}],
   'added': [{'line_no': 2,
     'char_start': 29,
     'char_end': 117,
    {'line_no': 3,
     'char_start': 118,
     'char_end': 144,
     'line': '    filter_level: L

In [34]:
# Write one JSON document per line (JSON Lines) to each split file.
# NOTE(review): both the train and val files receive the same records from
# json_out — confirm this is intended.
for split_path in (
    "../custom_data_train_val/train/functions.jsonl",
    "../custom_data_train_val/val/functions.jsonl",
):
    with open(split_path, 'w') as outfile:
        for record in json_out:
            outfile.write(json.dumps(record) + '\n')

In [35]:
# Read the train split back, one list element per line of the file.
with open('../custom_data_train_val/train/functions.jsonl') as f:
    lines = list(f)

In [36]:
# Number of entries in `lines`.
len(lines)

1000