In [None]:
import fnmatch
import json
import os

import lizard

In [None]:
def get_func_src(func, src):
    """Cut the text of one function out of a source string.

    Args:
        func: object exposing ``start_line`` and ``end_line`` attributes,
            used here as 1-based, inclusive line numbers into ``src``.
        src: the full source text the line numbers refer to.

    Returns:
        The lines ``start_line`` through ``end_line`` joined with newlines,
        with trailing whitespace stripped.
    """
    # Convert the 1-based inclusive range into a 0-based half-open slice.
    first = func.start_line - 1
    last = func.end_line
    selected = src.split('\n')[first:last]
    return '\n'.join(selected).rstrip()

In [None]:
def _find_python_files(root, keyword=''):
    """Return the sorted paths of regular files under ``root`` named ``*{keyword}*.py``.

    Pure-Python replacement for ``find <root> -type f -name '*{keyword}*.py'``:
    no shell is involved, so the keyword cannot be interpreted as a command,
    and the result never contains an empty trailing entry.
    """
    pattern = f'*{keyword}*.py'
    matches = []
    for dirpath, dirnames, filenames in os.walk(root):
        # sorting in place makes os.walk descend in a deterministic order
        dirnames.sort()
        for name in sorted(filenames):
            # case-sensitive glob match, like `find -name`
            if not fnmatch.fnmatchcase(name, pattern):
                continue
            path = os.path.join(dirpath, name)
            # mirror `find -type f`: regular files only, symlinks excluded
            if os.path.isfile(path) and not os.path.islink(path):
                matches.append(path)
    return matches


def extract_functions(keyword = ''):
    """Extract every function from the Python files of each source repository.

    For each repository in ``numpy-main`` and ``pandas-main`` (looked up under
    ``../custom_datasets/sources``), every file whose name matches
    ``*{keyword}*.py`` is parsed with lizard. The source text of each function
    found is stored in a dict keyed by ``<path from repo dir, no .py>.<name>``,
    and the dict is written as JSON to
    ``../custom_datasets/currated_data/<repo>[-<keyword>]-parsed-functions.json``.

    Files that cannot be read or parsed are reported on stdout and skipped.
    If two functions in one file share a name, the later one overwrites the
    earlier one, because both map to the same key.

    Args:
        keyword: glob fragment the file name must contain; the default ``''``
            selects every ``.py`` file.
    """
    sources_dir = '../custom_datasets/sources'
    output_dir = '../custom_datasets/currated_data'
    src_code = ['numpy-main', 'pandas-main']

    # create the output directory up front so a missing directory does not
    # surface only after a whole repository has been parsed
    os.makedirs(output_dir, exist_ok=True)

    for repo in src_code:

        print(f"reading for {repo}")

        # full path to the function name as the key, function code as the value
        functions = {}

        for path in _find_python_files(os.path.join(sources_dir, repo), keyword):
            try:
                # Python source is UTF-8 by default; do not depend on the locale
                with open(path, 'r', encoding='utf-8') as file:
                    file_code = file.read()

                # make the path start at the repo directory and drop only the
                # trailing '.py' (every matched name ends with it), so we can
                # tell where the function comes from
                module_name = path[path.find(repo):][:-len('.py')]

                # lizard parses the source text; the path is passed as the file name
                func_objs = lizard.analyze_file.analyze_source_code(path, file_code).function_list

                for func in func_objs:
                    functions[module_name + '.' + func.name] = get_func_src(func, file_code)

            except Exception as exc:
                # best effort: report the file we could not process and move on;
                # KeyboardInterrupt/SystemExit are deliberately not caught
                print(f"could not parse {path}: {exc!r}")

        if keyword != "":
            print(f"{repo} has {len(functions)} functions when searching with the keyword {keyword}")
            out_fname = f"{repo}-{keyword}-parsed-functions.json"
        else:
            print(f"{repo} has {len(functions)} functions")
            out_fname = f"{repo}-parsed-functions.json"

        # write a parsed json file for all of the functions in the repository
        with open(os.path.join(output_dir, out_fname), 'w') as outfile:
            outfile.write(json.dumps(functions))



In [None]:
# Parse every .py file of each repository (empty keyword = no file-name filter).
extract_functions()

In [None]:
# Parse only the files whose name contains 'test' (pattern *test*.py).
extract_functions('test')