# Extracting information from 41 repositories (test set)

## 1. Installing inspect4py

In [None]:
# Install the inspect4py package with pip (executed in the system shell via `!`).
!pip install inspect4py

In [None]:
# Print the installed inspect4py version to check that the command-line tool runs.
!inspect4py --version

## 2. Loading dataset

In [None]:
import pickle

# Load the repository collection from the test-set pickle and report how many
# entries it holds. Later cells iterate REPOS.items() as (repo, topic) pairs.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load REPOS_test.pkl when it comes from a trusted source.
REPOS = {}
with open("../../Dataset/REPOS_test.pkl", "rb") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    REPOS = pickle.load(f)
print(len(REPOS))

## 3. Extracting information: code, docs, structure, requirements, and readme

In [None]:
!mkdir -p output
for repo in REPOS:
    !mkdir -p {repo} & git clone {f"https://github.com/{repo}.git"} {repo}
    !inspect4py -i {repo} -o output/ {repo} -dt -sc -rm -r

In [None]:
# Collect source code and documentation strings from a mapping of functions.
def traversal_codes_and_docs(funcs, codes_list, docs_list):
    """Append source code and docstring text found in ``funcs`` to the given lists.

    Args:
        funcs: Mapping of function name to an info mapping. Each info mapping
            may carry a "source_code" entry and a "doc" mapping.
        codes_list: List that receives every non-None "source_code" value.
        docs_list: List that receives one ``"<name> <text>"`` string for each
            non-None "full", "long_description" and "short_description" entry
            of the function's "doc" mapping, in that order.

    Returns:
        None. Both lists are modified in place.
    """
    doc_fields = ("full", "long_description", "short_description")
    for func_name in funcs:
        record = funcs[func_name]

        source = record.get("source_code")
        if source is not None:
            codes_list.append(source)

        doc = record.get("doc")
        if doc is not None:
            # Prefix each documentation text with the function name.
            docs_list.extend(
                f"{func_name} {doc[field]}"
                for field in doc_fields
                if doc.get(field) is not None
            )


# Gather source code and documentation strings from a JSON report file.
def extract_codes_and_docs(filepath):
    """Collect code snippets and doc strings from the JSON file at ``filepath``.

    The top-level "requirements", "directory_tree" and "readme_files" entries
    are discarded. Every remaining top-level value is iterated as a sequence
    of file records; each record's "functions" mapping and the "methods"
    mapping of each entry in its "classes" mapping are passed to
    ``traversal_codes_and_docs``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A ``(codes_list, docs_list)`` tuple of lists.
    """
    with open(filepath, "r") as handle:
        report = json.load(handle)

    # These sections are not lists of file records, so drop them before the scan.
    for section in ("requirements", "directory_tree", "readme_files"):
        report.pop(section, None)

    codes_list, docs_list = [], []
    for file_entries in report.values():
        for entry in file_entries:
            functions = entry.get("functions")
            if functions is not None:
                traversal_codes_and_docs(functions, codes_list, docs_list)

            classes = entry.get("classes")
            if classes is None:
                continue
            for class_info in classes.values():
                methods = class_info.get("methods")
                if methods is not None:
                    traversal_codes_and_docs(methods, codes_list, docs_list)

    return codes_list, docs_list

In [None]:
# Pull one top-level section out of a JSON report file as a flat list.
def extract_other_information(filepath, element):
    """Return the contents of the top-level ``element`` section as a list.

    Args:
        filepath: Path of the JSON file to read.
        element: Name of the top-level key to extract.

    Returns:
        An empty list when the key is absent or null. Otherwise, the keys of
        the section when ``element`` is "requirements", and the values of the
        section for any other ``element``.
    """
    with open(filepath, "r") as handle:
        report = json.load(handle)

    section = report.get(element)
    if section is None:
        return []

    # For requirements the keys are collected; for every other section the values.
    if element == "requirements":
        return list(section.keys())
    return list(section.values())

In [None]:
import json

# Build one record per repository from its output/<repo>/directory_info.json:
# docs, codes, structure, requirements, readme and the topic stored in REPOS.
repo_info = {}
for repo, topic in REPOS.items():
    info_path = f"output/{repo}/directory_info.json"
    entry = repo_info[repo] = {}

    codes_list, docs_list = extract_codes_and_docs(info_path)
    entry["docs"] = docs_list
    entry["codes"] = codes_list
    entry["structure"] = extract_other_information(info_path, "directory_tree")
    entry["requirements"] = extract_other_information(info_path, "requirements")
    entry["readme"] = extract_other_information(info_path, "readme_files")
    entry["topic"] = topic

## 4. Saving repositories' information

In [None]:
# Serialize the collected repository records to repo_info_test.pkl.
with open("repo_info_test.pkl", "wb") as out_file:
    pickle.dump(repo_info, out_file)

In [None]:
# Read the saved pickle back and print how many repository records it contains.
with open("repo_info_test.pkl", "rb") as in_file:
    repo_info_check = pickle.load(in_file)
print(len(repo_info_check))