In [1]:
## Imports and definitions

import rpy2
import rpy2.robjects as robjects
import pandas as pd
import collections
import re
import os
import git
import matplotlib.pyplot as plt
import networkx as nx
import glob

pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 999)


def analyze_obj(fobj):
    # Print all methods and variables of object
    for x in dir(fobj):
        if "_RObjectMixin" in x:
            continue
        print(x)
        print(eval("fobj." + x))
        try:
            print(eval("fobj." + x + "()"))
        except:
            pass
        print("\n\n")
        

def analyze_library(source_path, verbose=True):
    # Analyze R source code file
    # Return dict with keys = top level function / variable names
    if verbose:
        print("Anayzing " + source_path)

    r_source = robjects.r['source']
    r_source(source_path)
    
    with open(source_path, "r") as f:
        source_lines = f.readlines()

    source_lines = [x.replace("\n", "") for x in source_lines]
    fdict = {}
    for fname, fobj in robjects.globalenv.items():
        fdict[fname] = collections.defaultdict()
        fdict[fname]["obj"] = fobj
        fdict[fname]["type_label"] = fobj.rclass[0]
        fdict[fname]["type_technical"] = fobj.typeof

        fdict[fname]["code_raw"] = fobj.r_repr()

        if fdict[fname]["type_label"] == "function":
            fdict[fname]["params"] = {}
            try:
                for param in fobj.formals():
                    param_default = param.r_repr().split("= ")[1][:-1]
                    if param_default == "":
                        param_default = None       
                    fdict[fname]["params"][param.names.r_repr().replace('"', '')] = param_default
            except TypeError as e:
                # Case if function has no parameters
                if e.__str__() != "'NULLType' object is not iterable":
                    raise
            fdict[fname]["params_count"] = len([x for x in fdict[fname]["params"].values()])
            fdict[fname]["params_obligatory_count"] = len([x for x in fdict[fname]["params"].values() if x is None])
            fdict[fname]["params_optional_count"] = len([x for x in fdict[fname]["params"].values() if x is not None])

            fdict[fname]["code_content_str"] = re.sub("(\n$)|(^\n)", "", re.sub(r"(^[^{]*{)|(}[^}]*$)", "", fdict[fname]["code_raw"]))
            code_lines = fdict[fname]["code_content_str"].split("\n")
            fdict[fname]["code_lines_count"] = len(code_lines) - 1

            f_starts = [i for i, line in enumerate(source_lines) if re.match(fname+"=function\(", line.replace(" ", "").replace("<-", "="))]
            if len(f_starts) > 1:
                print("Warning ! Function '" + fname + "' defined multiple times: " + str(f_starts))
                for start in f_starts:
                    print(source_lines[start])
            elif len(f_starts) == 0:
                print(fdict)
                raise ValueError("Function not found: " + fname)
            fdict[fname]["code_starts"] = f_starts
            fdict[fname]["code_start"] = f_starts[-1]
            fdict[fname]["out"] = re.findall("return\([^\)]*\)", fdict[fname]["code_content_str"])

            fdict[fname]["params_unused"] = []
            for param in fdict[fname]["params"]:
                # Param is only used if: 
                # a) it is in the function code, 
                # b) it is not precedented or followed by a char or number
                # c) in the same line there is no # left of the param name
                if re.search("(^|\n)[^#\n]*[^a-zA-Z0-9#\n]" + param + "[^a-zA-Z0-9]", fdict[fname]["code_content_str"]) is None:
                    fdict[fname]["params_unused"] += [param]

        elif fdict[fname]["type_label"] == "matrix":
            fdict[fname]["matrix_dim"] = fobj.dim
            fdict[fname]["matrix_dim_names"] = fobj.dimnames
            fdict[fname]["matrix_col_names"] = fobj.colnames
            fdict[fname]["matrix_col_count"] = fobj.ncol
            fdict[fname]["matrix_row_count"] = fobj.nrow
            fdict[fname]["matrix_len"] = fobj.__len__()
            fdict[fname]["matrix_content"] = fobj.__str__()
            fdict[fname]["matrix_factors"] = fobj.factor()

    f_starts = [x for val in fdict.values() if val["type_label"] == "function"  for x in val["code_starts"]]
    f_starts += [len(source_lines)]
    f_end_indexs = []

    for start in f_starts:
        for i, line in enumerate(reversed(source_lines[0:start])):
            if "}" in line or "function(" in line:
                f_end_indexs += [start-i-1]
                break

    for fname, val in fdict.items():
        if val["type_label"] == "function":
            fdict[fname]["code_end"] = min([x for x in f_end_indexs if x >= val["code_start"]])        
            fdict[fname]["code_with_comments_lines_count"] = fdict[fname]["code_end"] - fdict[fname]["code_start"] + 1
            fdict[fname]["comments_lines_count"] = max(0, fdict[fname]["code_with_comments_lines_count"] - fdict[fname]["code_lines_count"] - 2, 0)
            fdict[fname] = dict(sorted([(key, val) for key, val in fdict[fname].items() if key not in "obj"]) + [("obj",fdict[fname]["obj"])])
            fdict[fname]["functions_called"] = [fname for fname in fdict.keys() if fname + "(" in val["code_content_str"]]

    robjects.globalenv.clear()
    return fdict


def find_functions_called(source_path, munirflow_functions):
    try:
        with open(source_path, "r", encoding="utf-8") as f:
            source_lines = f.readlines()
        with open(source_path, "r", encoding="utf-8") as f:
            source_str = f.read()
    except UnicodeDecodeError:
        print("Warning ! This file is not UTF-8 encoded: " + source_path)
        with open(source_path, "r", encoding="latin-1") as f:
            source_lines = f.readlines()
        with open(source_path, "r", encoding="latin-1") as f:
            source_str = f.read()
        
    source_lines = [x.replace("\n", "") for x in source_lines]

    f_starts = [i for i, line in enumerate(source_lines) if re.match("=function\(", line.replace(" ", "").replace("<-", "="))]
    if len(f_starts) > 0 and "do_" in os.path.basename(source_path):
        print("Warning ! There are functions defined in do_*.R-file. Please define functions properly in a dedicated library file.")

    return [fname for fname in munirflow_functions if fname + "(" in source_str]


def clone_repos_and_checkout_branch(git_urls, clone_path, ssh_private_path = "~/.ssh/id_rsa"):
    git_ssh_identity_file = os.path.expanduser(ssh_private_path)
    git_ssh_cmd = 'ssh -i %s' % git_ssh_identity_file
    for user,git_dict in git_urls.items():
        repo_name = os.path.basename(git_dict["git"]).replace(".git", "")
        local_path = os.path.join(clone_path, repo_name)
        git_urls[user]["local_path"] = local_path
        if repo_name not in os.listdir(clone_path):
            print("Cloning " + repo_name + "...")
            with git.Git().custom_environment(GIT_SSH_COMMAND=git_ssh_cmd):
                repo = git.Repo.clone_from(git_dict["git"], local_path)
        else:
            repo = git.Repo(path=local_path)

        repo.git.checkout(git_dict["branch"])
        with git.Git().custom_environment(GIT_SSH_COMMAND=git_ssh_cmd):
            print("Pulling " + repo_name + "...")
            repo.git.pull()
    print("Git update finished.")
    
    return git_urls

In [2]:
# Print libraries used in library code in a copy-pastable format, to set it to "imports" section in package DESCRIPTION
libraries_used = set()
for file_name in sorted(glob.glob("../R/munirflow_*.R")):
    if "test" in file_name:
        continue
    with open(file_name, "r") as f:
        source_lines = f.read()
    libs = re.findall("(?<=library\()[^\)]+", source_lines)
    requires = re.findall("(?<=requires\()[^\)]+", source_lines)
    imports = re.findall(r"(?<=@import )[^\n]+", source_lines)
    import_froms = re.findall(r"(?<=@importFrom )[^\n ]+", source_lines)
    colons = re.findall("[a-zA-Z0-9]+(?=::|:::)", source_lines)
    #libraries_used = libraries_used | set([x.strip() for x in libs + requires + imports + import_froms + colons])
    libraries_used = libraries_used | set([x.strip() for x in colons])

for x in sorted(libraries_used, key=str.casefold):
    print("    " + x + ",")

    import_froms


    devtools,
    doParallel,
    dplyr,
    foreach,
    ggplot2,
    graphics,
    grDevices,
    gridExtra,
    iterators,
    lpSolveAPI,
    lubridate,
    nlmrt,
    openMalariaUtilities,
    parallel,
    plyr,
    rjags,
    rlang,
    sp,
    stats,
    stringr,
    table,
    tidyr,
    tidyselect,
    utils,
    xml2,


In [3]:
## Analyze Munirflow.R
#munir_dict = analyze_library("../munirflow.R")
lib_dict = {}
for file_name in sorted(glob.glob("../R/munirflow_*.R")):
    if "test" in file_name:
        continue
    lib_dict[file_name] = analyze_library(file_name)


lib_dict["all"] = {fname: {**fvalues, **{"libname": libname}} for libname, libdict in lib_dict.items() for fname, fvalues in libdict.items()}

##Download all Git repos of the team to analyze which functions of Munirflow.R are used
clone_path = "/home/andarin/Git"
git_urls = {"munir": {"git": "ssh://git@git.scicore.unibas.ch:2222/idm/countrymodelling/repo.git", "branch": "handover", "dir": "Munirflow"},
            "clara": {"git": "ssh://git@git.scicore.unibas.ch:2222/idm/countrymodelling/om_tza.git", "branch": "master", "dir": ""},
            "jeanne": {"git": "ssh://git@git.scicore.unibas.ch:2222/idm/countrymodelling/om-benin.git", "branch": "master", "dir": ""},
            "tatiana": {"git": "ssh://git@git.scicore.unibas.ch:2222/idm/countrymodelling/mozsimulations.git", "branch": "master", "dir": ""}
           }

git_urls = clone_repos_and_checkout_branch(git_urls, clone_path, '~/.ssh/id_rsa_stph')

for user,git_dict in git_urls.items():
    local_path_absolut = os.path.join(git_urls[user]["local_path"], git_urls[user]["dir"])
    git_urls[user]["files"] = {}
    for r_file in glob.glob(os.path.join(local_path_absolut, "*.R")):
        if "unirflow" in os.path.basename(r_file):
            #Ignore Munirflow.R files
            print("Skipping Munirflow.R: " + r_file)
            continue
        git_urls[user]["files"][r_file] = find_functions_called(r_file, lib_dict["all"].keys())

Anayzing ../R/munirflow_0_base.R
Anayzing ../R/munirflow_1_cluster.R
Anayzing ../R/munirflow_2_postprocess.R
Anayzing ../R/munirflow_3_fitting.R
Anayzing ../R/munirflow_4_local.R
Anayzing ../R/munirflow_5_unsorted.R
Anayzing ../R/munirflow_6_costing.R
Pulling repo...
Pulling om_tza...
Pulling om-benin...
Pulling mozsimulations...
Git update finished.
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_2_postprocess.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_4_local.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_1_cluster.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_5_unsorted.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_0_base.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_6_costing.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow_3_fitting.R
Skipping Munirflow.R: /home/andarin/Git/repo/Munirflow/munirflow.R


In [4]:
### Analyze which functions of Munirflow are used by whom

## Create edges and nodes for the graph
nodes = {fname: {"file": "munirflow", "user": "munir"} for fname in lib_dict["all"].keys()}
edges = []

## Add internal Munirflow edges
for fname, val in lib_dict["all"].items():
    for f_called in val["functions_called"]:
        edges += [[fname, f_called]]

## Add user scripts nodes and edges
for user,git_dict in git_urls.items():
    for file_rec,functions_called in git_dict["files"].items():
        node_rec = user + "/" + os.path.basename(file_rec)
        nodes[node_rec] = {"file": node_rec, "user": user}
        edges += [[node_rec, fname] for fname in functions_called]

## Create Graph - one node for every Munirflow function and one for each client script
G = nx.DiGraph()

G.add_nodes_from(nodes.keys())
G.add_edges_from(edges)

## Set default values for Munirflow.R-nodes
for node_name in lib_dict["all"].keys():
    G.nodes[node_name].setdefault("used_in", [])
    G.nodes[node_name].setdefault("used_count", 0)
    G.nodes[node_name].setdefault("used_only_by_munir", True)
    
for node_name,node_attrs in nodes.items():
    G.nodes[node_name]["file"] = node_attrs["file"]
    if node_attrs["file"] == "munirflow":
        continue
    for node_reached in nx.single_source_shortest_path(G,node_name).keys():
        if node_reached not in lib_dict["all"].keys():
            continue
        G.nodes[node_reached]["used_in"] += [node_attrs["file"]]
        G.nodes[node_reached]["used_count"] += 1
        if node_attrs["user"] != "munir":
            G.nodes[node_reached]["used_only_by_munir"] = False

            
usage_dict = {node: {"used_count": G.nodes[node]["used_count"], "used_only_by_munir": G.nodes[node]["used_only_by_munir"], "used_in": G.nodes[node]["used_in"]} for node in G.nodes if G.nodes[node]["file"] == "munirflow"}

for fname, fdict in usage_dict.items():
    lib_dict["all"][fname] = {**lib_dict["all"][fname], **fdict}

#nx.draw(G)
#plt.savefig("graph.png")
#plt.show()


In [5]:
### Analyze if their are copy-pasted lines in Munirflow

import Levenshtein
cut_off = 0.9
line_window_comparison_length = 3

fname_list = sorted(lib_dict["all"].keys())
code_window_dict = {}

for fname, fdict in lib_dict["all"].items():
    lines = fdict["code_content_str"].split("\n")
    line_comparison_length_real = min(line_window_comparison_length, len(lines))
    lines_window = [re.sub("[ ]+", "", "".join(lines[i:i+line_comparison_length_real])) for i in range(len(lines)-line_comparison_length_real+1)]
    code_window_dict[fname] = lines_window

similarity_dict = {}
for i, fname1 in enumerate(fname_list):
    for fname2 in fname_list[i+1:len(fname_list)]:
        similarity_dict[(fname1, fname2)] = {(iwindow1,iwindow2):Levenshtein.ratio(window1, window2) for iwindow1,window1 in enumerate(code_window_dict[fname1]) for iwindow2,window2 in enumerate(code_window_dict[fname2])}

similarity_cutoff_dict = {}
for key,val in similarity_dict.items():
    dict_rec = {"similarity_score": 0, 
               "index_list": []} 
    for index,x in val.items():
        if x>cut_off:
            dict_rec["similarity_score"] += x
            dict_rec["index_list"] += [index]
    if dict_rec["similarity_score"] > 0:
        dict_rec["similarity_score"] = round(dict_rec["similarity_score"]/0.9)+line_window_comparison_length-1
        similarity_cutoff_dict[key] = dict_rec
            

similarity_cutoff_sorted_dict = {k: v for k, v in sorted(similarity_cutoff_dict.items(), key=lambda item: item[1]["similarity_score"], reverse=True)}
print({k:v["similarity_score"] for k,v in similarity_cutoff_sorted_dict.items()})


for fname, fdict in lib_dict["all"].items():
    fdict["code_overlap_with"] = []
    lib_dict["all"][fname] = fdict

for (fname1, fname2), fdict in similarity_cutoff_sorted_dict.items():
    lib_dict["all"][fname1]["code_overlap_with"] += [(fname2, fdict["similarity_score"])]
    lib_dict["all"][fname2]["code_overlap_with"] += [(fname1, fdict["similarity_score"])]


{('batch_combine_future', 'batch_extract_outcomes'): 19, ('.do_post_process_cleanup', '.prepare_simulation_dataset'): 15, ('define_IRS', 'define_nothing'): 11, ('batch_cleanup', 'batch_process'): 8, ('define_ITN', 'define_nothing'): 8, ('define_changeHS', 'write_healthsys'): 8, ('.warn_about_bad_names', 'check_vars'): 7, ('.deploy', 'deploy_it'): 6, ('.extract_clean_futrs', '.prepare_visualization'): 6, ('.calculate_arrays_for_fitting', '.store_jags_objects'): 5, ('.calculate_arrays_for_fitting', 'view_calibration'): 5, ('.do_post_process_cleanup', '.do_post_processing'): 5, ('.do_post_processing', '.prepare_simulation_dataset'): 5, ('.do_processing_jags', 'write_JAGSmodel'): 5, ('.store_jags_objects', 'view_calibration'): 5, ('define_ITN', 'deploy_it'): 5, ('deploy_cont', 'deploy_it'): 5, ('batch_combine_future', 'batch_jags'): 4, ('batch_jags', 'batch_scenario'): 4, ('batch_malaria', 'batch_scenario'): 4, ('batch_process', 'batch_subloop'): 4, ('define_IRS', 'define_ITN'): 4, ('defin

In [6]:
munir_df = pd.DataFrame(lib_dict["all"]).transpose().drop(["obj", "code_content_str", "code_starts", "type_technical"], axis=1)

col_order = ['libname', 'code_lines_count', 'code_with_comments_lines_count', 'comments_lines_count', 
 'used_count', 'used_in', "used_only_by_munir",
 'code_start', 'code_end', 
 'out', 'params', 'params_count', 'params_obligatory_count', 'params_optional_count', 'params_unused', 'functions_called', 'code_overlap_with', 'type_label', 'code_raw']

munir_ordered_df = munir_df[col_order]

print("Number of lines of code in Munirflow without comments: " + str(munir_ordered_df["code_lines_count"].sum()))
if len(munir_ordered_df[munir_ordered_df["type_label"]!="function"].index) > 0:
    print("\nThere are other objects than functions defined in Munirflow!")
    display(munir_usage_ordered_df.groupby("type_label").size().reset_index(name='counts').to_string(index=False))

## Calculate unused functions:
f_unused = munir_ordered_df[munir_ordered_df["used_count"]==0]
print("\nThe following functions are not used by any user and can be deleted: " + str(list(f_unused.index)))
print("This will gain " + str(int(f_unused[["code_lines_count"]].sum()["code_lines_count"])) + " LOC.")

## Calculate TO BE DEPRECATED functions
f_deprecatable = munir_ordered_df[(munir_ordered_df["used_count"]>0) & (munir_ordered_df["used_only_by_munir"])]
print("\nThe following functions are only used by Munir and can probably be deprecated: " + str(list(f_deprecatable.index)))
print("This will gain " + str(int(f_deprecatable[["code_lines_count"]].sum()["code_lines_count"])) + " LOC.")


Number of lines of code in Munirflow without comments: 4325

The following functions are not used by any user and can be deleted: ['.dateToTimestep', '.define_decay', '.define_mosquito', '.exponential', '.hill', '.linear', '.smooth_compact', '.step', '.weibull', '.write_GVI_head', '.write_GVI_parameter', 'define_GVI', 'define_IPTi', 'deploy_cont', 'cancel_jobs', 'g_legend', 'goto_group', 'hard_install', 'plot_jobtime', 'save_futz', 'step1_prepare_scenario', 'step2_prepare_malaria', 'who_failed', 'step3_prepare_postprocess', '.add_EIR_zero_row', '.check_simulation_range', '.check_simulation_range_loop', '.date_today', '.join_futrs_csv', '.save_fitdat', '.separate_jags_output', 'extract_outcomes_csv', 'step4_prepare_fitting', 'visualize_prior', '.assign_core_local', '.write_local_bat_loop', '.write_local_loop', 'install_open_malaria', 'make_histvar', '.prepare_visualization', 'assign_value', 'cases', 'linearfit', 'lunique', 'seasonal_decay', 'simplefit', 'sunique', 'update_names', 'view_

In [8]:
## Show complete table
#display(munir_ordered_df[["code_lines_count", "comments_lines_count", "used_count", "used_only_by_munir", "code_overlap_with"]].sort_values(["code_lines_count", "comments_lines_count"], ascending=False))

## Save analysis DF to CSV
munir_ordered_df.reset_index().rename(columns = {'index':'function_name'}).to_csv("openMalariaUtilities_analysis.csv.gz", sep=";", index=False, compression='gzip')
munir_ordered_df.reset_index().rename(columns = {'index':'function_name'}).to_excel("openMalariaUtilities_analysis.xlsx", index=False)