<a href="https://colab.research.google.com/github/RepoAnalysis/RepoSim/blob/main/notebooks/CrossEncoder/HungarianAlgorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install `inspect4py` using Python 3.9.

In [None]:
# Install python 3.9 on colab
!sudo apt-get update -y
!sudo apt-get install python3.9 python3-pip

In [None]:
# Install inspect4py
# Clone the repository and move into the checkout so the relative paths below resolve.
!git clone https://github.com/SoftwareUnderstanding/inspect4py.git
%cd inspect4py
# Switch to the `dev` branch before installing.
!git checkout dev
# Install the requirements, then the package itself, with the Python 3.9 interpreter.
# `&&` skips the setup step if the requirements install fails.
!python3.9 -m pip install -r requirements.txt && python3.9 setup.py install
# Go back to /content/, the directory the later cells work from.
%cd /content/

In [None]:
# Sanity check: the inspect4py command is callable and reports its version.
!inspect4py --version

inspect4py, version 0.0.6


### Download test repositories and run inspect4py on them

In [None]:
# GitHub repositories, written as "owner/name", that are cloned and compared below.
REPOS = ["keon/algorithms", "TheAlgorithms/Python"]

In [None]:
# Create the inspect4py output root and make /content/ the working directory,
# so the relative clone targets below end up under /content/.
!mkdir -p /content/output
%cd /content/
for repo in REPOS:
    # Clone each repository into a directory named after its "owner/name" slug.
    !mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
    # Run inspect4py on the checkout; results are written to /content/output/<owner>/<name>.
    # NOTE(review): -sc / -rm presumably enable source-code and README extraction -- confirm with `inspect4py --help`.
    !inspect4py -i /content/{repo} -o /content/output/{repo} -sc -rm

### Extract docstrings and functions from repositories.

In [None]:
import json

def funcs_to_lists(funcs, func_codes, docs):
    """Append source code and docstrings of the given functions to two lists.

    Args:
        funcs: Mapping of function name to an info dict. The keys read here are
            ``"source_code"`` and ``"doc"``; ``"doc"`` is itself a dict.
        func_codes: List extended in place with every non-None ``"source_code"``.
        docs: List extended in place with ``"<function name> <doc text>"`` for
            every function that has a doc entry with usable text.

    Returns:
        None. Both lists are modified in place and are filled independently,
        so they are not guaranteed to have the same length.
    """
    # Preference order for the docstring text: the first non-None entry wins.
    doc_keys = ("full", "long_description", "short_description")

    for func_name, func_info in funcs.items():
        source = func_info.get("source_code")
        if source is not None:
            func_codes.append(source)

        doc = func_info.get("doc")
        if doc is None:
            continue

        candidates = (doc.get(key) for key in doc_keys)
        text = next((value for value in candidates if value is not None), None)
        if text is not None:
            docs.append(f"{func_name} {text}")


def file_to_lists(filename, include_methods=False):
    """Collect function source code and docstrings from an inspect4py JSON file.

    The file is expected to map directory names to lists of per-file info
    dicts. Top-level entries that are not such lists (the ``"readme_files"``
    entry and any other metadata) are ignored instead of raising.

    Args:
        filename: Path to the JSON file to read.
        include_methods: When True, methods found under each file's
            ``"classes"`` entry are harvested as well. Defaults to False,
            which processes module-level functions only.

    Returns:
        A ``(func_codes, docs)`` tuple of lists, filled by ``funcs_to_lists``.
    """
    func_codes = []
    docs = []
    # Read as UTF-8 explicitly rather than relying on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        dic = json.load(f)
    dic.pop("readme_files", None)

    for files in dic.values():
        # Only directory entries hold a list of file records; skip metadata values.
        if not isinstance(files, list):
            continue
        for file in files:
            if not isinstance(file, dict):
                continue
            if file.get("functions") is not None:
                funcs_to_lists(file["functions"], func_codes, docs)
            if include_methods and file.get("classes") is not None:
                for class_info in file["classes"].values():
                    if class_info.get("methods") is not None:
                        funcs_to_lists(class_info["methods"], func_codes, docs)

    return func_codes, docs


In [None]:
# For every repository, load the functions and docstrings that inspect4py extracted.
repo_info = {}
for repo in REPOS:
    info_path = f"/content/output/{repo}/directory_info.json"
    function_list, docstring_list = file_to_lists(info_path)
    repo_info[repo] = {"docs": docstring_list, "funcs": function_list}

### Download PythonCloneDetection and install requirements

In [None]:
!git clone https://github.com/RepoAnalysis/PythonCloneDetection
!pip install -r PythonCloneDetection/requirements.txt
!pip install -U sentence-transformers

### Infer similarity scores for all code/doc pairs and apply the Hungarian algorithm

In [None]:
%cd PythonCloneDetection

import torch
import pandas as pd
import itertools as it

from scipy.optimize import linear_sum_assignment
from sentence_transformers import SentenceTransformer, util

from clone_classifier import CloneClassifier


def list_similarity(l1, l2, list_type):
    """Match the items of two lists one-to-one by maximising total similarity.

    Args:
        l1: First list of strings (function source code or docstrings).
        l2: Second list of strings of the same kind.
        list_type: ``"code"`` to score pairs with ``CloneClassifier``, or
            ``"docs"`` to score pairs by cosine similarity of sentence
            embeddings.

    Returns:
        A DataFrame with one row per assigned pair. Its columns are
        ``code1``/``code2``/``score`` for ``"code"`` and
        ``doc1``/``doc2``/``score`` for ``"docs"``. Returns None when either
        list is empty or ``list_type`` is not one of the two supported values.
    """
    if not (l1 and l2):
        return None
    if list_type not in ("code", "docs"):
        return None

    if list_type == "code":
        # Score every (l1, l2) combination with the clone classifier.
        classifier = CloneClassifier(fp16=True)
        pairs = pd.DataFrame(it.product(l1, l2), columns=["code1", "code2"])
        scored_pairs = classifier.predict(pairs, return_score=True)

        # Turn the long list of scored pairs into an l1-by-l2 score matrix.
        similarity = scored_pairs.pivot_table(
            index="code1", columns="code2", values="score"
        )
        left_name, right_name = "code1", "code2"
    else:
        # Embed both docstring lists and compare them by cosine similarity.
        encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        left_embeddings = encoder.encode(l1, convert_to_tensor=True)
        right_embeddings = encoder.encode(l2, convert_to_tensor=True)

        cosine = util.cos_sim(left_embeddings, right_embeddings).cpu().numpy()
        similarity = pd.DataFrame(cosine, index=l1, columns=l2)
        left_name, right_name = "doc1", "doc2"

    # Hungarian algorithm: choose the assignment with the highest total score.
    rows, cols = linear_sum_assignment(similarity, maximize=True)

    # Look up the matched items and their scores in the matrix.
    matches = pd.DataFrame(
        {left_name: similarity.index[rows], right_name: similarity.columns[cols]}
    )
    return matches.assign(score=similarity.values[rows, cols])

/content/PythonCloneDetection


### Assignments based on docstring comparison

In [None]:
# Match the docstrings of the two repositories; gradients are not needed for inference.
keon_docs = repo_info["keon/algorithms"]["docs"]
the_algorithms_docs = repo_info["TheAlgorithms/Python"]["docs"]
with torch.no_grad():
    docs_df = list_similarity(keon_docs, the_algorithms_docs, "docs")

In [None]:
# Show the matched docstring pairs, best score first, with a fresh 0..n-1 index.
docs_df.sort_values(by="score", ascending=False, ignore_index=True)

Unnamed: 0,doc1,doc2,score
0,find_factorial Calculates the factorial of a g...,factorial Find the factorial of a given number n,0.932496
1,cosine_similarity Calculate cosine similarity ...,cosine_similarity Calculates cosine similarity...,0.927878
2,dfs Function that performs DFS,dfs DFS traversal,0.864706
3,decimal_to_binary_util Convert 8-bit decimal n...,decimal_to_binary ['0.00.01.5'],0.852933
4,pancake_sort Sorting a given array\nmutation o...,pancake_sort Sort Array with Pancake Sort.,0.782438
...,...,...,...
211,alice_public_key with her private key.\nThis i...,try_key If the decrypted message contains a in...,0.423209
212,dinic_bfs Check whether sink is reachable only...,check ['0.00.01.5'],0.419797
213,hailstone n: The starting point of the hailsto...,"peak >>> peak([1, 2, 3, 4, 5, 4, 3, 2, 1])\n5\...",0.418473
214,alice_shared_key with her private key and Bob'...,"dencrypt >>> msg = ""My secret bank account num...",0.408872


### Assignments based on code comparison

In [None]:
# Scoring every pair of functions is too slow to run on Colab, so the call below is left commented out.
# with torch.no_grad():
    # code_df = list_similarity(repo_info["keon/algorithms"]["funcs"], repo_info["TheAlgorithms/Python"]["funcs"], "code")

#### The assignment results based on code and docstring comparison were generated on a GPU server and uploaded to https://github.com/RepoAnalysis/RepoSim/blob/main/notebooks/CrossEncoder/keonalgorithms-TheAlgorithmsPython.csv
