<a href="https://colab.research.google.com/github/RepoAnalysis/RepoSim/blob/main/notebooks/BiEncoder/UniXCoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install `inspect4py` using Python 3.9

In [None]:
# Install python 3.9 on colab
!sudo apt-get update -y
!sudo apt-get install python3.9 python3-pip

In [None]:
# Install inspect4py
# Clone the upstream repository and switch to its `dev` branch.
!git clone https://github.com/SoftwareUnderstanding/inspect4py.git
%cd inspect4py
!git checkout dev
# Install the requirements and then the package itself, both with the
# python3.9 interpreter; the `&&` skips setup.py if the requirements fail.
!python3.9 -m pip install -r requirements.txt && python3.9 setup.py install
# Go back to /content/, the directory the later cells work from.
%cd /content/

In [3]:
# Print the installed inspect4py version to confirm the CLI can be invoked.
!inspect4py --version

inspect4py, version 0.0.6


### Download test repositories and run inspect4py on them

In [4]:
# Test repositories: each key is a GitHub "owner/name" slug (it is inserted into
# the clone URL below) and each value is the topic label stored for that repo.
# Repo-topic pairs are generated from
# https://github.com/RepoAnalysis/RepoSim/blob/main/data/repo_topic.py
REPOS = {
    'keon/algorithms': 'Algorithms',
    'prabhupant/python-ds': 'Algorithms',
    'grantjenks/python-sortedcontainers': 'Algorithms',
    'TheAlgorithms/Python': 'Algorithms',
    'beetbox/audioread': 'Audio',
    'worldveil/dejavu': 'Audio',
    'keunwoochoi/kapre': 'Audio',
    'librosa/librosa': 'Audio',
    'sergree/matchering': 'Audio',
    'tyiannak/pyAudioAnalysis': 'Audio',
    'jiaaro/pydub': 'Audio',
    'Parisson/TimeSide': 'Audio',
    'lepture/authlib': 'OAuth',
    'pennersr/django-allauth': 'OAuth',
    'evonove/django-oauth-toolkit': 'OAuth',
    'idan/oauthlib': 'OAuth',
    'joestump/python-oauth2': 'OAuth',
    'omab/python-social-auth': 'OAuth',
    'paramiko/paramiko': 'Cryptography',
    'pyca/pynacl': 'Cryptography',
    'jindaxiang/akshare': 'Downloader',
    's3tools/s3cmd': 'Downloader',
    'bloomreach/s4cmd': 'Downloader',
    'euske/pdfminer': 'PDF',
    'mstamy2/PyPDF2': 'PDF',
    'lepture/mistune': 'Markdown',
    'waylan/Python-Markdown': 'Markdown'
}

In [None]:
# Create the output root and work from /content/ so that the relative
# clone targets below are created under it.
!mkdir -p /content/output
%cd /content/
for repo in REPOS:
    # Clone the repository into ./<owner>/<name>; the clone only runs if the
    # directory could be created.
    !mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
    # Run inspect4py on the clone and write its results to
    # /content/output/<owner>/<name> (later cells read directory_info.json from there).
    !inspect4py -i /content/{repo} -o /content/output/{repo} -sc -rm

### Extract docstrings and functions from repositories.

In [6]:
import json

def funcs_to_lists(funcs, func_codes, docs):
    """Collect source code and docstrings from a mapping of function records.

    Both output lists are extended in place; nothing is returned.

    Args:
        funcs: Mapping of function name -> info dict. The optional keys read
            here are "source_code" and "doc" (itself a dict of docstring parts).
        func_codes: List that receives every non-None "source_code" value.
        docs: List that receives one "<name> <docstring>" string per function
            that has at least one non-None docstring part.
    """
    # Docstring parts in order of preference; the first non-None one is used.
    doc_fields = ("full", "long_description", "short_description")

    for name, info in funcs.items():
        source = info.get("source_code")
        if source is not None:
            func_codes.append(source)

        doc_info = info.get("doc")
        if doc_info is None:
            continue

        candidates = (doc_info.get(field) for field in doc_fields)
        text = next((value for value in candidates if value is not None), None)
        if text is not None:
            docs.append(f"{name} {text}")


def file_to_lists(filename):
    """Load a JSON report and gather all function source code and docstrings.

    The top-level "readme_files" entry is discarded. Every remaining top-level
    value is iterated as a list of file records; for each record, its
    "functions" and the "methods" of each entry in "classes" are passed to
    `funcs_to_lists`.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A `(func_codes, docs)` tuple of lists.
    """
    with open(filename, "r") as handle:
        report = json.load(handle)

    # "readme_files" is dropped before the per-file loop below.
    report.pop("readme_files", None)

    func_codes, docs = [], []
    for file_records in report.values():
        for record in file_records:
            functions = record.get("functions")
            if functions is not None:
                funcs_to_lists(functions, func_codes, docs)

            classes = record.get("classes")
            if classes is None:
                continue
            for class_info in classes.values():
                methods = class_info.get("methods")
                if methods is not None:
                    funcs_to_lists(methods, func_codes, docs)

    return func_codes, docs


In [7]:
# Per-repository record: extracted docstrings, function sources and topic label.
repo_info = {}
for repo, topic in REPOS.items():
    entry = repo_info[repo] = {}

    # inspect4py output for this repository, produced by the earlier clone/inspect cell.
    report_path = f"/content/output/{repo}/directory_info.json"
    function_list, docstring_list = file_to_lists(report_path)
    entry.update(docs=docstring_list, funcs=function_list, topic=topic)

### Download UniXcoder and the fine-tuned model, and install requirements

In [None]:
%cd /content/
# Install (or upgrade to the latest) sentence-transformers in the kernel's environment.
%pip install -U sentence-transformers
# Fetch the unixcoder.py module into the working directory so that
# `from unixcoder import UniXcoder` can be imported by the next cell.
!wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

### Generate embeddings for all repositories

In [None]:
import torch
from unixcoder import UniXcoder
from transformers import RobertaModel
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Sentence-embedding model used for docstrings (the "doc" branch of get_repo_embeddings).
doc_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device=device)
# UniXcoder wrapper built from the "Lazyhope/unixcoder-nine-advtest" checkpoint;
# used by get_code_embeddings for function source code.
code_model = UniXcoder("Lazyhope/unixcoder-nine-advtest")
code_model.to(device)

def get_code_embeddings(code):
    """Embed a single source-code string with the module-level `code_model`.

    Args:
        code: Source code of one function, as a string.

    Returns:
        The second of the two values returned by `code_model` for this input.
        NOTE(review): presumably the sentence-level embedding with a leading
        batch dimension of 1 -- confirm against unixcoder.py.
    """
    # Tokenize as a one-element batch, truncated to at most 512 tokens.
    batch_token_ids = code_model.tokenize([code], max_length=512, mode="<encoder-only>")
    model_input = torch.tensor(batch_token_ids).to(device)

    # The model returns a pair; only its second element is used here.
    _unused, code_embedding = code_model(model_input)
    return code_embedding

def get_repo_embeddings(lst, input_type):
    """Compute one repository-level embedding as the mean of per-item embeddings.

    Args:
        lst: List of strings to embed (function sources or docstrings).
        input_type: "code" to embed each item with `get_code_embeddings`, or
            "doc" to embed the whole list with `doc_model.encode`.

    Returns:
        The mean over the first axis of the stacked item embeddings, or None
        when `lst` is empty (or None), i.e. there is nothing to embed.

    Raises:
        ValueError: If `input_type` is neither "code" nor "doc". Previously
            this case fell through both branches and failed later with an
            unrelated UnboundLocalError.
    """
    if not lst:
        return None
    if input_type not in ("code", "doc"):
        raise ValueError(
            f"input_type must be 'code' or 'doc', got {input_type!r}"
        )

    # Inference only: no gradients are needed for any of the embeddings.
    with torch.no_grad():
        if input_type == "code":
            # One forward pass per function; results are concatenated along axis 0.
            embeddings_list = torch.concat([get_code_embeddings(code) for code in lst])
        else:
            embeddings_list = doc_model.encode(lst, convert_to_tensor=True)

        mean_embeddings = torch.mean(embeddings_list, dim=0)

    return mean_embeddings

In [10]:
from tqdm import tqdm


# Fill in the code and docstring embeddings of every repository.
# Each row: (key to store under, key of the input list, input_type argument).
for repo_name, repo_dict in tqdm(repo_info.items()):
    print(f" - Generating embeddings for {repo_name} - ")
    for target_key, source_key, kind in (
        ("code_embeddings", "funcs", "code"),
        ("doc_embeddings", "docs", "doc"),
    ):
        # Skip anything already computed so re-running the cell does not redo the work.
        if repo_dict.get(target_key) is None:
            repo_dict[target_key] = get_repo_embeddings(repo_dict[source_key], input_type=kind)

  0%|          | 0/27 [00:00<?, ?it/s]

 - Generating embeddings for keon/algorithms - 


  4%|▎         | 1/27 [00:26<11:17, 26.05s/it]

 - Generating embeddings for prabhupant/python-ds - 


  7%|▋         | 2/27 [00:31<05:44, 13.79s/it]

 - Generating embeddings for grantjenks/python-sortedcontainers - 


 11%|█         | 3/27 [00:36<03:53,  9.72s/it]

 - Generating embeddings for TheAlgorithms/Python - 


 15%|█▍        | 4/27 [01:31<10:33, 27.55s/it]

 - Generating embeddings for beetbox/audioread - 


 19%|█▊        | 5/27 [01:31<06:32, 17.85s/it]

 - Generating embeddings for worldveil/dejavu - 


 22%|██▏       | 6/27 [01:33<04:19, 12.36s/it]

 - Generating embeddings for keunwoochoi/kapre - 


 26%|██▌       | 7/27 [01:35<03:00,  9.01s/it]

 - Generating embeddings for librosa/librosa - 


 30%|██▉       | 8/27 [01:54<03:51, 12.18s/it]

 - Generating embeddings for sergree/matchering - 


 33%|███▎      | 9/27 [01:55<02:36,  8.70s/it]

 - Generating embeddings for tyiannak/pyAudioAnalysis - 


 37%|███▋      | 10/27 [01:57<01:51,  6.53s/it]

 - Generating embeddings for jiaaro/pydub - 


 41%|████      | 11/27 [02:00<01:30,  5.68s/it]

 - Generating embeddings for Parisson/TimeSide - 


 44%|████▍     | 12/27 [02:14<02:02,  8.19s/it]

 - Generating embeddings for lepture/authlib - 


 48%|████▊     | 13/27 [02:39<03:03, 13.13s/it]

 - Generating embeddings for pennersr/django-allauth - 


 52%|█████▏    | 14/27 [03:01<03:24, 15.71s/it]

 - Generating embeddings for evonove/django-oauth-toolkit - 


 56%|█████▌    | 15/27 [03:10<02:46, 13.90s/it]

 - Generating embeddings for idan/oauthlib - 


 59%|█████▉    | 16/27 [03:27<02:40, 14.62s/it]

 - Generating embeddings for joestump/python-oauth2 - 


 63%|██████▎   | 17/27 [03:29<01:49, 10.92s/it]

 - Generating embeddings for omab/python-social-auth - 


 67%|██████▋   | 18/27 [03:34<01:21,  9.06s/it]

 - Generating embeddings for paramiko/paramiko - 


 70%|███████   | 19/27 [03:45<01:17,  9.72s/it]

 - Generating embeddings for pyca/pynacl - 


 74%|███████▍  | 20/27 [03:51<01:00,  8.62s/it]

 - Generating embeddings for jindaxiang/akshare - 


 78%|███████▊  | 21/27 [04:24<01:35, 15.88s/it]

 - Generating embeddings for s3tools/s3cmd - 


 85%|████████▌ | 23/27 [04:26<00:33,  8.30s/it]

 - Generating embeddings for bloomreach/s4cmd - 
 - Generating embeddings for euske/pdfminer - 


 89%|████████▉ | 24/27 [04:26<00:17,  5.85s/it]

 - Generating embeddings for mstamy2/PyPDF2 - 


 93%|█████████▎| 25/27 [04:36<00:14,  7.06s/it]

 - Generating embeddings for lepture/mistune - 


 96%|█████████▋| 26/27 [04:41<00:06,  6.28s/it]

 - Generating embeddings for waylan/Python-Markdown - 


100%|██████████| 27/27 [04:54<00:00, 10.90s/it]


### Evaluations & Results

In [11]:
from torch.nn import CosineSimilarity
from itertools import combinations

cossim = CosineSimilarity(dim=0, eps=1e-8)


def _pair_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python number.

    Returns None when either embedding is None (the repository had nothing
    to embed for that input type).
    """
    if embedding1 is None or embedding2 is None:
        return None
    return cossim(embedding1, embedding2).item()


# One row per unordered pair of repositories.
res = []
for repo1, repo2 in combinations(REPOS, 2):
    info1, info2 = repo_info[repo1], repo_info[repo2]

    code_similarity = _pair_similarity(info1["code_embeddings"], info2["code_embeddings"])
    doc_similarity = _pair_similarity(info1["doc_embeddings"], info2["doc_embeddings"])

    res.append((repo1, repo2, info1["topic"], info2["topic"], code_similarity, doc_similarity))

In [12]:
import pandas as pd

df = pd.DataFrame(
    res,
    columns=["repo1", "repo2", "topic1", "topic2", "code_sim", "doc_sim"],
).assign(
    # Row-wise mean of the two similarities; a NaN (repo without code or
    # docstrings) is skipped, so the average falls back to the other value.
    avg_sim=lambda pairs: pairs[["code_sim", "doc_sim"]].mean(axis=1, skipna=True)
)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,keon/algorithms,prabhupant/python-ds,Algorithms,Algorithms,0.778547,0.879596,0.829071
1,keon/algorithms,grantjenks/python-sortedcontainers,Algorithms,Algorithms,0.727102,0.867281,0.797192
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.895993,0.900286,0.898139
3,keon/algorithms,beetbox/audioread,Algorithms,Audio,0.008753,0.471483,0.240118
4,keon/algorithms,worldveil/dejavu,Algorithms,Audio,0.176522,0.522964,0.349743
...,...,...,...,...,...,...,...
346,euske/pdfminer,lepture/mistune,PDF,Markdown,0.294981,0.450331,0.372656
347,euske/pdfminer,waylan/Python-Markdown,PDF,Markdown,0.305095,0.414341,0.359718
348,mstamy2/PyPDF2,lepture/mistune,PDF,Markdown,0.333942,0.780140,0.557041
349,mstamy2/PyPDF2,waylan/Python-Markdown,PDF,Markdown,0.532385,0.814950,0.673668


In [13]:
# Rank repository pairs by code semantic similarity, highest first
df.sort_values(by="code_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936125,0.959024,0.947574
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.922530,0.932178,0.927354
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.895993,0.900286,0.898139
3,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.891102,0.878009,0.884556
4,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879883,0.919387,0.899635
...,...,...,...,...,...,...,...
346,sergree/matchering,pennersr/django-allauth,Audio,OAuth,-0.127733,,-0.127733
347,tyiannak/pyAudioAnalysis,pennersr/django-allauth,Audio,OAuth,-0.127950,0.253006,0.062528
348,jindaxiang/akshare,s3tools/s3cmd,Downloader,Downloader,-0.128894,0.235784,0.053445
349,librosa/librosa,pennersr/django-allauth,Audio,OAuth,-0.134879,0.254131,0.059626


In [14]:
# Rank repository pairs by docstring semantic similarity, highest first
df.sort_values(by="doc_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936125,0.959024,0.947574
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.922530,0.932178,0.927354
2,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879883,0.919387,0.899635
3,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.895993,0.900286,0.898139
4,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.834853,0.899069,0.866961
...,...,...,...,...,...,...,...
346,sergree/matchering,bloomreach/s4cmd,Audio,Downloader,0.003936,,0.003936
347,sergree/matchering,euske/pdfminer,Audio,PDF,0.147579,,0.147579
348,sergree/matchering,mstamy2/PyPDF2,Audio,PDF,0.180949,,0.180949
349,sergree/matchering,lepture/mistune,Audio,Markdown,0.022897,,0.022897


In [15]:
# Rank repository pairs by average similarity (highest first) and write the
# ranked table to eval_res.csv without the index column
df = df.sort_values(by="avg_sim", ascending=False, ignore_index=True)
df.to_csv("eval_res.csv", index=False)

df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936125,0.959024,0.947574
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.922530,0.932178,0.927354
2,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879883,0.919387,0.899635
3,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.895993,0.900286,0.898139
4,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.891102,0.878009,0.884556
...,...,...,...,...,...,...,...
346,sergree/matchering,lepture/authlib,Audio,OAuth,-0.036973,,-0.036973
347,sergree/matchering,omab/python-social-auth,Audio,OAuth,-0.047977,,-0.047977
348,sergree/matchering,evonove/django-oauth-toolkit,Audio,OAuth,-0.061241,,-0.061241
349,sergree/matchering,s3tools/s3cmd,Audio,Downloader,-0.061914,,-0.061914
