# Embedding similarity ---- all embeddings

## 1. Install inspect4py

In [71]:
!pip3 install inspect4py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable


In [72]:
!inspect4py --version

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
inspect4py, version 0.0.8


## 2. Download dataset and use inspect4py on them

In [73]:
REPOS = {
    'keon/algorithms': 'Algorithms',
    'prabhupant/python-ds': 'Algorithms',
    'grantjenks/python-sortedcontainers': 'Algorithms',
    'TheAlgorithms/Python': 'Algorithms',
    'beetbox/audioread': 'Audio',
    'worldveil/dejavu': 'Audio',
    'keunwoochoi/kapre': 'Audio',
    'librosa/librosa': 'Audio',
    'sergree/matchering': 'Audio',
    'tyiannak/pyAudioAnalysis': 'Audio',
    'jiaaro/pydub': 'Audio',
    'Parisson/TimeSide': 'Audio',
    'lepture/authlib': 'OAuth',
    'pennersr/django-allauth': 'OAuth',
    'evonove/django-oauth-toolkit': 'OAuth',
    'idan/oauthlib': 'OAuth',
    'joestump/python-oauth2': 'OAuth',
    'omab/python-social-auth': 'OAuth',
    'paramiko/paramiko': 'Cryptography',
    'pyca/pynacl': 'Cryptography',
    'jindaxiang/akshare': 'Downloader',
    's3tools/s3cmd': 'Downloader',
    'bloomreach/s4cmd': 'Downloader',
    'euske/pdfminer': 'PDF',
    'mstamy2/PyPDF2': 'PDF',
    'lepture/mistune': 'Markdown',
    'waylan/Python-Markdown': 'Markdown'
}

In [74]:
!mkdir -p output
for repo in REPOS:
    !mkdir -p {repo} & git clone {f"https://github.com/{repo}.git"} {repo}
    !inspect4py -i {repo} -o output/ {repo} -r -dt -sc -rm -md

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
fatal: destination path 'keon/algorithms' already exists and is not an empty directory.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Error when processing invert_tree.py:  (<class 'AttributeError

## 3. Extract docstrings, codes, requirements, structure, readme, and metadata from repositories

In [75]:
# List all codes or docs
def list_codes_docs(funcs, codes_list, docs_list):
    for func_name, func_info in funcs.items():
        if func_info.get("source_code") is not None:
            codes_list.append(func_info["source_code"])
        if func_info.get("doc") is None:
            continue
        for key in ["full", "long_description", "short_description"]:
            if func_info["doc"].get(key) is not None:
                docs_list.append(f"{func_name} {func_info['doc'].get(key)}")
                break


# Extract codes and codes documents
def extract_codes_docs(filepath):
    codes_list = []
    docs_list = []
    with open(filepath, "r") as f:
        json_info = json.load(f)
    for element in ["requirements", "directory_tree", "readme_files", "metadata"]:
        json_info.pop(element, None)

    for name, files in json_info.items():
        for file in files:
            if file.get("functions") is not None:
                list_codes_docs(file["functions"], codes_list, docs_list)
            if file.get("classes") is not None:
                for class_name, class_info in file["classes"].items():
                    if class_info.get("methods") is not None:
                        list_codes_docs(class_info["methods"], codes_list, docs_list)

    return codes_list, docs_list


# Extract other content
def extract_other_content(filepath, element):
    element_list = []
    with open(filepath, "r") as f:
        json_info = json.load(f)
    if json_info.get(element) is not None:
        for key, value in json_info.get(element).items():
            element_list.append(value)
    return element_list

In [76]:
import json

repo_info = {}
for repo, topic in REPOS.items():
    repo_info[repo] = {}
    codes_list, docs_list = extract_codes_docs(f"output/{repo}/directory_info.json")
    repo_info[repo]["docs"] = docs_list
    repo_info[repo]["codes"] = codes_list
    repo_info[repo]["requirements"] = extract_other_content(f"output/{repo}/directory_info.json", "requirements")
    repo_info[repo]["structure"] = extract_other_content(f"output/{repo}/directory_info.json", "directory_tree")
    repo_info[repo]["readme"] = extract_other_content(f"output/{repo}/directory_info.json", "readme_files")
    repo_info[repo]["metadata"] = extract_other_content(f"output/{repo}/directory_info.json", "metadata")
    repo_info[repo]["topic"] = topic

## 4. Download pre-trained model

In [77]:
!pip3 install sentence-transformers
!pip3 install transformers
!curl -o unixcoder.py https: // raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | 

In [78]:
import torch
from unixcoder import UniXcoder
from sentence_transformers import SentenceTransformer

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(device)

mps


In [79]:
doc_model = SentenceTransformer("all-mpnet-base-v2", device=device)
code_model = UniXcoder("Lazyhope/unixcoder-nine-advtest")
code_model.to(device)

UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

## 5. Generating embedding

In [80]:
def get_code_embeddings(code):
    tokens_ids = code_model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    _, embeddings = code_model(source_ids)
    return embeddings


def get_repo_embeddings(list, input_type):
    if list is None or not list:
        return torch.zeros((768,), device=device)
    with torch.no_grad():
        if input_type == "code":
            embeddings_list = torch.concat([get_code_embeddings(code) for code in list])
        elif input_type == "doc":
            embeddings_list = doc_model.encode(list, convert_to_tensor=True)

        mean_embeddings = torch.mean(embeddings_list, dim=0)

    return mean_embeddings

In [81]:
from tqdm import tqdm

# Generate embeddings for all repositories
for repo_name, repo_dict in tqdm(repo_info.items()):
    print(f" - Generating embeddings for {repo_name} - ")
    if repo_dict.get("code_embeddings") is None:
        repo_dict["code_embeddings"] = get_repo_embeddings(repo_dict["codes"], input_type="code")
    if repo_dict.get("doc_embeddings") is None:
        repo_dict["doc_embeddings"] = get_repo_embeddings(repo_dict["docs"], input_type="doc")
    if repo_dict.get("requirement_embeddings") is None:
        repo_dict["requirement_embeddings"] = get_repo_embeddings(repo_dict["requirements"], input_type="doc")
    if repo_dict.get("structure_embeddings") is None:
        repo_dict["structure_embeddings"] = get_repo_embeddings(repo_dict["structure"], input_type="doc")
    if repo_dict.get("readme_embeddings") is None:
        repo_dict["readme_embeddings"] = get_repo_embeddings(repo_dict["readme"], input_type="doc")

  0%|          | 0/27 [00:00<?, ?it/s]

 - Generating embeddings for keon/algorithms - 


  4%|▎         | 1/27 [00:42<18:35, 42.90s/it]

 - Generating embeddings for prabhupant/python-ds - 


  7%|▋         | 2/27 [00:57<10:51, 26.05s/it]

 - Generating embeddings for grantjenks/python-sortedcontainers - 


 11%|█         | 3/27 [01:10<08:05, 20.25s/it]

 - Generating embeddings for TheAlgorithms/Python - 


 15%|█▍        | 4/27 [04:15<32:43, 85.36s/it]

 - Generating embeddings for beetbox/audioread - 


 19%|█▊        | 5/27 [04:18<20:23, 55.60s/it]

 - Generating embeddings for worldveil/dejavu - 


 22%|██▏       | 6/27 [04:23<13:26, 38.42s/it]

 - Generating embeddings for keunwoochoi/kapre - 


 26%|██▌       | 7/27 [04:31<09:26, 28.34s/it]

 - Generating embeddings for librosa/librosa - 


 30%|██▉       | 8/27 [05:23<11:23, 35.98s/it]

 - Generating embeddings for sergree/matchering - 


 33%|███▎      | 9/27 [05:27<07:45, 25.87s/it]

 - Generating embeddings for tyiannak/pyAudioAnalysis - 


 37%|███▋      | 10/27 [05:34<05:42, 20.12s/it]

 - Generating embeddings for jiaaro/pydub - 


 41%|████      | 11/27 [05:44<04:31, 16.98s/it]

 - Generating embeddings for Parisson/TimeSide - 


 44%|████▍     | 12/27 [06:20<05:41, 22.76s/it]

 - Generating embeddings for lepture/authlib - 


 48%|████▊     | 13/27 [07:26<08:23, 35.95s/it]

 - Generating embeddings for pennersr/django-allauth - 


 52%|█████▏    | 14/27 [08:13<08:30, 39.28s/it]

 - Generating embeddings for evonove/django-oauth-toolkit - 


 56%|█████▌    | 15/27 [08:44<07:20, 36.75s/it]

 - Generating embeddings for idan/oauthlib - 


 59%|█████▉    | 16/27 [09:29<07:10, 39.18s/it]

 - Generating embeddings for joestump/python-oauth2 - 


 63%|██████▎   | 17/27 [09:36<04:56, 29.64s/it]

 - Generating embeddings for omab/python-social-auth - 


 67%|██████▋   | 18/27 [09:48<03:38, 24.29s/it]

 - Generating embeddings for paramiko/paramiko - 


 70%|███████   | 19/27 [10:14<03:19, 24.93s/it]

 - Generating embeddings for pyca/pynacl - 


 74%|███████▍  | 20/27 [10:35<02:45, 23.65s/it]

 - Generating embeddings for jindaxiang/akshare - 


 78%|███████▊  | 21/27 [12:23<04:54, 49.08s/it]

 - Generating embeddings for s3tools/s3cmd - 


 81%|████████▏ | 22/27 [12:31<03:02, 36.51s/it]

 - Generating embeddings for bloomreach/s4cmd - 


 85%|████████▌ | 23/27 [12:31<01:42, 25.65s/it]

 - Generating embeddings for euske/pdfminer - 


 89%|████████▉ | 24/27 [12:32<00:54, 18.19s/it]

 - Generating embeddings for mstamy2/PyPDF2 - 


 93%|█████████▎| 25/27 [13:04<00:44, 22.29s/it]

 - Generating embeddings for lepture/mistune - 


 96%|█████████▋| 26/27 [13:13<00:18, 18.36s/it]

 - Generating embeddings for waylan/Python-Markdown - 


100%|██████████| 27/27 [13:47<00:00, 30.65s/it]


## 6. Similarity calculation

In [83]:
from torch.nn import CosineSimilarity
from itertools import combinations

cossim = CosineSimilarity(dim=0, eps=1e-8)
res = []
num_of_repos = len(repo_info)
num_of_rows = num_of_repos * (num_of_repos - 1) // 2
for repo1, repo2 in tqdm(combinations(repo_info.keys(), 2), total=num_of_rows):
    code_embeddings1 = repo_info[repo1]["code_embeddings"]
    code_embeddings2 = repo_info[repo2]["code_embeddings"]
    code_similarity = cossim(code_embeddings1, code_embeddings2).cpu().detach().numpy().item()

    doc_embeddings1 = repo_info[repo1]["doc_embeddings"]
    doc_embeddings2 = repo_info[repo2]["doc_embeddings"]
    doc_similarity = cossim(doc_embeddings1, doc_embeddings2).cpu().detach().numpy().item()

    requirement_embeddings1 = repo_info[repo1]["requirement_embeddings"]
    requirement_embeddings2 = repo_info[repo2]["requirement_embeddings"]
    requirement_similarity = cossim(requirement_embeddings1, requirement_embeddings2).cpu().detach().numpy().item()

    structure_embeddings1 = repo_info[repo1]["structure_embeddings"]
    structure_embeddings2 = repo_info[repo2]["structure_embeddings"]
    structure_similarity = cossim(structure_embeddings1, structure_embeddings2).cpu().detach().numpy().item()

    readme_embeddings1 = repo_info[repo1]["readme_embeddings"]
    readme_embeddings2 = repo_info[repo2]["readme_embeddings"]
    readme_similarity = cossim(readme_embeddings1, readme_embeddings2).cpu().detach().numpy().item()

    topic1 = repo_info[repo1]["topic"]
    topic2 = repo_info[repo2]["topic"]

    res.append((repo1, repo2, topic1, topic2, code_similarity, doc_similarity, requirement_similarity,
                structure_similarity, readme_similarity))

100%|██████████| 351/351 [00:01<00:00, 255.86it/s]


In [84]:
import pandas as pd

df = pd.DataFrame(res, columns=["repo1", "repo2", "topic1", "topic2", "code_sim", "doc_sim", "requirement_sim",
                                "structure_sim", "readme_sim"])
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,keon/algorithms,prabhupant/python-ds,Algorithms,Algorithms,0.776967,0.822550,0.000000,0.474299,0.709225
1,keon/algorithms,grantjenks/python-sortedcontainers,Algorithms,Algorithms,0.727241,0.707484,0.838826,0.209868,0.453314
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897145,0.845873,0.880533,0.415860,0.517053
3,keon/algorithms,beetbox/audioread,Algorithms,Audio,0.009967,0.279662,0.706504,0.474299,0.256359
4,keon/algorithms,worldveil/dejavu,Algorithms,Audio,0.176932,0.410920,0.929586,0.209868,0.216086
...,...,...,...,...,...,...,...,...,...
346,euske/pdfminer,lepture/mistune,PDF,Markdown,0.298307,0.251666,0.919282,1.000000,0.380539
347,euske/pdfminer,waylan/Python-Markdown,PDF,Markdown,0.305356,0.266002,0.908167,0.394829,0.378558
348,mstamy2/PyPDF2,lepture/mistune,PDF,Markdown,0.329024,0.430963,0.887526,0.417669,0.317927
349,mstamy2/PyPDF2,waylan/Python-Markdown,PDF,Markdown,0.524874,0.610779,0.804444,0.463661,0.396914


In [85]:
df.sort_values("code_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937531,0.952218,0.885930,0.385717,0.769324
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.919488,0.871041,0.880123,1.000000,0.673768
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897145,0.845873,0.880533,0.415860,0.517053
3,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.889833,0.813740,0.986769,0.385717,0.731658
4,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879978,0.884434,0.893791,0.415860,0.849909
...,...,...,...,...,...,...,...,...,...
346,sergree/matchering,pennersr/django-allauth,Audio,OAuth,-0.127382,0.000000,0.886638,0.327863,0.431417
347,tyiannak/pyAudioAnalysis,pennersr/django-allauth,Audio,OAuth,-0.127944,0.133521,0.936459,0.415962,0.185671
348,jindaxiang/akshare,s3tools/s3cmd,Downloader,Downloader,-0.128285,0.174601,0.868814,0.415962,0.262691
349,librosa/librosa,pennersr/django-allauth,Audio,OAuth,-0.134153,0.117186,0.952453,0.415962,0.307107


In [86]:
df.sort_values("doc_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937531,0.952218,0.885930,0.385717,0.769324
1,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879978,0.884434,0.893791,0.415860,0.849909
2,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.919488,0.871041,0.880123,1.000000,0.673768
3,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.837781,0.849447,0.909247,0.352853,0.758986
4,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897145,0.845873,0.880533,0.415860,0.517053
...,...,...,...,...,...,...,...,...,...
346,keunwoochoi/kapre,sergree/matchering,Audio,Audio,0.665866,0.000000,0.957774,0.406075,0.098606
347,worldveil/dejavu,sergree/matchering,Audio,Audio,0.239733,0.000000,0.973077,0.225204,0.254849
348,keon/algorithms,sergree/matchering,Algorithms,Audio,0.385051,0.000000,0.909003,0.329200,0.416653
349,beetbox/audioread,sergree/matchering,Audio,Audio,0.304321,0.000000,0.756959,0.283867,0.109871


In [87]:
df.sort_values("requirement_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,librosa/librosa,Parisson/TimeSide,Audio,Audio,0.348526,0.487567,0.994391,0.415860,0.464888
1,worldveil/dejavu,tyiannak/pyAudioAnalysis,Audio,Audio,0.398418,0.500775,0.987630,0.358336,0.554342
2,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.889833,0.813740,0.986769,0.385717,0.731658
3,TheAlgorithms/Python,evonove/django-oauth-toolkit,Algorithms,OAuth,0.039334,0.181022,0.983084,1.000000,0.172390
4,keon/algorithms,Parisson/TimeSide,Algorithms,Audio,0.056477,0.366021,0.981612,1.000000,0.428975
...,...,...,...,...,...,...,...,...,...
346,prabhupant/python-ds,idan/oauthlib,Algorithms,OAuth,-0.012275,0.179171,0.000000,0.394829,0.339108
347,prabhupant/python-ds,evonove/django-oauth-toolkit,Algorithms,OAuth,-0.061155,0.105172,0.000000,0.394829,0.190244
348,prabhupant/python-ds,pennersr/django-allauth,Algorithms,OAuth,-0.116639,0.170995,0.000000,0.473030,0.188903
349,prabhupant/python-ds,lepture/authlib,Algorithms,OAuth,-0.034557,0.188919,0.000000,0.307355,0.304759


In [88]:
df.sort_values("structure_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,Parisson/TimeSide,joestump/python-oauth2,Audio,OAuth,0.226742,0.437810,0.960195,1.000000,0.233556
1,keon/algorithms,joestump/python-oauth2,Algorithms,OAuth,0.222778,0.305454,0.952184,1.000000,0.398186
2,beetbox/audioread,lepture/mistune,Audio,Markdown,0.068330,0.267206,0.800928,1.000000,0.147910
3,prabhupant/python-ds,bloomreach/s4cmd,Algorithms,Downloader,0.124566,0.253667,0.000000,1.000000,0.136857
4,jiaaro/pydub,lepture/mistune,Audio,Markdown,0.096278,0.164577,0.919316,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...
346,grantjenks/python-sortedcontainers,mstamy2/PyPDF2,Algorithms,PDF,0.341461,0.323438,0.952418,0.146309,0.177409
347,worldveil/dejavu,paramiko/paramiko,Audio,Cryptography,0.207150,0.475452,0.955411,0.146309,0.071113
348,worldveil/dejavu,mstamy2/PyPDF2,Audio,PDF,0.149832,0.460356,0.874601,0.146309,0.092482
349,grantjenks/python-sortedcontainers,keunwoochoi/kapre,Algorithms,Audio,0.127882,0.349354,0.950976,0.106568,0.060177


In [89]:
df.sort_values("readme_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.879978,0.884434,0.893791,0.415860,0.849909
1,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937531,0.952218,0.885930,0.385717,0.769324
2,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.837781,0.849447,0.909247,0.352853,0.758986
3,pennersr/django-allauth,evonove/django-oauth-toolkit,OAuth,OAuth,0.678230,0.735093,0.953953,0.415962,0.749154
4,evonove/django-oauth-toolkit,joestump/python-oauth2,OAuth,OAuth,0.786111,0.781988,0.905497,0.415860,0.748130
...,...,...,...,...,...,...,...,...,...
346,prabhupant/python-ds,jiaaro/pydub,Algorithms,Audio,0.166515,0.268667,0.000000,1.000000,0.000000
347,keunwoochoi/kapre,jiaaro/pydub,Audio,Audio,0.606200,0.568462,0.966739,0.325340,0.000000
348,grantjenks/python-sortedcontainers,jiaaro/pydub,Algorithms,Audio,0.360218,0.240564,0.943266,0.158174,0.000000
349,keon/algorithms,jiaaro/pydub,Algorithms,Audio,0.441632,0.289792,0.892854,0.474299,0.000000


## 7. Embedding concatenation

In [91]:
for repo_name, repo_dict in tqdm(repo_info.items()):
    repo_dict["embeddings"] = torch.concat([
        repo_dict["code_embeddings"],
        repo_dict["doc_embeddings"],
        repo_dict["requirement_embeddings"],
        repo_dict["structure_embeddings"],
        repo_dict["readme_embeddings"]
    ], dim=0)

100%|██████████| 27/27 [00:00<00:00, 8127.91it/s]


In [158]:
X = torch.concat([
    repo_dict["embeddings"] for _, repo_dict in repo_info.items()
], dim=0).reshape(-1, 3840)
X.shape

torch.Size([27, 3840])

In [159]:
label = [repo_dict["topic"] for _, repo_dict in repo_info.items()]
label

['Algorithms',
 'Algorithms',
 'Algorithms',
 'Algorithms',
 'Audio',
 'Audio',
 'Audio',
 'Audio',
 'Audio',
 'Audio',
 'Audio',
 'Audio',
 'OAuth',
 'OAuth',
 'OAuth',
 'OAuth',
 'OAuth',
 'OAuth',
 'Cryptography',
 'Cryptography',
 'Downloader',
 'Downloader',
 'Downloader',
 'PDF',
 'PDF',
 'Markdown',
 'Markdown']

In [160]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_ = label_encoder.fit_transform(label)
y = torch.tensor(label_, device=device)
y.shape

torch.Size([27])

## 8. Build Network

In [161]:
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [162]:
class EmbeddingDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [163]:
dataset = EmbeddingDataset(X, y)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [182]:
class EmbeddingNet(nn.Module):
    def __init__(self, input_dimension, output_dimension):
        super().__init__()
        self.network = nn.Linear(input_dimension, output_dimension)

    def forward(self, input):
        predict_probabilities = self.network(input)
        return predict_probabilities

In [183]:
# Initialize the model and optimizer
model = EmbeddingNet(X.shape[1], len(y.unique()))
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [184]:
def train():
    for epoch in range(100):
        for i, (features, labels) in enumerate(dataloader):
            # Forward pass
            outputs = model(features)
            # Compute loss
            loss = F.cross_entropy(outputs, labels)
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

In [185]:
train()

Epoch 1, Loss: 3.9668421745300293
Epoch 2, Loss: 0.2852829396724701
Epoch 3, Loss: 0.06449320167303085
Epoch 4, Loss: 0.045005735009908676
Epoch 5, Loss: 0.03360104560852051
Epoch 6, Loss: 0.02606319636106491
Epoch 7, Loss: 0.02097473107278347
Epoch 8, Loss: 0.017359577119350433
Epoch 9, Loss: 0.014676582999527454
Epoch 10, Loss: 0.012617639265954494
Epoch 11, Loss: 0.010995393618941307
Epoch 12, Loss: 0.00969003327190876
Epoch 13, Loss: 0.008620667271316051
Epoch 14, Loss: 0.007731631398200989
Epoch 15, Loss: 0.006982923019677401
Epoch 16, Loss: 0.006345126312226057
Epoch 17, Loss: 0.005796527955681086
Epoch 18, Loss: 0.0053206742741167545
Epoch 19, Loss: 0.0049047390930354595
Epoch 20, Loss: 0.0045387111604213715
Epoch 21, Loss: 0.004214572254568338
Epoch 22, Loss: 0.003925714176148176
Epoch 23, Loss: 0.0036671787966042757
Epoch 24, Loss: 0.003434831975027919
Epoch 25, Loss: 0.003224770538508892
Epoch 26, Loss: 0.0030343940015882254
Epoch 27, Loss: 0.002861217362806201
Epoch 28, Loss

In [195]:
params = {}
for name, parameter in model.named_parameters():
    params[name] = parameter
weight = params["network.weight"]
bias = params["network.bias"]
print(weight.shape)

torch.Size([7, 3840])


In [197]:
actual_label = [y[i].cpu().detach().numpy().item() for i in range(X.shape[0])]
actual_label

[0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 2,
 3,
 3,
 3,
 6,
 6,
 4,
 4]

In [198]:
for i, (repo_name, repo_dict) in enumerate(repo_info.items()):
    repo_dict["embeddings"] = repo_dict["embeddings"] * weight[actual_label[i]] + bias[actual_label[i]]
    print(repo_dict["embeddings"].shape)

torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])
torch.Size([3840])


## 9. Repository similarity

In [199]:
cossim = CosineSimilarity(dim=0, eps=1e-8)
res = []
num_of_repos = len(repo_info)
num_of_rows = num_of_repos * (num_of_repos - 1) // 2
for repo1, repo2 in tqdm(combinations(repo_info.keys(), 2), total=num_of_rows):
    code_embeddings1 = repo_info[repo1]["code_embeddings"]
    code_embeddings2 = repo_info[repo2]["code_embeddings"]
    code_similarity = cossim(code_embeddings1, code_embeddings2).cpu().detach().numpy().item()

    doc_embeddings1 = repo_info[repo1]["doc_embeddings"]
    doc_embeddings2 = repo_info[repo2]["doc_embeddings"]
    doc_similarity = cossim(doc_embeddings1, doc_embeddings2).cpu().detach().numpy().item()

    requirement_embeddings1 = repo_info[repo1]["requirement_embeddings"]
    requirement_embeddings2 = repo_info[repo2]["requirement_embeddings"]
    requirement_similarity = cossim(requirement_embeddings1, requirement_embeddings2).cpu().detach().numpy().item()

    structure_embeddings1 = repo_info[repo1]["structure_embeddings"]
    structure_embeddings2 = repo_info[repo2]["structure_embeddings"]
    structure_similarity = cossim(structure_embeddings1, structure_embeddings2).cpu().detach().numpy().item()

    readme_embeddings1 = repo_info[repo1]["readme_embeddings"]
    readme_embeddings2 = repo_info[repo2]["readme_embeddings"]
    readme_similarity = cossim(readme_embeddings1, readme_embeddings2).cpu().detach().numpy().item()

    reposiroty_embeddings1 = repo_info[repo1]["embeddings"]
    reposiroty_embeddings2 = repo_info[repo2]["embeddings"]
    repository_similarity = cossim(reposiroty_embeddings1, reposiroty_embeddings2).cpu().detach().numpy().item()

    topic1 = repo_info[repo1]["topic"]
    topic2 = repo_info[repo2]["topic"]

    res.append(
        (repo1, repo2, topic1, topic2, repository_similarity, code_similarity, doc_similarity, requirement_similarity,
         structure_similarity, readme_similarity))

100%|██████████| 351/351 [00:02<00:00, 174.12it/s]


In [200]:
df = pd.DataFrame(res,
                  columns=["repo1", "repo2", "topic1", "topic2", "repo_sim", "code_sim", "doc_sim", "requirement_sim",
                           "structure_sim", "readme_sim"])
df

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,keon/algorithms,prabhupant/python-ds,Algorithms,Algorithms,0.949539,0.776967,0.822550,0.000000,0.474299,0.709225
1,keon/algorithms,grantjenks/python-sortedcontainers,Algorithms,Algorithms,0.925913,0.727241,0.707484,0.838826,0.209868,0.453314
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.980498,0.897145,0.845873,0.880533,0.415860,0.517053
3,keon/algorithms,beetbox/audioread,Algorithms,Audio,-0.673970,0.009967,0.279662,0.706504,0.474299,0.256359
4,keon/algorithms,worldveil/dejavu,Algorithms,Audio,-0.744078,0.176932,0.410920,0.929586,0.209868,0.216086
...,...,...,...,...,...,...,...,...,...,...
346,euske/pdfminer,lepture/mistune,PDF,Markdown,0.889041,0.298307,0.251666,0.919282,1.000000,0.380539
347,euske/pdfminer,waylan/Python-Markdown,PDF,Markdown,0.889269,0.305356,0.266002,0.908167,0.394829,0.378558
348,mstamy2/PyPDF2,lepture/mistune,PDF,Markdown,0.909671,0.329024,0.430963,0.887526,0.417669,0.317927
349,mstamy2/PyPDF2,waylan/Python-Markdown,PDF,Markdown,0.911741,0.524874,0.610779,0.804444,0.463661,0.396914


In [201]:
df.sort_values("repo_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.980498,0.897145,0.845873,0.880533,0.415860,0.517053
1,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.973772,0.937531,0.952218,0.885930,0.385717,0.769324
2,lepture/mistune,waylan/Python-Markdown,Markdown,Markdown,0.962252,0.811350,0.664221,0.845524,0.394829,0.508314
3,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.960485,0.919488,0.871041,0.880123,1.000000,0.673768
4,euske/pdfminer,mstamy2/PyPDF2,PDF,PDF,0.956583,0.444577,0.491483,0.898296,0.417669,0.539181
...,...,...,...,...,...,...,...,...,...,...
346,worldveil/dejavu,euske/pdfminer,Audio,PDF,-0.814192,0.173007,0.224390,0.910549,0.158174,0.151996
347,Parisson/TimeSide,euske/pdfminer,Audio,PDF,-0.822681,0.139196,0.332820,0.928658,0.474299,0.204960
348,sergree/matchering,mstamy2/PyPDF2,Audio,PDF,-0.826250,0.192997,0.000000,0.853674,0.363430,0.347914
349,worldveil/dejavu,mstamy2/PyPDF2,Audio,PDF,-0.841080,0.149832,0.460356,0.874601,0.146309,0.092482


In [202]:
df.sort_values("code_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.973772,0.937531,0.952218,0.885930,0.385717,0.769324
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.960485,0.919488,0.871041,0.880123,1.000000,0.673768
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.980498,0.897145,0.845873,0.880533,0.415860,0.517053
3,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.950198,0.889833,0.813740,0.986769,0.385717,0.731658
4,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.950584,0.879978,0.884434,0.893791,0.415860,0.849909
...,...,...,...,...,...,...,...,...,...,...
346,sergree/matchering,pennersr/django-allauth,Audio,OAuth,-0.647265,-0.127382,0.000000,0.886638,0.327863,0.431417
347,tyiannak/pyAudioAnalysis,pennersr/django-allauth,Audio,OAuth,-0.577514,-0.127944,0.133521,0.936459,0.415962,0.185671
348,jindaxiang/akshare,s3tools/s3cmd,Downloader,Downloader,0.159707,-0.128285,0.174601,0.868814,0.415962,0.262691
349,librosa/librosa,pennersr/django-allauth,Audio,OAuth,-0.604690,-0.134153,0.117186,0.952453,0.415962,0.307107


In [203]:
df.sort_values("doc_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.973772,0.937531,0.952218,0.885930,0.385717,0.769324
1,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.950584,0.879978,0.884434,0.893791,0.415860,0.849909
2,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.960485,0.919488,0.871041,0.880123,1.000000,0.673768
3,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.941901,0.837781,0.849447,0.909247,0.352853,0.758986
4,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.980498,0.897145,0.845873,0.880533,0.415860,0.517053
...,...,...,...,...,...,...,...,...,...,...
346,keunwoochoi/kapre,sergree/matchering,Audio,Audio,0.899755,0.665866,0.000000,0.957774,0.406075,0.098606
347,worldveil/dejavu,sergree/matchering,Audio,Audio,0.844625,0.239733,0.000000,0.973077,0.225204,0.254849
348,keon/algorithms,sergree/matchering,Algorithms,Audio,-0.757071,0.385051,0.000000,0.909003,0.329200,0.416653
349,beetbox/audioread,sergree/matchering,Audio,Audio,0.814225,0.304321,0.000000,0.756959,0.283867,0.109871


In [204]:
df.sort_values("requirement_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,librosa/librosa,Parisson/TimeSide,Audio,Audio,0.837629,0.348526,0.487567,0.994391,0.415860,0.464888
1,worldveil/dejavu,tyiannak/pyAudioAnalysis,Audio,Audio,0.839450,0.398418,0.500775,0.987630,0.358336,0.554342
2,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.950198,0.889833,0.813740,0.986769,0.385717,0.731658
3,TheAlgorithms/Python,evonove/django-oauth-toolkit,Algorithms,OAuth,0.793745,0.039334,0.181022,0.983084,1.000000,0.172390
4,keon/algorithms,Parisson/TimeSide,Algorithms,Audio,-0.745338,0.056477,0.366021,0.981612,1.000000,0.428975
...,...,...,...,...,...,...,...,...,...,...
346,prabhupant/python-ds,idan/oauthlib,Algorithms,OAuth,0.776274,-0.012275,0.179171,0.000000,0.394829,0.339108
347,prabhupant/python-ds,evonove/django-oauth-toolkit,Algorithms,OAuth,0.766224,-0.061155,0.105172,0.000000,0.394829,0.190244
348,prabhupant/python-ds,pennersr/django-allauth,Algorithms,OAuth,0.770293,-0.116639,0.170995,0.000000,0.473030,0.188903
349,prabhupant/python-ds,lepture/authlib,Algorithms,OAuth,0.804491,-0.034557,0.188919,0.000000,0.307355,0.304759


In [205]:
df.sort_values("structure_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,Parisson/TimeSide,joestump/python-oauth2,Audio,OAuth,-0.713969,0.226742,0.437810,0.960195,1.000000,0.233556
1,keon/algorithms,joestump/python-oauth2,Algorithms,OAuth,0.820574,0.222778,0.305454,0.952184,1.000000,0.398186
2,beetbox/audioread,lepture/mistune,Audio,Markdown,-0.707495,0.068330,0.267206,0.800928,1.000000,0.147910
3,prabhupant/python-ds,bloomreach/s4cmd,Algorithms,Downloader,-0.172694,0.124566,0.253667,0.000000,1.000000,0.136857
4,jiaaro/pydub,lepture/mistune,Audio,Markdown,-0.688591,0.096278,0.164577,0.919316,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
346,grantjenks/python-sortedcontainers,mstamy2/PyPDF2,Algorithms,PDF,0.837679,0.341461,0.323438,0.952418,0.146309,0.177409
347,worldveil/dejavu,paramiko/paramiko,Audio,Cryptography,-0.790835,0.207150,0.475452,0.955411,0.146309,0.071113
348,worldveil/dejavu,mstamy2/PyPDF2,Audio,PDF,-0.841080,0.149832,0.460356,0.874601,0.146309,0.092482
349,grantjenks/python-sortedcontainers,keunwoochoi/kapre,Algorithms,Audio,-0.639729,0.127882,0.349354,0.950976,0.106568,0.060177


In [206]:
df.sort_values("readme_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,repo_sim,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.950584,0.879978,0.884434,0.893791,0.415860,0.849909
1,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.973772,0.937531,0.952218,0.885930,0.385717,0.769324
2,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.941901,0.837781,0.849447,0.909247,0.352853,0.758986
3,pennersr/django-allauth,evonove/django-oauth-toolkit,OAuth,OAuth,0.897070,0.678230,0.735093,0.953953,0.415962,0.749154
4,evonove/django-oauth-toolkit,joestump/python-oauth2,OAuth,OAuth,0.905756,0.786111,0.781988,0.905497,0.415860,0.748130
...,...,...,...,...,...,...,...,...,...,...
346,prabhupant/python-ds,jiaaro/pydub,Algorithms,Audio,-0.626827,0.166515,0.268667,0.000000,1.000000,0.000000
347,keunwoochoi/kapre,jiaaro/pydub,Audio,Audio,0.867651,0.606200,0.568462,0.966739,0.325340,0.000000
348,grantjenks/python-sortedcontainers,jiaaro/pydub,Algorithms,Audio,-0.667993,0.360218,0.240564,0.943266,0.158174,0.000000
349,keon/algorithms,jiaaro/pydub,Algorithms,Audio,-0.701777,0.441632,0.289792,0.892854,0.474299,0.000000
