# Embedding similarity (using the best model for each embedding type)

## 1. Downloading embeddings

In [1]:
# TODO: this command is incomplete - `gdown` is invoked without a file id/URL,
# so re-running the cell as written cannot reproduce the download.
# The recorded output of this cell shows it fetched Google Drive id
# 1qgd2F5CxXJ27u0F-Tix96uWlBSNK7GcJ and saved it as
# repo_info_train_embeddings.pkl; confirm and add the id to the command.
!gdown

Downloading...
From (uriginal): https://drive.google.com/uc?id=1qgd2F5CxXJ27u0F-Tix96uWlBSNK7GcJ
From (redirected): https://drive.google.com/uc?id=1qgd2F5CxXJ27u0F-Tix96uWlBSNK7GcJ&confirm=t&uuid=ad6d8cc3-fa53-44f2-b367-dc2d11ba7bb4
To: /Users/Henry/Documents/PyCharmProjects/RepoSim4Py/Embedding/Embedding_model_train_315/repo_info_train_embeddings.pkl
100%|████████████████████████████████████████| 251M/251M [00:09<00:00, 27.4MB/s]


In [2]:
import pickle

# Load the pre-computed embeddings into `repo_info_train_embeddings`.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only run this on a file obtained from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("repo_info_train_embeddings.pkl", "rb") as f:
    repo_info_train_embeddings = pickle.load(f)

In [3]:
# Show which fields are stored for a repository (inspected on the first entry).
repo_info_train_embeddings[next(iter(repo_info_train_embeddings))].keys()

dict_keys(['docs', 'codes', 'structure', 'requirements', 'readme', 'topic', 'codes_embeddings', 'docs_embeddings', 'structure_embeddings', 'requirements_embeddings', 'readme_embeddings'])

## 2. Choosing code embeddings

In [4]:
# List the models that have code embeddings (inspected on the first repository).
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["codes_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py'])

## 3. Choosing doc embeddings

In [5]:
# List the models that have doc embeddings (inspected on the first repository).
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["docs_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens'])

## 4. Choosing structure embeddings

In [6]:
# List the models that have structure embeddings (inspected on the first repository).
# Best model: sentence-transformers/bert-base-nli-mean-tokens
next(iter(repo_info_train_embeddings.values()))["structure_embeddings"].keys()

dict_keys(['sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens'])

## 5. Choosing readme embeddings

In [7]:
# List the models that have readme embeddings (inspected on the first repository).
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["readme_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens'])

## 6. Choosing requirements embeddings

In [8]:
# List the models that have requirements embeddings (inspected on the first repository).
# Best model: microsoft/unixcoder-base-unimodal
next(iter(repo_info_train_embeddings.values()))["requirements_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens'])

## 7. Similarity calculation

In [24]:
import torch
from torch.nn import CosineSimilarity
from itertools import combinations
from tqdm import tqdm

# Model used for each embedding type (the "Best model" picked in sections 2-6).
# The insertion order fixes the order of the similarity values in each row of
# `res`: code, doc, requirement, structure, readme.
BEST_MODELS = {
    "codes_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "docs_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "requirements_embeddings": "microsoft/unixcoder-base-unimodal",
    "structure_embeddings": "sentence-transformers/bert-base-nli-mean-tokens",
    "readme_embeddings": "Lazyhope/unixcoder-nine-advtest",
}


def _select_embedding(stored, model_name):
    """Return the tensor to compare for one embedding field of one repo.

    Args:
        stored: either a bare ``torch.Tensor`` (returned unchanged) or a
            mapping from model name to tensor.
            NOTE(review): the bare-tensor case is presumably a placeholder for
            repos lacking that artifact - confirm against the embedding script.
        model_name: key to look up when ``stored`` is a mapping.

    Raises:
        KeyError: if ``stored`` is a mapping without ``model_name``.
    """
    if isinstance(stored, torch.Tensor):
        return stored
    return stored[model_name]


# dim=0: each embedding is compared as a single 1-D vector.
cossim = CosineSimilarity(dim=0, eps=1e-8)

# Resolve the chosen embedding of every repo once, instead of repeating the
# same lookups for each of the O(n^2) pairs below.
best_embeddings = {
    repo: {
        field: _select_embedding(info[field], model_name)
        for field, model_name in BEST_MODELS.items()
    }
    for repo, info in repo_info_train_embeddings.items()
}

res = []
num_of_repos = len(repo_info_train_embeddings)
# Number of unordered repo pairs, n * (n - 1) / 2; gives tqdm its total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2
for repo1, repo2 in tqdm(combinations(repo_info_train_embeddings.keys(), 2), total=num_of_rows):
    # Tensor.item() returns the scalar as a Python float directly.
    similarities = tuple(
        cossim(best_embeddings[repo1][field], best_embeddings[repo2][field]).item()
        for field in BEST_MODELS
    )

    topic1 = repo_info_train_embeddings[repo1]["topic"]
    topic2 = repo_info_train_embeddings[repo2]["topic"]

    # Row layout: repo1, repo2, topic1, topic2, then the five similarities.
    res.append((repo1, repo2, topic1, topic2, *similarities))

100%|██████████████████████████████████| 49455/49455 [00:02<00:00, 20391.99it/s]


In [25]:
import pandas as pd

# Turn the pairwise rows collected in `res` into a labelled table;
# the column order mirrors the tuple layout appended to `res`.
df = pd.DataFrame(
    data=res,
    columns=[
        "repo1",
        "repo2",
        "topic1",
        "topic2",
        "code_sim",
        "doc_sim",
        "requirement_sim",
        "structure_sim",
        "readme_sim",
    ],
)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,jet-admin/jet-bridge,patrys/httmock,Admin Panels,Mock,0.286407,0.000000,0.984576,0.805547,0.321227
1,jet-admin/jet-bridge,pytransitions/transitions,Admin Panels,Design Patterns,0.112494,0.022725,0.994461,0.805547,0.290076
2,jet-admin/jet-bridge,keleshev/schema,Admin Panels,Data Validation,0.276554,0.211781,0.987203,0.805547,0.164357
3,jet-admin/jet-bridge,dylanaraps/pywal,Admin Panels,Image Processing,0.057678,0.159377,0.976722,0.805547,0.387439
4,jet-admin/jet-bridge,PyCQA/modernize,Admin Panels,Compatibility,-0.052191,-0.012766,0.976740,0.805547,0.270546
...,...,...,...,...,...,...,...,...,...
49450,thauber/django-schedule,Miserlou/Zappa,Job Scheduler,Serverless Frameworks,0.255809,0.120266,0.993783,1.000000,0.000000
49451,thauber/django-schedule,jek/blinker,Job Scheduler,Miscellaneous,0.172473,0.129667,0.974804,0.733963,0.000000
49452,Yelp/mrjob,Miserlou/Zappa,Batch Processing,Serverless Frameworks,0.503636,0.280724,0.995842,0.851043,0.274333
49453,Yelp/mrjob,jek/blinker,Batch Processing,Miscellaneous,0.253782,-0.018066,0.981475,0.738889,-0.063499


In [26]:
# Persist the similarity table; the row index is written as the first column
# (pandas default, spelled out here).
df.to_csv(path_or_buf="Embeddings_similairity_best_model.csv", index=True)

In [27]:
# Repo pairs ranked from most to least similar code embeddings.
(
    df
    .sort_values(by="code_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,facebook/pyre-check,srusskih/SublimeJEDI,Static Type Checkers,Sublime Text,0.957531,0.640003,0.998648,0.795817,0.704988
1,chapmanb/bcbb,chapmanb/bcbio-nextgen,Science,Science,0.946794,0.945710,0.996018,0.857463,0.689437
2,dimka665/awesome-slugify,un33k/python-slugify,Slugify,Slugify,0.933911,0.271404,0.974190,1.000000,0.809692
3,html5lib/html5lib-python,mozilla/bleach,HTML Manipulation,HTML Manipulation,0.928365,0.727614,0.998628,0.851043,0.513648
4,Suor/funcy,pytoolz/toolz,Functional Programming,Functional Programming,0.912396,0.592634,0.985169,1.000000,0.532951
...,...,...,...,...,...,...,...,...,...
49450,sergree/matchering,istrategylabs/django-wordpress,Audio,Third-party APIs,-0.222754,0.000000,0.953290,1.000000,0.127519
49451,facebook/pyre-check,chapmanb/bcbio-nextgen,Static Type Checkers,Science,-0.237362,0.237499,0.995563,0.871185,0.376390
49452,jet-admin/jet-bridge,AtsushiSakai/PythonRobotics,Admin Panels,Robotics,-0.241641,-0.009897,0.992677,0.628502,0.432332
49453,glamp/bashplotlib,knipknap/SpiffWorkflow,Terminal Rendering,Job Scheduler,-0.259406,-0.115281,0.981899,0.851043,0.253982


In [28]:
# Repo pairs ranked from most to least similar doc embeddings.
(
    df
    .sort_values(by="doc_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,boppreh/mouse,boppreh/keyboard,Hardware,Hardware,0.791582,0.958058,0.998265,1.000000,0.667301
1,chapmanb/bcbb,chapmanb/bcbio-nextgen,Science,Science,0.946794,0.945710,0.996018,0.857463,0.689437
2,gabrielfalcao/HTTPretty,mindflayer/python-mocket,Mock,Mock,0.828275,0.940462,0.994737,1.000000,0.442126
3,dmlc/mxnet,Theano/Theano,Deep Learning,Deep Learning,0.867719,0.892617,0.985851,0.758201,0.502166
4,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.896777,0.862710,0.987882,0.857463,0.545784
...,...,...,...,...,...,...,...,...,...
49450,benhamner/Metrics,geex-arts/django-jet,Machine Learning,Admin Panels,-0.000408,-0.232852,0.985020,0.682007,0.298061
49451,PyCQA/prospector,burnash/gspread,Code Analysis,Third-party APIs,-0.019134,-0.234851,0.985880,0.709694,-0.085384
49452,librosa/librosa,MasoniteFramework/masonite,Audio,Synchronous,0.075014,-0.236557,0.997178,0.851043,0.327490
49453,boto/boto3,librosa/librosa,Third-party APIs,Audio,0.193592,-0.243958,0.994741,1.000000,0.394958


In [30]:
# Repo pairs ranked from most to least similar requirements embeddings.
(
    df
    .sort_values(by="requirement_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,dylanaraps/pywal,glamp/bashplotlib,Image Processing,Terminal Rendering,0.450348,0.546986,1.0,1.000000,0.348482
1,shinux/PyTime,martinblech/xmltodict,Date and Time,HTML Manipulation,0.146950,0.000000,1.0,1.000000,0.242712
2,martinblech/xmltodict,Alir3z4/python-currencies,HTML Manipulation,E-commerce,0.194751,0.000000,1.0,1.000000,0.118030
3,glamp/bashplotlib,daviddrysdale/python-phonenumbers,Terminal Rendering,Parser,0.145619,0.252130,1.0,0.685853,0.151184
4,martinblech/xmltodict,daviddrysdale/python-phonenumbers,HTML Manipulation,Parser,0.344254,0.000000,1.0,0.685853,0.255288
...,...,...,...,...,...,...,...,...,...
49450,zopefoundation/ZODB,prabhupant/python-ds,Database,Algorithms,0.179174,0.137548,0.0,0.667501,0.445973
49451,dmlc/xgboost,Microsoft/Pyjion,Machine Learning,Implementations,0.135288,0.195839,0.0,0.791587,0.481029
49452,dmlc/xgboost,dahlia/awesome-sqlalchemy,Machine Learning,Relational Databases,0.000000,0.000000,0.0,0.870143,0.235139
49453,sdispater/orator,amoffat/sh,Relational Databases,Processes,0.134950,-0.033145,0.0,1.000000,0.411599


In [31]:
# Repo pairs ranked from most to least similar structure embeddings.
(
    df
    .sort_values(by="structure_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,awslabs/aws-data-wrangler,mingrammer/diagrams,Data Analysis,Data Visualization,0.138789,-0.049664,0.936455,1.000000,0.264236
1,benhamner/Metrics,awslabs/aws-data-wrangler,Machine Learning,Data Analysis,0.202575,-0.082546,0.951059,1.000000,0.354353
2,facebook/pyre-check,devpi/devpi,Static Type Checkers,Package Repositories,0.183871,0.368912,0.992691,1.000000,0.473640
3,facebook/pyre-check,modoboa/modoboa,Static Type Checkers,Mail Servers,0.225852,0.156829,0.997404,1.000000,0.548179
4,modoboa/modoboa,mitmproxy/pdoc,Mail Servers,Documentation,0.140156,0.111532,0.953818,1.000000,0.526407
...,...,...,...,...,...,...,...,...,...
49450,python-attrs/attrs,msiemens/tinydb,Built-in Classes Enhancement,Database,0.543279,0.418506,0.970380,0.445523,0.228107
49451,noxrepo/pox,msiemens/tinydb,Network Virtualization,Database,0.169206,0.081184,0.960690,0.445523,0.126602
49452,jet-admin/jet-bridge,python-attrs/attrs,Admin Panels,Built-in Classes Enhancement,0.162316,0.172673,0.986974,0.441703,0.201435
49453,jet-admin/jet-bridge,noxrepo/pox,Admin Panels,Network Virtualization,0.141546,0.116691,0.982989,0.441703,0.235536


In [32]:
# Repo pairs ranked from most to least similar readme embeddings.
(
    df
    .sort_values(by="readme_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,mozilla/unicode-slugify,un33k/python-slugify,Slugify,Slugify,0.759318,0.653769,0.980232,1.000000,0.820198
1,getnikola/nikola,modoboa/modoboa,Static Site Generator,Mail Servers,0.361417,0.399754,0.990360,0.871185,0.811529
2,dimka665/awesome-slugify,un33k/python-slugify,Slugify,Slugify,0.933911,0.271404,0.974190,1.000000,0.809692
3,dimka665/awesome-slugify,mozilla/unicode-slugify,Slugify,Slugify,0.754324,0.228155,0.985669,1.000000,0.801683
4,s3tools/s3cmd,bloomreach/s4cmd,Downloader,Downloader,0.117505,0.204611,0.975632,0.851043,0.800931
...,...,...,...,...,...,...,...,...,...
49450,PyCQA/prospector,TheAlgorithms/Python,Code Analysis,Algorithms,0.047594,0.073590,0.995109,0.816045,-0.077004
49451,PyCQA/prospector,mymarilyn/clickhouse-driver,Code Analysis,Database Drivers,0.183647,-0.121352,0.994007,0.755685,-0.077629
49452,PyCQA/prospector,burnash/gspread,Code Analysis,Third-party APIs,-0.019134,-0.234851,0.985880,0.709694,-0.085384
49453,PyCQA/prospector,jek/blinker,Code Analysis,Miscellaneous,0.185908,-0.037453,0.983519,0.709694,-0.092737
