# Embedding similarity (using the best model for each embedding type)

## 1. Downloading embeddings

In [1]:
# Downloading repo_info_train_embeddings.pkl
# The argument passed to gdown is a Google Drive file id (the recorded output
# shows a drive.google.com URL); the pickle is presumably written to the
# current working directory, where the next cell opens it by relative path.
# NOTE(review): the recorded output below reports a different file id than the
# one passed here -- confirm which id is current before re-running.
!gdown 1gYpZW-rWdyOskNgbwDrmOg_Ij4nMJ6_V

Downloading...
From (uriginal): https://drive.google.com/uc?id=1qgd2F5CxXJ27u0F-Tix96uWlBSNK7GcJ
From (redirected): https://drive.google.com/uc?id=1qgd2F5CxXJ27u0F-Tix96uWlBSNK7GcJ&confirm=t&uuid=ad6d8cc3-fa53-44f2-b367-dc2d11ba7bb4
To: /Users/Henry/Documents/PyCharmProjects/RepoSim4Py/Embedding/Embedding_model_train_315/repo_info_train_embeddings.pkl
100%|████████████████████████████████████████| 251M/251M [00:09<00:00, 27.4MB/s]


In [1]:
import pickle

# Load the downloaded embeddings into memory.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only run this on a file obtained from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("repo_info_train_embeddings.pkl", "rb") as f:
    repo_info_train_embeddings = pickle.load(f)

In [2]:
# Check: show which fields are stored for the first repository in the mapping.
first_repo = next(iter(repo_info_train_embeddings))
repo_info_train_embeddings[first_repo].keys()

dict_keys(['docs', 'codes', 'structure', 'requirements', 'readme', 'topic', 'codes_embeddings', 'docs_embeddings', 'structure_embeddings', 'requirements_embeddings', 'readme_embeddings'])

## 2. Choosing code embeddings

In [3]:
# Models with code embeddings available, read from the first repository's entry.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["codes_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'Enoch/cocosoda-graphcodebert', 'Enoch/graphcodebert-py'])

## 3. Choosing doc embeddings

In [4]:
# Models with doc embeddings available, read from the first repository's entry.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["docs_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens', 'Enoch/cocosoda-graphcodebert', 'Enoch/graphcodebert-py'])

## 4. Choosing structure embeddings

In [5]:
# Models with structure embeddings available, read from the first repository's entry.
# Best model: sentence-transformers/bert-base-nli-mean-tokens
next(iter(repo_info_train_embeddings.values()))["structure_embeddings"].keys()

dict_keys(['sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens'])

## 5. Choosing readme embeddings

In [6]:
# Models with readme embeddings available, read from the first repository's entry.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["readme_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens', 'distilbert-base-uncased'])

## 6. Choosing requirements embeddings

In [7]:
# Models with requirements embeddings available, read from the first repository's entry.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["requirements_embeddings"].keys()

dict_keys(['microsoft/unixcoder-base-nine', 'microsoft/unixcoder-base', 'microsoft/unixcoder-base-unimodal', 'Lazyhope/unixcoder-nine-advtest', 'Lazyhope/unixcoder-clone-detection', 'Enoch/Unixcoder-Tuned-Code-Search-Py', 'sentence-transformers/all-mpnet-base-v2', 'sentence-transformers/multi-qa-mpnet-base-cos-v1', 'sentence-transformers/bert-base-nli-mean-tokens', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'sentence-transformers/all-distilroberta-v1', 'sentence-transformers/distilbert-base-nli-mean-tokens', 'distilbert-base-uncased'])

## 7. Similarity calculation

In [8]:
import torch
from torch.nn import CosineSimilarity
from itertools import combinations
from tqdm import tqdm

# Model selected for each embedding type (the "Best model" noted in the
# sections above). The insertion order of this dict fixes the order of the
# similarity values in every result row: code, doc, requirement, structure,
# readme.
BEST_MODELS = {
    "codes_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "docs_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "requirements_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "structure_embeddings": "sentence-transformers/bert-base-nli-mean-tokens",
    "readme_embeddings": "Lazyhope/unixcoder-nine-advtest",
}


def _repo_vector(repo_info, embedding_key, model_name):
    """Return the single vector used to compare a repository on one embedding type.

    Args:
        repo_info: The per-repository dict from ``repo_info_train_embeddings``.
        embedding_key: One of the ``*_embeddings`` keys of ``repo_info``.
        model_name: Model whose embeddings are used when the stored value is
            a mapping keyed by model name.

    Returns:
        The stored value itself if it is already a ``torch.Tensor``; otherwise
        the mean over dim 0 of the tensor stored under ``model_name``.

    Raises:
        KeyError: If ``embedding_key`` or ``model_name`` is missing.
    """
    embeddings = repo_info[embedding_key]
    # NOTE(review): a bare tensor presumably marks a repository with no
    # per-model embeddings for this type -- confirm against the producer.
    if isinstance(embeddings, torch.Tensor):
        return embeddings
    return torch.mean(embeddings[model_name], dim=0)


cossim = CosineSimilarity(dim=0, eps=1e-8)

# Reduce every repository to one vector per embedding type up front. The
# reduction depends only on the repository, so doing it here avoids repeating
# the same torch.mean for each of the O(n^2) pairs below.
repo_vectors = {
    repo: [_repo_vector(info, key, model) for key, model in BEST_MODELS.items()]
    for repo, info in repo_info_train_embeddings.items()
}

res = []
num_of_repos = len(repo_info_train_embeddings)
num_of_rows = num_of_repos * (num_of_repos - 1) // 2  # number of unordered pairs
for repo1, repo2 in tqdm(combinations(repo_info_train_embeddings.keys(), 2), total=num_of_rows):
    # One cosine similarity per embedding type, in BEST_MODELS order.
    similarities = [
        cossim(vector1, vector2).item()
        for vector1, vector2 in zip(repo_vectors[repo1], repo_vectors[repo2])
    ]

    topic1 = repo_info_train_embeddings[repo1]["topic"]
    topic2 = repo_info_train_embeddings[repo2]["topic"]

    # Row layout: (repo1, repo2, topic1, topic2, code, doc, requirement,
    # structure, readme).
    res.append((repo1, repo2, topic1, topic2, *similarities))

100%|███████████████████████████████████| 49455/49455 [00:31<00:00, 1582.12it/s]


In [9]:
import pandas as pd

# Tabulate the pairwise results: one row per tuple in `res`, one column per
# tuple field.
column_names = [
    "repo1", "repo2", "topic1", "topic2",
    "code_sim", "doc_sim", "requirement_sim", "structure_sim", "readme_sim",
]
df = pd.DataFrame(data=res, columns=column_names)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,jet-admin/jet-bridge,patrys/httmock,Admin Panels,Mock,0.286407,0.000000,0.510243,0.805547,0.321227
1,jet-admin/jet-bridge,pytransitions/transitions,Admin Panels,Design Patterns,0.112494,0.022725,0.656439,0.805547,0.290076
2,jet-admin/jet-bridge,keleshev/schema,Admin Panels,Data Validation,0.276554,0.211781,0.533524,0.805547,0.164357
3,jet-admin/jet-bridge,dylanaraps/pywal,Admin Panels,Image Processing,0.057678,0.159377,0.385125,0.805547,0.387439
4,jet-admin/jet-bridge,PyCQA/modernize,Admin Panels,Compatibility,-0.052191,-0.012766,0.512542,0.805547,0.270546
...,...,...,...,...,...,...,...,...,...
49450,thauber/django-schedule,Miserlou/Zappa,Job Scheduler,Serverless Frameworks,0.255809,0.120266,0.731916,1.000000,0.000000
49451,thauber/django-schedule,jek/blinker,Job Scheduler,Miscellaneous,0.172473,0.129667,0.551813,0.733963,0.000000
49452,Yelp/mrjob,Miserlou/Zappa,Batch Processing,Serverless Frameworks,0.503636,0.280724,0.816891,0.851043,0.274333
49453,Yelp/mrjob,jek/blinker,Batch Processing,Miscellaneous,0.253781,-0.018066,0.584902,0.738889,-0.063499


In [10]:
# Write the similarity table to a CSV file (pandas defaults, so the row index
# is written as the first column).
similarity_csv_path = "Embeddings_similairity_best_model_train.csv"
df.to_csv(similarity_csv_path)

In [11]:
# Repository pairs ranked from highest to lowest code similarity, renumbered from 0.
df.sort_values(by="code_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,facebook/pyre-check,srusskih/SublimeJEDI,Static Type Checkers,Sublime Text,0.957535,0.640003,0.925268,0.795816,0.704988
1,chapmanb/bcbb,chapmanb/bcbio-nextgen,Science,Science,0.946794,0.945710,0.875043,0.857463,0.689437
2,dimka665/awesome-slugify,un33k/python-slugify,Slugify,Slugify,0.933911,0.271404,0.850342,1.000000,0.809692
3,html5lib/html5lib-python,mozilla/bleach,HTML Manipulation,HTML Manipulation,0.928365,0.727614,0.919731,0.851043,0.513649
4,Suor/funcy,pytoolz/toolz,Functional Programming,Functional Programming,0.912396,0.592634,0.566169,1.000000,0.532951
...,...,...,...,...,...,...,...,...,...
49450,sergree/matchering,istrategylabs/django-wordpress,Audio,Third-party APIs,-0.222754,0.000000,0.224051,1.000000,0.127519
49451,facebook/pyre-check,chapmanb/bcbio-nextgen,Static Type Checkers,Science,-0.237231,0.237499,0.803685,0.871185,0.376390
49452,jet-admin/jet-bridge,AtsushiSakai/PythonRobotics,Admin Panels,Robotics,-0.239567,-0.009897,0.567053,0.628502,0.432332
49453,glamp/bashplotlib,knipknap/SpiffWorkflow,Terminal Rendering,Job Scheduler,-0.259406,-0.115281,0.706463,0.851043,0.253982


In [12]:
# Repository pairs ranked from highest to lowest doc similarity, renumbered from 0.
df.sort_values(by="doc_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,boppreh/mouse,boppreh/keyboard,Hardware,Hardware,0.789829,0.958058,0.924119,1.000000,0.667301
1,chapmanb/bcbb,chapmanb/bcbio-nextgen,Science,Science,0.946794,0.945710,0.875043,0.857463,0.689437
2,gabrielfalcao/HTTPretty,mindflayer/python-mocket,Mock,Mock,0.828275,0.940462,0.849415,1.000000,0.442127
3,dmlc/mxnet,Theano/Theano,Deep Learning,Deep Learning,0.867719,0.892617,0.793662,0.758201,0.502167
4,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.896777,0.862710,0.638201,0.857463,0.545784
...,...,...,...,...,...,...,...,...,...
49450,benhamner/Metrics,geex-arts/django-jet,Machine Learning,Admin Panels,-0.000408,-0.232852,0.360352,0.682007,0.298061
49451,PyCQA/prospector,burnash/gspread,Code Analysis,Third-party APIs,-0.019134,-0.234851,0.618530,0.709694,-0.085384
49452,librosa/librosa,MasoniteFramework/masonite,Audio,Synchronous,0.075014,-0.236557,0.786372,0.851043,0.327490
49453,boto/boto3,librosa/librosa,Third-party APIs,Audio,0.193592,-0.243958,0.615467,1.000000,0.394958


In [13]:
# Repository pairs ranked from highest to lowest requirements similarity, renumbered from 0.
df.sort_values(by="requirement_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,dbader/schedule,dirn/When.py,Job Scheduler,Date and Time,0.459264,0.208572,1.000000,0.851043,0.451838
1,skorokithakis/shortuuid,istrategylabs/django-wordpress,Unique identifiers,Third-party APIs,-0.023184,-0.019064,1.000000,1.000000,0.181034
2,python-excel/xlrd,faif/python-patterns,Office,Design Patterns,0.234415,0.062191,1.000000,0.851043,0.026815
3,devsnd/tinytag,Alir3z4/html2text,Metadata,Web Content Extracting,0.494363,0.000000,1.000000,0.851043,0.422292
4,devsnd/tinytag,faif/python-patterns,Metadata,Design Patterns,0.203556,0.000000,1.000000,1.000000,0.166934
...,...,...,...,...,...,...,...,...,...
49450,rochacbruno/quokka,Microsoft/Pyjion,CMS,Implementations,-0.065284,-0.010580,0.000000,0.791984,0.447984
49451,martinrusev/imbox,prabhupant/python-ds,Mail Clients,Algorithms,0.093208,0.106717,0.000000,0.720690,0.277425
49452,benfred/py-spy,carlospalol/money,Profiler,E-commerce,0.135712,0.000000,-0.009899,1.000000,0.236749
49453,martinrusev/imbox,zopefoundation/ZODB,Mail Clients,Database,0.247922,-0.102474,-0.025430,0.851043,0.202491


In [14]:
# Repository pairs ranked from highest to lowest structure similarity, renumbered from 0.
df.sort_values(by="structure_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,jonashaag/bjoern,evhub/coconut,WSGI Servers,Functional Programming,0.043007,0.211615,0.470906,1.000000,0.368124
1,getsentry/sentry-python,Cornices/cornice,Logging,Pyramid,0.665457,0.389864,0.530859,1.000000,0.356368
2,getsentry/sentry-python,boppreh/mouse,Logging,Hardware,0.292311,0.127953,0.551456,1.000000,0.286297
3,getsentry/sentry-python,justquick/django-activity-stream,Logging,News Feed,0.396946,0.103432,0.606096,1.000000,0.317177
4,getsentry/sentry-python,benoitc/gunicorn,Logging,WSGI Servers,0.632394,0.591248,0.893616,1.000000,0.351098
...,...,...,...,...,...,...,...,...,...
49450,python-attrs/attrs,msiemens/tinydb,Built-in Classes Enhancement,Database,0.543187,0.418506,0.456601,0.445523,0.228108
49451,noxrepo/pox,msiemens/tinydb,Network Virtualization,Database,0.169206,0.081184,0.511462,0.445523,0.126602
49452,jet-admin/jet-bridge,python-attrs/attrs,Admin Panels,Built-in Classes Enhancement,0.162141,0.172673,0.553017,0.441703,0.201436
49453,jet-admin/jet-bridge,noxrepo/pox,Admin Panels,Network Virtualization,0.141546,0.116691,0.746225,0.441703,0.235536


In [15]:
# Repository pairs ranked from highest to lowest readme similarity, renumbered from 0.
df.sort_values(by="readme_sim", ascending=False, ignore_index=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,mozilla/unicode-slugify,un33k/python-slugify,Slugify,Slugify,0.759318,0.653769,0.823107,1.000000,0.820198
1,getnikola/nikola,modoboa/modoboa,Static Site Generator,Mail Servers,0.361154,0.399754,0.792979,0.871185,0.811529
2,dimka665/awesome-slugify,un33k/python-slugify,Slugify,Slugify,0.933911,0.271404,0.850342,1.000000,0.809692
3,dimka665/awesome-slugify,mozilla/unicode-slugify,Slugify,Slugify,0.754324,0.228155,0.837706,1.000000,0.801684
4,s3tools/s3cmd,bloomreach/s4cmd,Downloader,Downloader,0.117505,0.204611,0.648519,0.851043,0.800931
...,...,...,...,...,...,...,...,...,...
49450,PyCQA/prospector,TheAlgorithms/Python,Code Analysis,Algorithms,0.047594,0.073590,0.689168,0.816045,-0.077003
49451,PyCQA/prospector,mymarilyn/clickhouse-driver,Code Analysis,Database Drivers,0.183647,-0.121352,0.764123,0.755685,-0.077628
49452,PyCQA/prospector,burnash/gspread,Code Analysis,Third-party APIs,-0.019134,-0.234851,0.618530,0.709694,-0.085384
49453,PyCQA/prospector,jek/blinker,Code Analysis,Miscellaneous,0.185908,-0.037453,0.589212,0.709694,-0.092737
