# Embedding similarity (using the best model for each embedding type)

## 1. Downloading embeddings

In [1]:
# Download the pickled embeddings file from Google Drive by its file id (needs the gdown CLI).
!gdown 1WOqffTwS2AA8KDYOcA78vg_tCtVhx71T

Downloading...
From (original): https://drive.google.com/uc?id=1WOqffTwS2AA8KDYOcA78vg_tCtVhx71T
From (redirected): https://drive.google.com/uc?id=1WOqffTwS2AA8KDYOcA78vg_tCtVhx71T&confirm=t&uuid=d3a15cb2-85ed-4d43-9984-1e53d3005fbf
To: /Users/henry/Documents/PycharmProjects/RepoSim4Py/Embedding/Embedding_model_validation_100/repo_info_validation_embeddings.pkl
100%|████████████████████████████████████████| 465M/465M [00:54<00:00, 8.52MB/s]


In [2]:
import pickle

# Load the pickled per-repository embeddings downloaded above.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
# NOTE(review): the variable says "train" although the file name says "validation";
# the name is kept because the later cells reference it.
with open("repo_info_validation_embeddings.pkl", "rb") as f:
    repo_info_train_embeddings = pickle.load(f)

In [3]:
# Show which fields are stored for the first repository in the mapping.
repo_info_train_embeddings[next(iter(repo_info_train_embeddings))].keys()

dict_keys(['docs', 'codes', 'structure', 'requirements', 'readme', 'topic', 'codes_embeddings', 'docs_embeddings', 'structure_embeddings', 'requirements_embeddings', 'readme_embeddings'])

## 2. Choosing code embeddings

In [4]:
# List the model names under which the first repository's code embeddings are stored.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["codes_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 3. Choosing doc embeddings

In [5]:
# List the model names under which the first repository's doc embeddings are stored.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["docs_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 4. Choosing structure embeddings

In [6]:
# List the model names under which the first repository's structure embeddings are stored.
# Best model: sentence-transformers/bert-base-nli-mean-tokens
next(iter(repo_info_train_embeddings.values()))["structure_embeddings"].keys()

dict_keys(['sentence-transformers/bert-base-nli-mean-tokens'])

## 5. Choosing readme embeddings

In [7]:
# List the model names under which the first repository's readme embeddings are stored.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["readme_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 6. Choosing requirements embeddings

In [8]:
# List the model names under which the first repository's requirements embeddings are stored.
# Best model: Lazyhope/unixcoder-nine-advtest
next(iter(repo_info_train_embeddings.values()))["requirements_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 7. Similarity calculation

In [9]:
import torch
from torch.nn import CosineSimilarity
from itertools import combinations
from tqdm import tqdm

# Model whose embeddings are used for each kind of repository information.
# Order matters: it is the order of the similarity fields in every row of `res`
# (code, doc, requirement, structure, readme).
EMBEDDING_MODELS = {
    "codes_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "docs_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "requirements_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "structure_embeddings": "sentence-transformers/bert-base-nli-mean-tokens",
    "readme_embeddings": "Lazyhope/unixcoder-nine-advtest",
}


def _pool_embedding(embeddings, model_name):
    """Reduce one repository's embeddings of a single kind to one tensor.

    Args:
        embeddings: either a ``torch.Tensor``, which is returned unchanged, or a
            mapping from model name to a tensor of embeddings.
        model_name: key selecting the tensor to pool when ``embeddings`` is a mapping.

    Returns:
        The tensor itself, or the mean over dim 0 of ``embeddings[model_name]``.
    """
    # NOTE(review): a bare tensor presumably marks a repository with nothing to
    # embed for this kind (placeholder vector) - confirm against the producer.
    if isinstance(embeddings, torch.Tensor):
        return embeddings
    # Assumes the stored tensor stacks one vector per item along dim 0 - TODO confirm.
    return torch.mean(embeddings[model_name], dim=0)


cossim = CosineSimilarity(dim=0, eps=1e-8)

# Pool every repository once up front; the pooled vector does not depend on the
# pair being compared, so recomputing it inside the pair loop is wasted work.
pooled_embeddings = {
    repo: {
        kind: _pool_embedding(info[kind], model_name)
        for kind, model_name in EMBEDDING_MODELS.items()
    }
    for repo, info in repo_info_train_embeddings.items()
}

res = []
num_of_repos = len(repo_info_train_embeddings)
# Number of unordered repository pairs, used only for the progress bar total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2
for repo1, repo2 in tqdm(combinations(repo_info_train_embeddings.keys(), 2), total=num_of_rows):
    # One cosine similarity per embedding kind, as a plain Python float.
    similarities = tuple(
        cossim(pooled_embeddings[repo1][kind], pooled_embeddings[repo2][kind]).item()
        for kind in EMBEDDING_MODELS
    )

    topic1 = repo_info_train_embeddings[repo1]["topic"]
    topic2 = repo_info_train_embeddings[repo2]["topic"]

    # Row layout: repo1, repo2, topic1, topic2, then code, doc, requirement,
    # structure and readme similarity.
    res.append((repo1, repo2, topic1, topic2, *similarities))

100%|█████████████████████████████████████| 4950/4950 [00:02<00:00, 2381.90it/s]


In [10]:
import pandas as pd

# Column labels, in the same order as the fields of each tuple appended to `res`.
similarity_columns = [
    "repo1", "repo2", "topic1", "topic2",
    "code_sim", "doc_sim", "requirement_sim", "structure_sim", "readme_sim",
]
df = pd.DataFrame(data=res, columns=similarity_columns)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,pyparsing/pyparsing,JohnLangford/vowpal_wabbit,Parser,Machine Learning,0.367156,0.206592,0.564345,0.709694,0.499978
1,pyparsing/pyparsing,django/channels,Parser,WebSocket,0.107252,-0.043911,0.477494,1.000000,0.295796
2,pyparsing/pyparsing,gak/pycallgraph,Parser,Code Analysis,0.407470,0.255868,0.609124,1.000000,0.473191
3,pyparsing/pyparsing,Microsoft/PTVS,Parser,Visual Studio,0.346247,0.417160,0.605438,0.704664,0.481464
4,pyparsing/pyparsing,fabtools/fabtools,Parser,SSH-style Deployment,0.007028,-0.053981,0.495892,1.000000,0.455127
...,...,...,...,...,...,...,...,...,...
4945,pytoolz/cytoolz,asweigart/pyautogui,Functional Programming,GUI / Web Testing,0.546379,0.000000,0.752657,0.666088,0.360821
4946,pytoolz/cytoolz,pyeve/eve,Functional Programming,Flask,0.422874,0.248503,0.589732,1.000000,0.134485
4947,tartiflette/tartiflette-aiohttp,asweigart/pyautogui,GraphQL,GUI / Web Testing,0.119902,0.000000,0.501720,0.628838,0.197305
4948,tartiflette/tartiflette-aiohttp,pyeve/eve,GraphQL,Flask,0.369080,0.000000,0.696359,0.704664,0.469383


In [11]:
# Persist the pairwise similarity table next to the notebook.
# With pandas' default index=True the row index is written as an unnamed first column.
# NOTE(review): "similairity" in the file name is misspelled; left unchanged because
# other code may read this exact path.
df.to_csv(path_or_buf="Embeddings_similairity_best_model_validation.csv")

In [12]:
# Rank repository pairs from highest to lowest code similarity, renumbering rows from 0.
(
    df
    .sort_values(by="code_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,httplib2/httplib2,shazow/urllib3,HTTP Clients,HTTP Clients,0.917901,0.697721,0.848736,1.000000,0.363316
1,google/pytype,python-rope/rope,Static Type Annotations Generators,Refactoring,0.813393,0.618409,0.625537,0.704664,0.352110
2,Instagram/MonkeyType,google/pytype,Static Type Annotations Generators,Static Type Annotations Generators,0.813358,0.715399,0.707368,0.704664,0.559394
3,aaugustin/websockets,shazow/urllib3,WebSocket,HTTP Clients,0.807290,0.716890,0.673194,0.666088,0.440735
4,lorien/grab,shazow/urllib3,Web Crawling,HTTP Clients,0.792860,0.454989,0.771430,0.628838,0.410731
...,...,...,...,...,...,...,...,...,...
4945,ryanmcgrath/twython,secdev/scapy,Third-party APIs,Hardware,-0.186496,-0.097018,0.404578,0.721410,0.281304
4946,jindaxiang/akshare,pypa/virtualenv,Downloader,Environment Management,-0.188052,-0.027738,0.754029,0.686534,0.376315
4947,sshwsfc/xadmin,tyiannak/pyAudioAnalysis,Admin Panels,Audio,-0.207828,-0.146951,0.580400,0.747487,0.166496
4948,ajenti/ajenti,tyiannak/pyAudioAnalysis,Admin Panels,Audio,-0.224499,-0.165711,0.673999,0.809244,0.255304


In [13]:
# Rank repository pairs from highest to lowest doc similarity, renumbering rows from 0.
(
    df
    .sort_values(by="doc_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,JaidedAI/EasyOCR,facebookresearch/pytext,Computer Vision,Natural Language Processing,0.770208,0.796132,0.827822,0.628838,0.579416
1,google/pytype,davidhalter/jedi,Static Type Annotations Generators,Vim,0.780371,0.772559,0.829475,0.704664,0.515473
2,PetrochukM/PyTorch-NLP,facebookresearch/pytext,Natural Language Processing,Natural Language Processing,0.787152,0.758234,0.800141,1.000000,0.594601
3,tartiflette/tartiflette-asgi,encode/orm,GraphQL,Relational Databases,0.200622,0.736177,0.590901,0.809244,0.389162
4,davidhalter/jedi,python-rope/rope,Vim,Refactoring,0.739363,0.727670,0.689547,1.000000,0.430219
...,...,...,...,...,...,...,...,...,...
4945,scanny/python-pptx,gaojiuli/toapi,Office,Web Content Extracting,0.203151,-0.172683,0.650113,0.704664,0.290912
4946,fabtools/fabtools,scanny/python-pptx,SSH-style Deployment,Office,0.138114,-0.179330,0.651110,1.000000,0.327415
4947,indico/indico,tyiannak/pyAudioAnalysis,CMS,Audio,-0.176711,-0.181690,0.566028,0.747487,0.217208
4948,ionelmc/python-manhole,scanny/python-pptx,Tracing,Office,0.075986,-0.189448,0.755285,1.000000,0.360220


In [14]:
# Rank repository pairs from highest to lowest requirements similarity, renumbering rows from 0.
(
    df
    .sort_values(by="requirement_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,PyMySQL/PyMySQL,andialbrecht/sqlparse,MySQL,Parser,0.664137,0.219536,1.000000,0.709694,0.489664
1,crossbario/autobahn-python,scrapinghub/portia,WebSocket,Web Crawling,0.194806,-0.014134,0.990423,0.676703,0.522421
2,PetrochukM/PyTorch-NLP,JaidedAI/EasyOCR,Natural Language Processing,Computer Vision,0.590429,0.600512,0.975624,0.628838,0.549766
3,gak/pycallgraph,davidhalter/jedi,Code Analysis,Vim,0.423575,0.256859,0.973204,1.000000,0.579664
4,davidhalter/jedi,vispy/vispy,Vim,Data Visualization,0.173398,0.235214,0.940004,1.000000,0.527787
...,...,...,...,...,...,...,...,...,...
4945,lemire/simdjson,encode/orm,Serialization,Relational Databases,-0.048673,0.000000,0.000000,0.747487,0.424130
4946,lemire/simdjson,grantjenks/python-sortedcontainers,Serialization,Algorithms,0.162602,0.000000,0.000000,0.666088,0.147967
4947,lemire/simdjson,Delgan/loguru,Serialization,Logging,0.288933,0.000000,0.000000,0.666088,0.243325
4948,lemire/simdjson,ajenti/ajenti,Serialization,Admin Panels,-0.149748,0.000000,0.000000,0.628838,0.349506


In [15]:
# Rank repository pairs from highest to lowest structure similarity, renumbering rows from 0.
(
    df
    .sort_values(by="structure_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,mitsuhiko/pluginbase,andialbrecht/sqlparse,Miscellaneous,Parser,0.381767,0.065592,0.754319,1.000000,0.206877
1,fxsjy/jieba,jiaaro/pydub,Natural Language Processing,Audio,0.354195,0.152968,0.596742,1.000000,0.000000
2,klen/pylama,coala/coala,Code Linters,Code Analysis,0.680719,0.551293,0.780182,1.000000,0.534767
3,klen/pylama,aaugustin/websockets,Code Linters,WebSocket,0.244420,0.094199,0.637185,1.000000,0.385789
4,jpadilla/pyjwt,antocuni/pdb,JWT,pdb-like Debugger,0.130721,-0.059294,0.594705,1.000000,0.207692
...,...,...,...,...,...,...,...,...,...
4945,mkdocs/mkdocs,python/black,Static Site Generator,Code Formatters,0.501618,0.285447,0.000000,0.459789,0.315566
4946,scottrogowski/code2flow,ovalhub/pyicu,Code Analysis,Internationalization,0.425702,0.156671,0.671238,0.449773,0.480363
4947,mkdocs/mkdocs,ovalhub/pyicu,Static Site Generator,Internationalization,0.497950,0.089087,0.611254,0.449773,0.410556
4948,mkdocs/mkdocs,scrapinghub/portia,Static Site Generator,Web Crawling,0.585667,0.358667,0.539573,0.388221,0.447204


In [16]:
# Rank repository pairs from highest to lowest readme similarity, renumbering rows from 0.
(
    df
    .sort_values(by="readme_sim", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,ionelmc/python-hunter,ionelmc/python-manhole,Tracing,Tracing,0.507361,0.156370,0.926956,1.000000,0.971062
1,davedoesdev/python-jwt,jpadilla/pyjwt,JWT,JWT,0.603151,0.431077,0.476013,0.810735,0.835624
2,aaugustin/websockets,crossbario/autobahn-python,WebSocket,WebSocket,0.624987,0.652438,0.565946,1.000000,0.810667
3,JohnLangford/vowpal_wabbit,ray-project/ray,Machine Learning,Batch Processing,0.499065,0.363417,0.000000,0.709694,0.704818
4,JohnLangford/vowpal_wabbit,JaidedAI/EasyOCR,Machine Learning,Computer Vision,0.550395,0.476553,0.818603,0.721410,0.679356
...,...,...,...,...,...,...,...,...,...
4945,jiaaro/pydub,dabeaz/ply,Audio,Parser,0.093612,0.006090,0.261832,0.809244,0.000000
4946,jindaxiang/akshare,grantjenks/python-sortedcontainers,Downloader,Algorithms,0.013335,-0.000278,0.745487,0.666088,-0.001167
4947,chriskiehl/Gooey,elastic/elasticsearch-dsl-py,GUI Development,Search,0.182570,0.080065,0.607743,0.666088,-0.010738
4948,fabtools/fabtools,grantjenks/python-sortedcontainers,SSH-style Deployment,Algorithms,0.136689,-0.060213,0.769852,1.000000,-0.018479
