# Embedding similarity (with best model of each embedding)

## 1. Downloading embeddings

In [1]:
# Download the pickled embeddings from Google Drive by file ID
# (gdown reports saving the file as repo_info_test_embeddings.pkl).
!gdown 1x4g24q2qKW2oqKemVmerxZkZ9ZCzvlZK

Downloading...
From (uriginal): https://drive.google.com/uc?id=1x4g24q2qKW2oqKemVmerxZkZ9ZCzvlZK
From (redirected): https://drive.google.com/uc?id=1x4g24q2qKW2oqKemVmerxZkZ9ZCzvlZK&confirm=t&uuid=0cadebf1-eb4b-4337-81c4-96fb3fbc9ab5
To: /Users/henry/Documents/PycharmProjects/RepoSim4Py/Embedding/Embedding_model_test_41/repo_info_test_embeddings.pkl
100%|████████████████████████████████████████| 374M/374M [00:25<00:00, 14.5MB/s]


In [2]:
import pickle

# Load the downloaded embeddings: a dict keyed by repository whose values hold
# the repository's raw info plus its per-aspect embeddings.
# NOTE(review): despite the variable name, the file loaded here is the *test*
# split (repo_info_test_embeddings.pkl); the name is kept because the cells
# below refer to it.
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only
# load this file if the download source is trusted.
with open("repo_info_test_embeddings.pkl", "rb") as f:
    repo_info_train_embeddings = pickle.load(f)
# The `with` block closes the file on exit, so no explicit f.close() is needed.

In [3]:
# Show which fields are stored for a repository, using the first entry as a sample.
sample_repo_info = next(iter(repo_info_train_embeddings.values()))
sample_repo_info.keys()

dict_keys(['docs', 'codes', 'structure', 'requirements', 'readme', 'topic', 'codes_embeddings', 'docs_embeddings', 'structure_embeddings', 'requirements_embeddings', 'readme_embeddings'])

## 2. Choosing code embeddings

In [4]:
# Best model: Lazyhope/unixcoder-nine-advtest
# List the models that have code embeddings, using the first repository as a sample.
first_repo_info = next(iter(repo_info_train_embeddings.values()))
first_repo_info["codes_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 3. Choosing doc embeddings

In [5]:
# Best model: Lazyhope/unixcoder-nine-advtest
# List the models that have doc embeddings, using the first repository as a sample.
first_repo_info = next(iter(repo_info_train_embeddings.values()))
first_repo_info["docs_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 4. Choosing structure embeddings

In [6]:
# Best model: sentence-transformers/bert-base-nli-mean-tokens
# List the models that have structure embeddings, using the first repository as a sample.
first_repo_info = next(iter(repo_info_train_embeddings.values()))
first_repo_info["structure_embeddings"].keys()

dict_keys(['sentence-transformers/bert-base-nli-mean-tokens'])

## 5. Choosing readme embeddings

In [7]:
# Best model: Lazyhope/unixcoder-nine-advtest
# List the models that have readme embeddings, using the first repository as a sample.
first_repo_info = next(iter(repo_info_train_embeddings.values()))
first_repo_info["readme_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 6. Choosing requirements embeddings

In [8]:
# Best model: Lazyhope/unixcoder-nine-advtest
# List the models that have requirements embeddings, using the first repository as a sample.
first_repo_info = next(iter(repo_info_train_embeddings.values()))
first_repo_info["requirements_embeddings"].keys()

dict_keys(['Lazyhope/unixcoder-nine-advtest'])

## 7. Similarity calculation

In [9]:
import torch
from torch.nn import CosineSimilarity
from itertools import combinations
from tqdm import tqdm

# Embedding key -> name of the model whose vectors are compared for that aspect.
# Insertion order fixes the order of the similarity values in each result row:
# code, doc, requirement, structure, readme.
BEST_MODELS = {
    "codes_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "docs_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "requirements_embeddings": "Lazyhope/unixcoder-nine-advtest",
    "structure_embeddings": "sentence-transformers/bert-base-nli-mean-tokens",
    "readme_embeddings": "Lazyhope/unixcoder-nine-advtest",
}


def _repo_vector(repo_info, embedding_key, model_name):
    """Return the single vector that represents one aspect of a repository.

    Args:
        repo_info: One repository's entry from ``repo_info_train_embeddings``.
        embedding_key: Aspect to read, e.g. ``"codes_embeddings"``.
        model_name: Key of the model to use inside ``repo_info[embedding_key]``.

    Returns:
        ``repo_info[embedding_key]`` itself when it is already a tensor,
        otherwise the mean over dim 0 of the tensor stored under ``model_name``
        (presumably one row per embedded item -- TODO confirm).
    """
    embeddings = repo_info[embedding_key]
    if isinstance(embeddings, torch.Tensor):
        return embeddings
    return torch.mean(embeddings[model_name], dim=0)


# Each repository takes part in (num_of_repos - 1) pairs, so its per-aspect
# vectors are computed once here instead of being recomputed for every pair.
repo_vectors = {
    repo: {
        embedding_key: _repo_vector(repo_info, embedding_key, model_name)
        for embedding_key, model_name in BEST_MODELS.items()
    }
    for repo, repo_info in repo_info_train_embeddings.items()
}

# dim=0: the two inputs are compared as whole vectors, giving a scalar tensor.
cossim = CosineSimilarity(dim=0, eps=1e-8)
res = []
num_of_repos = len(repo_info_train_embeddings)
# Number of unordered pairs, n * (n - 1) / 2; only used as the progress-bar total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2
for repo1, repo2 in tqdm(combinations(repo_info_train_embeddings.keys(), 2), total=num_of_rows):
    # One cosine similarity per aspect, as plain Python floats.
    similarities = [
        cossim(repo_vectors[repo1][embedding_key], repo_vectors[repo2][embedding_key]).item()
        for embedding_key in BEST_MODELS
    ]

    topic1 = repo_info_train_embeddings[repo1]["topic"]
    topic2 = repo_info_train_embeddings[repo2]["topic"]

    # Row layout: (repo1, repo2, topic1, topic2, code, doc, requirement, structure, readme).
    res.append((repo1, repo2, topic1, topic2, *similarities))

100%|███████████████████████████████████████| 820/820 [00:00<00:00, 1261.18it/s]


In [10]:
import pandas as pd

# Column labels, in the same order as the fields of each tuple in `res`.
similarity_columns = [
    "repo1",
    "repo2",
    "topic1",
    "topic2",
    "code_sim",
    "doc_sim",
    "requirement_sim",
    "structure_sim",
    "readme_sim",
]
df = pd.DataFrame(data=res, columns=similarity_columns)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,ellisonleao/pyshorteners,sloria/doitlive,URL Manipulation,Productivity CLI Tools,0.278231,0.274021,0.625271,0.666088,0.352250
1,ellisonleao/pyshorteners,sebastien/cuisine,URL Manipulation,SSH-style Deployment,0.102119,0.081627,0.638544,0.628838,0.153990
2,ellisonleao/pyshorteners,lepture/authlib,URL Manipulation,OAuth,0.355684,0.059970,0.631954,0.666088,0.375606
3,ellisonleao/pyshorteners,inducer/pudb,URL Manipulation,pdb-like Debugger,0.171610,0.251377,0.549288,0.666088,0.297272
4,ellisonleao/pyshorteners,uber/pyflame,URL Manipulation,Profiler,0.168625,0.172948,0.687557,0.628838,0.265967
...,...,...,...,...,...,...,...,...,...
815,nvbn/thefuck,erikrose/more-itertools,Productivity CLI Tools,Functional Programming,0.361806,0.120328,0.445335,0.793715,0.187190
816,nvbn/thefuck,PyMySQL/mysqlclient-python,Productivity CLI Tools,MySQL,0.318877,0.088801,0.560347,0.789199,0.327744
817,benfred/implicit,erikrose/more-itertools,Recommender Systems,Functional Programming,0.233704,0.233788,0.411217,0.704664,0.306023
818,benfred/implicit,PyMySQL/mysqlclient-python,Recommender Systems,MySQL,0.078420,-0.080451,0.490748,0.721410,0.255893


In [12]:
# Persist the pairwise similarity table as CSV in the working directory.
# NOTE(review): the file name spells "similairity"; it is left unchanged here
# because other code may read this exact path -- confirm before renaming.
df.to_csv("Embeddings_similairity_best_model_test.csv")

In [13]:
# Rank repository pairs from highest to lowest code similarity, renumbering rows from 0.
pairs_by_code_sim = df.sort_values(by="code_sim", ascending=False)
pairs_by_code_sim.reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,DamnWidget/anaconda,python/typeshed,Sublime Text,Static Type Checkers,0.950295,0.293729,0.775944,1.000000,0.724984
1,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937533,0.942031,0.468762,1.000000,0.744216
2,pykong/copier,audreyr/cookiecutter,Productivity CLI Tools,Productivity CLI Tools,0.812354,0.776335,0.756465,0.686534,0.667971
3,robotframework/robotframework,google/python-fire,Testing Frameworks,Command-line Application Development,0.790632,0.425735,0.619527,0.704664,0.312293
4,inducer/pudb,bpython/bpython,pdb-like Debugger,Interactive Interpreter,0.777809,0.600441,0.989589,1.000000,0.489290
...,...,...,...,...,...,...,...,...,...
815,audreyr/cookiecutter,isnowfy/snownlp,Productivity CLI Tools,Natural Language Processing,-0.133114,0.125845,0.536526,0.747487,0.155487
816,platformio/platformio-core,isnowfy/snownlp,Build Tools,Natural Language Processing,-0.134030,0.305345,0.281195,0.810735,0.132751
817,robotframework/robotframework,rossgoodwin/hmap,Testing Frameworks,Image Processing,-0.140391,0.000000,0.450966,0.772761,0.031431
818,sebastien/cuisine,isnowfy/snownlp,SSH-style Deployment,Natural Language Processing,-0.155707,0.355429,1.000000,0.809244,0.240390


In [14]:
# Rank repository pairs from highest to lowest doc similarity, renumbering rows from 0.
pairs_by_doc_sim = df.sort_values(by="doc_sim", ascending=False)
pairs_by_doc_sim.reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937533,0.942031,0.468762,1.000000,0.744216
1,pykong/copier,audreyr/cookiecutter,Productivity CLI Tools,Productivity CLI Tools,0.812354,0.776335,0.756465,0.686534,0.667971
2,DamnWidget/anaconda,bpython/bpython,Sublime Text,Interactive Interpreter,0.534831,0.699506,0.967427,0.704664,0.511509
3,aws/aws-cli,audreyr/cookiecutter,CLI Enhancements,Productivity CLI Tools,0.670380,0.673407,0.784092,0.666088,0.327293
4,pythonnet/pythonnet,erikrose/more-itertools,Microsoft Windows,Functional Programming,0.691411,0.668971,0.468204,1.000000,0.202872
...,...,...,...,...,...,...,...,...,...
815,uber/pyflame,pallets/werkzeug,Profiler,WSGI Servers,0.121525,-0.145982,0.574876,0.704664,0.267820
816,robotframework/robotframework,benfred/implicit,Testing Frameworks,Recommender Systems,0.032338,-0.148065,0.568022,0.704664,0.348604
817,uber/pyflame,dfunckt/django-rules,Profiler,Permissions,0.144969,-0.155627,0.400260,0.704664,0.186315
818,google/python-fire,benfred/implicit,Command-line Application Development,Recommender Systems,0.058952,-0.170791,0.592357,1.000000,0.208566


In [15]:
# Rank repository pairs from highest to lowest requirements similarity, renumbering rows from 0.
pairs_by_requirement_sim = df.sort_values(by="requirement_sim", ascending=False)
pairs_by_requirement_sim.reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,KoffeinFlummi/Chronyk,alecthomas/voluptuous,Date and Time,Data Validation,0.205766,0.075917,1.000000,0.810735,0.210984
1,sebastien/cuisine,isnowfy/snownlp,SSH-style Deployment,Natural Language Processing,-0.155707,0.355429,1.000000,0.809244,0.240390
2,inducer/pudb,bpython/bpython,pdb-like Debugger,Interactive Interpreter,0.777809,0.600441,0.989589,1.000000,0.489290
3,inducer/pudb,DamnWidget/anaconda,pdb-like Debugger,Sublime Text,0.516984,0.586639,0.969381,0.704664,0.456360
4,DamnWidget/anaconda,bpython/bpython,Sublime Text,Interactive Interpreter,0.534831,0.699506,0.967427,0.704664,0.511509
...,...,...,...,...,...,...,...,...,...
815,rossgoodwin/hmap,erikrose/more-itertools,Image Processing,Functional Programming,0.130642,0.000000,0.215664,0.772761,0.085085
816,sebastien/cuisine,rossgoodwin/hmap,SSH-style Deployment,Image Processing,-0.088467,0.000000,0.169106,0.676298,0.109395
817,rossgoodwin/hmap,isnowfy/snownlp,Image Processing,Natural Language Processing,-0.064969,0.000000,0.169106,0.823801,0.110210
818,ztane/python-Levenshtein,rossgoodwin/hmap,Specific Formats Processing - General,Image Processing,0.069659,0.000000,0.133492,0.772761,0.173948


In [16]:
# Rank repository pairs from highest to lowest structure similarity, renumbering rows from 0.
pairs_by_structure_sim = df.sort_values(by="structure_sim", ascending=False)
pairs_by_structure_sim.reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,aws/aws-cli,maxmind/geoip-api-python,CLI Enhancements,Geolocation,0.379597,0.000000,0.627700,1.000000,0.216667
1,graphql-python/graphene,dfunckt/django-rules,GraphQL,Permissions,0.523764,0.195887,0.539073,1.000000,0.339545
2,robotframework/robotframework,pythonnet/pythonnet,Testing Frameworks,Microsoft Windows,0.715819,0.432857,0.639234,1.000000,0.484541
3,platformio/platformio-core,erikrose/more-itertools,Build Tools,Functional Programming,0.155189,0.069888,0.325266,1.000000,0.254913
4,DamnWidget/anaconda,pycco-docs/pycco,Sublime Text,Documentation,0.106003,0.298802,0.645002,1.000000,0.469615
...,...,...,...,...,...,...,...,...,...
815,audreyr/cookiecutter,pycco-docs/pycco,Productivity CLI Tools,Documentation,0.518122,0.271673,0.743239,0.628838,0.318473
816,ellisonleao/pyshorteners,python/typeshed,URL Manipulation,Static Type Checkers,0.087878,0.071500,0.613967,0.628838,0.244056
817,ellisonleao/pyshorteners,pycco-docs/pycco,URL Manipulation,Documentation,0.226302,0.184449,0.624174,0.628838,0.303363
818,ellisonleao/pyshorteners,mininet/mininet,URL Manipulation,Network Virtualization,0.170443,0.268785,0.621750,0.628838,0.198434


In [17]:
# Rank repository pairs from highest to lowest readme similarity, renumbering rows from 0.
pairs_by_readme_sim = df.sort_values(by="readme_sim", ascending=False)
pairs_by_readme_sim.reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,requirement_sim,structure_sim,readme_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.937533,0.942031,0.468762,1.000000,0.744216
1,DamnWidget/anaconda,python/typeshed,Sublime Text,Static Type Checkers,0.950295,0.293729,0.775944,1.000000,0.724984
2,pykong/copier,audreyr/cookiecutter,Productivity CLI Tools,Productivity CLI Tools,0.812354,0.776335,0.756465,0.686534,0.667971
3,bpython/bpython,pythonnet/pythonnet,Interactive Interpreter,Microsoft Windows,0.633047,0.396336,0.754554,1.000000,0.578205
4,sloria/doitlive,nvbn/thefuck,Productivity CLI Tools,Productivity CLI Tools,0.712805,0.550668,0.768428,0.793715,0.563173
...,...,...,...,...,...,...,...,...,...
815,rossgoodwin/hmap,dfunckt/django-rules,Image Processing,Permissions,-0.017761,0.000000,0.255989,0.772761,0.049420
816,pykong/copier,maxmind/geoip-api-python,Productivity CLI Tools,Geolocation,0.254198,0.000000,0.538748,0.709694,0.034258
817,robotframework/robotframework,rossgoodwin/hmap,Testing Frameworks,Image Processing,-0.140391,0.000000,0.450966,0.772761,0.031431
818,FactoryBoy/factory_boy,rossgoodwin/hmap,Object Factories,Image Processing,-0.015340,0.000000,0.508818,0.772761,0.028553
