<a href="https://colab.research.google.com/github/RepoAnalysis/RepoSim/blob/main/notebooks/PlayGround.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies for the script

In [None]:
# Install python 3.9 on colab
!sudo apt-get update -y
!sudo apt-get install python3.9 python3-pip

# Install inspect4py
!python3.9 -m pip install -U inspect4py==0.0.6 sentence-transformers pandas

# Download scripts
%cd /content/
# !wget https://raw.githubusercontent.com/RepoAnalysis/RepoSim/main/scripts/repo_sim.py # TODO: uncomment this when repo is public
!wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

## Run the RepoSim script to extract embeddings

In [None]:
# Specify repositories here
# As invoked below: `-i` is followed by the list of repositories (written as
# "owner/name"), `-o` is given the directory /content/, and `--eval` is passed
# as a bare flag.
# NOTE(review): the exact meaning of `-o` and `--eval` is defined by repo_sim.py,
# which is not shown in this notebook - confirm against the script.
# NOTE: this requires /content/repo_sim.py to exist; the `wget` line that would
# download it in the install cell is currently commented out.
!python3.9 /content/repo_sim.py -i lepture/authlib idan/oauthlib evonove/django-oauth-toolkit selwin/python-user-agents SmileyChris/django-countries django-compressor/django-compressor -o /content/ --eval

In [None]:
import pickle

# Load repo_info generated by the script
# (presumably written to /content/ by the repo_sim.py run above via `-o /content/`).
# NOTE: unpickling can execute arbitrary code - only load a file you produced yourself.
with open("/content/repo_info.pkl", "rb") as f:
    repo_info = pickle.load(f)

# Display the top-level keys; the experiment cells below iterate over them as repo names.
repo_info.keys()

## Perform various experiments with `repo_info`

### Experiment 1: Find the top-k most similar repos for every repo by comparing a specific type of embedding

In [7]:
import torch
from sentence_transformers import util

def top_k(key, k=1):
    """Find, for every repository, its ``k`` most similar *other* repositories.

    The embedding stored under ``key`` in the global ``repo_info`` mapping is
    compared across repositories with ``sentence_transformers.util.semantic_search``
    (cosine similarity with the library's default score function).
    Repositories whose entry has no value (or ``None``) for ``key`` are left
    out of both the queries and the corpus.

    Args:
        key: Name of the embedding field to compare, e.g. ``"code_embeddings"``.
        k: Maximum number of neighbours to return per repository.

    Returns:
        ``{repo_name: [(similar_repo, score), ...]}`` with each list ordered by
        decreasing score. A list holds fewer than ``k`` entries when fewer
        other repositories are available. An empty dict is returned when no
        repository has an embedding for ``key``.
    """
    repos = [repo for repo, info in repo_info.items() if info.get(key) is not None]
    if not repos:
        # torch.stack / torch.concat reject an empty sequence; nothing to compare.
        return {}

    # One row per repository, in the same order as `repos`.
    corpus_embeddings = torch.stack([repo_info[repo][key] for repo in repos])

    res = {}  # {repo_name: [(similar_repo1, score1), ...]}
    for i, this in enumerate(repos):
        # Ask for one extra hit because the query itself is part of the corpus.
        hits = util.semantic_search(corpus_embeddings[i], corpus_embeddings, top_k=k + 1)[0]
        # Drop the query by index rather than assuming it is ranked first:
        # on a score tie (e.g. duplicate embeddings) another repo may come first.
        neighbours = [
            (repos[hit["corpus_id"]], hit["score"])
            for hit in hits
            if hit["corpus_id"] != i
        ]
        res[this] = neighbours[:k]

    return res

In [8]:
import pandas as pd

def most_similar(key="code_embeddings"):
    """Tabulate the single most similar repository for each repository.

    Args:
        key: Name of the embedding field passed on to ``top_k``.

    Returns:
        A ``pandas.DataFrame`` with columns ``repo1`` (the query repository),
        ``repo2`` (its best match) and ``score`` (the similarity score reported
        by ``top_k``). Repositories for which ``top_k`` found no neighbour are
        omitted, so the frame is empty when fewer than two repositories carry
        the requested embedding.
    """
    rows = [
        (repo, matches[0][0], matches[0][1])
        for repo, matches in top_k(key=key, k=1).items()
        if matches  # no neighbour available -> skip instead of IndexError
    ]

    return pd.DataFrame(rows, columns=["repo1", "repo2", "score"])

most_similar("code_embeddings")

Unnamed: 0,repo1,repo2,score
0,lepture/authlib,idan/oauthlib,0.936137
1,idan/oauthlib,lepture/authlib,0.936137
2,evonove/django-oauth-toolkit,idan/oauthlib,0.92253
3,selwin/python-user-agents,idan/oauthlib,0.295096
4,SmileyChris/django-countries,evonove/django-oauth-toolkit,0.382237
5,django-compressor/django-compressor,SmileyChris/django-countries,0.30367


In [9]:
most_similar("doc_embeddings")

Unnamed: 0,repo1,repo2,score
0,lepture/authlib,idan/oauthlib,0.942527
1,idan/oauthlib,lepture/authlib,0.942527
2,evonove/django-oauth-toolkit,idan/oauthlib,0.838541
3,SmileyChris/django-countries,django-compressor/django-compressor,0.178134
4,django-compressor/django-compressor,SmileyChris/django-countries,0.178134
