# Structure similarity experiment

## 1. Install inspect4py

In [1]:
# Print the installed inspect4py version; this confirms the CLI that the
# following cells shell out to is available in the notebook's environment.
!inspect4py --version

inspect4py, version 0.0.8


In [2]:
REPOS = [
    "RepoAnalysis/RepoSnipy",
    "RepoAnalysis/RepoSim"
]
for repo in REPOS:
    !mkdir -p {repo} & & git clone {f"https://github.com/{repo}.git"} {repo}
    !inspect4py -i {repo} -o output/ {repo} -dt

Cloning into 'RepoAnalysis/RepoSnipy'...
remote: Enumerating objects: 82, done.
remote: Counting objects: 100% (26/26), done.
remote: Compressing objects: 100% (17/17), done.
remote: Total 82 (delta 13), reused 20 (delta 9), pack-reused 56
Receiving objects: 100% (82/82), 57.85 MiB | 6.63 MiB/s, done.
Resolving deltas: 100% (34/34), done.
Creating jsDir:output/RepoAnalysis/RepoSnipy/RepoSnipy/json_files
Added in funct/method run_model , argument named st.session_state.github_token, number of argument 2
Creating jsDir:output/RepoAnalysis/RepoSnipy/RepoSnipy/data/json_files
RepoSnipy/
├── .gitignore
├── app.py
├── assets/
│   └── search.gif
├── data/
│   ├── create_index.py
│   ├── index.bin
│   └── repositories.txt
├── evaluate.py
├── LICENSE
├── README.md
└── requirements.txt
Analysis completed
Total number of folders processed (root folder is considered a folder): 2
Total number of files found:  3
Total number of classes found:  2
Total number of dependencies found in thos

## 2. Extract structure information

In [3]:
import json


def get_directory_tree_info(file_path):
    """Return the ``directory_tree`` entry of a JSON file.

    Args:
        file_path: Path to a JSON document whose top level is an object
            (in this notebook, inspect4py's ``directory_info.json``).

    Returns:
        The value stored under the ``"directory_tree"`` key, or ``None`` if
        the key is absent.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so that
    # non-ASCII content such as tree-drawing characters does not depend on
    # the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        json_info = json.load(f)
    return json_info.get("directory_tree")


# Map each repository slug to the directory tree stored in its
# output/<slug>/directory_info.json report.
directory_tree_info = {
    repo: get_directory_tree_info(f"output/{repo}/directory_info.json")
    for repo in REPOS
}

## 3. Generating embeddings

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Directory trees in the same order as REPOS, so row i of the embedding
# tensor corresponds to REPOS[i].
directory_tree_list = [directory_tree_info[repo] for repo in REPOS]

# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

structure_model = SentenceTransformer("all-mpnet-base-v2", device=device)

# Inference only: no gradients are needed for the encoded trees.
with torch.no_grad():
    embedding = structure_model.encode(
        directory_tree_list,
        convert_to_tensor=True,
    )

print(embedding.shape)

torch.Size([2, 768])


## 4. Calculating similarity

In [5]:
from torch.nn import CosineSimilarity

# Each row of `embedding` is a 1-D vector, hence the comparison along dim 0.
cosine = CosineSimilarity(dim=0, eps=1e-8)
first_embedding = embedding[0]
second_embedding = embedding[1]
similarity = cosine(first_embedding, second_embedding)
# Move to host memory and unwrap to a plain Python float for display.
similarity.cpu().numpy().item()

0.8261315822601318