# Code embedding evaluation (315 repositories / train set)

## 1. Loading repository information

In [1]:
# Install gdown, then fetch the pickled repository information from Google Drive
# by file ID (the recorded output shows it being saved as repo_info_train.pkl).
!pip install gdown
!gdown 1sJDBLvVRvl4Sx0ICGuoM27pREIg0M756

Downloading...
From (uriginal): https://drive.google.com/uc?id=1sJDBLvVRvl4Sx0ICGuoM27pREIg0M756
From (redirected): https://drive.google.com/uc?id=1sJDBLvVRvl4Sx0ICGuoM27pREIg0M756&confirm=t&uuid=f987bdc1-ac35-49b7-bee8-36f90dc2cb10
To: /cs/home/hz65/PycharmProjects/RepoSim4Py/Embedding/Embedding_model_train_315/Code_embedding_evaluation_train_315/repo_info_train.pkl
100%|████████████████████████████████████████| 201M/201M [00:03<00:00, 61.3MB/s]


In [2]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load repo_info_train.pkl when the download source is trusted.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("repo_info_train.pkl", "rb") as f:
    repo_info = pickle.load(f)

## 2. Downloading pre-trained models

In [3]:
# Install the libraries used to load and run the models.
!pip3 install torch
!pip3 install transformers
# Fetch the UniXcoder wrapper module. -O pins the output file name so that
# re-running this cell overwrites unixcoder.py; without it wget keeps the old
# file and saves the new copy as unixcoder.py.1, unixcoder.py.2, ..., which
# `from unixcoder import UniXcoder` never picks up.
!wget -O unixcoder.py https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py
# !curl -O https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

--2023-07-05 13:30:17--  https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10352 (10K) [text/plain]
Saving to: ‘unixcoder.py.2’


2023-07-05 13:30:18 (47.7 MB/s) - ‘unixcoder.py.2’ saved [10352/10352]



In [4]:
import torch

# Automatic device selection (CUDA first, then Apple MPS, then CPU) is kept
# below for reference but is disabled.
# device = (
#     "cuda"
#     if torch.cuda.is_available()
#     else "mps"
#     if torch.backends.mps.is_available()
#     else "cpu"
# )
# The device is pinned to the CPU; this name is read by the cells that place
# the models and create tensors.
device = "cpu"
print(device)

cpu


In [5]:
# Checkpoint names that are loaded through the UniXcoder wrapper class
# (presumably Hugging Face Hub identifiers -- confirm against the model cards).
unixcoder_model_names = [
    "microsoft/unixcoder-base-nine",
    "microsoft/unixcoder-base",
    "microsoft/unixcoder-base-unimodal",
    "Lazyhope/unixcoder-nine-advtest",
    "Lazyhope/unixcoder-clone-detection",
    "Enoch/Unixcoder-Tuned-Code-Search-Py"
]

# Checkpoint names that are loaded through AutoTokenizer / AutoModel.
# Both stay plain lists because they are concatenated with `+` further down.
bert_model_names = [
    "Enoch/cocosoda-graphcodebert",
    "Enoch/graphcodebert-py"
]

In [6]:
from unixcoder import UniXcoder
from transformers import AutoTokenizer, AutoModel

# Initialise the UniXcoder models: one wrapper instance per checkpoint name,
# each placed on `device`.
unixcoder_models = {}
for u_name in unixcoder_model_names:
    unixcoder_models[u_name] = UniXcoder(u_name)
    unixcoder_models[u_name].to(device)

# Initialise the GraphCodeBERT models: each entry pairs a tokenizer with its model.
bert_models = {}
for b_name in bert_model_names:
    bert_models[b_name] = {
        # Tokenizers hold no tensors, so no device argument is passed to them.
        "tokenizer": AutoTokenizer.from_pretrained(b_name),
        # Place the model on `device`, exactly as is done for the UniXcoder
        # models above; inputs fed to it must live on the same device.
        "model": AutoModel.from_pretrained(b_name).to(device),
    }

  from .autonotebook import tqdm as notebook_tqdm


## 3. Generating code embeddings

In [7]:
def get_unixcoder_embeddings(code, model):
    """Embed a single code string with a UniXcoder-style model.

    Args:
        code: The source text to embed.
        model: Object exposing ``tokenize(list_of_str, max_length=..., mode=...)``
            and returning a 2-tuple when called on a tensor of token ids.

    Returns:
        The second element of the model's output tuple (presumably the
        sequence-level embedding -- confirm against unixcoder.py).
    """
    # The model API is batch-oriented, so the snippet is wrapped in a one-item list.
    token_batch = model.tokenize([code], max_length=512, mode="<encoder-only>")
    id_tensor = torch.tensor(token_batch).to(device)
    _first_output, code_vector = model(id_tensor)
    return code_vector

def get_bert_embeddings(code, bert_dict):
    """Embed ``code`` with a tokenizer/model pair by averaging the last hidden states.

    Args:
        code: Source text handed to the tokenizer.
        bert_dict: Mapping with a ``"tokenizer"`` entry and a ``"model"`` entry.

    Returns:
        The mean of ``outputs.last_hidden_state`` over dimension 1.
    """
    tokenizer = bert_dict["tokenizer"]
    model = bert_dict["model"]
    inputs = tokenizer(code, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Feed the inputs on whichever device the model's weights live on, so the
    # forward pass works wherever the model has been placed.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {key: value.to(first_param.device) for key, value in inputs.items()}

    outputs = model(**inputs)
    return torch.mean(outputs.last_hidden_state, dim=1)

def get_code_list_embeddings(data_list):
    """Embed every snippet in ``data_list`` with every loaded model.

    Args:
        data_list: Non-empty sequence of code strings (``torch.concat`` raises
            on an empty list of tensors).

    Returns:
        Dict mapping each model name to the concatenation of the per-snippet
        embeddings produced for that model.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported tqdm before it is called.
    from tqdm import tqdm

    codes_embeddings = {}
    # (model registry, embedding function) pairs; UniXcoder models are
    # processed first, then the GraphCodeBERT models.
    model_families = (
        (unixcoder_models, get_unixcoder_embeddings),
        (bert_models, get_bert_embeddings),
    )
    for registry, embed in model_families:
        for name, handle in tqdm(registry.items()):
            print(f" - Using {name} model - ")
            codes_embeddings[name] = torch.concat([embed(code, handle) for code in data_list])

    return codes_embeddings


def get_embeddings(input_data=None, input_mode=None):
    """Dispatch embedding generation for one repository field.

    Args:
        input_data: The items to embed; ``None`` or an empty collection yields
            a placeholder vector.
        input_mode: Kind of input. Only ``"codes"`` is supported.

    Returns:
        A zero tensor of shape ``(768,)`` on ``device`` when ``input_data`` is
        ``None`` or empty (768 is presumably the models' hidden size -- TODO
        confirm); otherwise the dict returned by ``get_code_list_embeddings``.

    Raises:
        ValueError: If ``input_data`` is non-empty and ``input_mode`` is not
            supported. Previously this case fell through and returned ``None``.
    """
    # `not input_data` already covers None as well as empty collections.
    if not input_data:
        return torch.zeros((768,), device=device)

    if input_mode != "codes":
        raise ValueError(f"Unsupported input_mode: {input_mode!r}")

    # Inference only: gradient tracking is switched off while embedding.
    with torch.no_grad():
        return get_code_list_embeddings(input_data)


In [None]:
from tqdm import tqdm

# Attach the code embeddings to every repository record, showing overall progress.
for name, info in tqdm(repo_info.items()):
    print(f" - Generating code embeddings for {name} - ")
    embeddings = get_embeddings(input_mode="codes", input_data=info["codes"])
    info["codes_embeddings"] = embeddings

  0%|                                                   | 0/315 [00:00<?, ?it/s]

 - Generating code embeddings for jet-admin/jet-bridge - 



  0%|                                                     | 0/6 [00:00<?, ?it/s][A

 - Using microsoft/unixcoder-base-nine model - 



 17%|███████▌                                     | 1/6 [00:17<01:25, 17.12s/it][A

 - Using microsoft/unixcoder-base model - 



 33%|███████████████                              | 2/6 [00:34<01:08, 17.08s/it][A

 - Using microsoft/unixcoder-base-unimodal model - 



 50%|██████████████████████▌                      | 3/6 [00:51<00:51, 17.19s/it][A

 - Using Lazyhope/unixcoder-nine-advtest model - 



 67%|██████████████████████████████               | 4/6 [01:08<00:34, 17.21s/it][A

 - Using Lazyhope/unixcoder-clone-detection model - 



 83%|█████████████████████████████████████▌       | 5/6 [01:26<00:17, 17.27s/it][A

 - Using Enoch/Unixcoder-Tuned-Code-Search-Py model - 



100%|█████████████████████████████████████████████| 6/6 [01:43<00:00, 17.27s/it][A

  0%|                                                     | 0/2 [00:00<?, ?it/s][A

 - Using Enoch/cocosoda-graphcodebert model - 



 50%|██████████████████████▌                      | 1/2 [00:22<00:22, 22.30s/it][A

 - Using Enoch/graphcodebert-py model - 



100%|█████████████████████████████████████████████| 2/2 [00:44<00:00, 22.31s/it][A
  0%|                                       | 1/315 [02:28<12:55:57, 148.27s/it]

 - Generating code embeddings for patrys/httmock - 



  0%|                                                     | 0/6 [00:00<?, ?it/s][A

 - Using microsoft/unixcoder-base-nine model - 



 17%|███████▌                                     | 1/6 [00:01<00:08,  1.69s/it][A

 - Using microsoft/unixcoder-base model - 



 33%|███████████████                              | 2/6 [00:03<00:06,  1.71s/it][A

 - Using microsoft/unixcoder-base-unimodal model - 



 50%|██████████████████████▌                      | 3/6 [00:05<00:05,  1.70s/it][A

 - Using Lazyhope/unixcoder-nine-advtest model - 



 67%|██████████████████████████████               | 4/6 [00:06<00:03,  1.70s/it][A

 - Using Lazyhope/unixcoder-clone-detection model - 



 83%|█████████████████████████████████████▌       | 5/6 [00:08<00:01,  1.69s/it][A

 - Using Enoch/Unixcoder-Tuned-Code-Search-Py model - 



100%|█████████████████████████████████████████████| 6/6 [00:10<00:00,  1.70s/it][A

  0%|                                                     | 0/2 [00:00<?, ?it/s][A

 - Using Enoch/cocosoda-graphcodebert model - 



 50%|██████████████████████▌                      | 1/2 [00:01<00:01,  1.97s/it][A

 - Using Enoch/graphcodebert-py model - 



100%|█████████████████████████████████████████████| 2/2 [00:03<00:00,  1.96s/it][A
  1%|▎                                        | 2/315 [02:42<6:01:45, 69.35s/it]

 - Generating code embeddings for pytransitions/transitions - 



  0%|                                                     | 0/6 [00:00<?, ?it/s][A

 - Using microsoft/unixcoder-base-nine model - 


## 4. Saving code embeddings

In [None]:
# Move every stored embedding onto the CPU before the data is saved.
save_device = "cpu"

for info in repo_info.values():
    stored = info["codes_embeddings"]
    if type(stored) is torch.Tensor:
        # A bare tensor is stored directly under the key.
        info["codes_embeddings"] = stored.to(save_device)
        continue
    # Otherwise it is a dict of model name -> tensor; replace the values in place.
    for key in list(stored):
        stored[key] = stored[key].to(save_device)

In [None]:
# Print the device index reported by each stored embedding tensor.
for info in repo_info.values():
    stored = info["codes_embeddings"]
    # Normalise both storage layouts (bare tensor / dict of tensors) to an iterable.
    tensors = [stored] if type(stored) is torch.Tensor else stored.values()
    for tensor in tensors:
        print(tensor.get_device())

In [None]:
# Persist the repository records together with their embeddings.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("repo_info_train_code_embeddings.pkl", "wb") as f:
    pickle.dump(repo_info, f)

## 5. Calculating similarity

In [None]:
from torch.nn import CosineSimilarity
from itertools import combinations
import pandas as pd

cossim = CosineSimilarity(dim=0, eps=1e-8)
res = []
num_of_repos = len(repo_info)
# Number of unordered repository pairs; used as the progress-bar total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2

model_names = unixcoder_model_names + bert_model_names

# Reduce every repository to one vector per model once, up front. Doing this
# inside the pair loop would repeat the same mean for every pair a repository
# takes part in.
repo_vectors = {}
for repo, info in repo_info.items():
    stored = info["codes_embeddings"]
    if type(stored) is torch.Tensor:
        # A bare tensor is not keyed by model, so it is used for every model.
        repo_vectors[repo] = {model_name: stored for model_name in model_names}
    else:
        # Average the per-snippet embeddings (dimension 0) into one repository vector.
        repo_vectors[repo] = {
            model_name: torch.mean(stored[model_name], dim=0) for model_name in model_names
        }

# One row per repository pair: names, topics, then one cosine similarity per model.
for repo1, repo2 in tqdm(combinations(repo_info.keys(), 2), total=num_of_rows):
    row = [repo1, repo2, repo_info[repo1]["topic"], repo_info[repo2]["topic"]]
    for model_name in model_names:
        similarity = cossim(repo_vectors[repo1][model_name], repo_vectors[repo2][model_name]).item()
        row.append(similarity)

    res.append(row)

df = pd.DataFrame(res, columns=["repo1", "repo2", "topic1", "topic2"] + model_names)
df

In [None]:
# Discard pairs that lack a similarity score for any model, then renumber the rows.
df = df.dropna(subset=model_names).reset_index(drop=True)
df

In [None]:
# Persist the pairwise similarity table; with pandas' default settings the
# DataFrame index is written as the first column.
df.to_csv("code_embedding_evaluation_train_315.csv")

## 6. Evaluating different models

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# AUC per model, filled in by roc().
model_auc = {}
# A pair counts as positive when both repositories share the same topic.
y_true = df['topic1'] == df['topic2']
fig = plt.figure(figsize=(12, 10))


def roc(model_name):
    """Plot the ROC curve of one model and record its AUC in ``model_auc``.

    Args:
        model_name: Column of ``df`` holding that model's similarity scores;
            also used as the curve's legend label.
    """
    y_score = df[model_name]
    fpr, tpr, _ = roc_curve(y_true, y_score)
    model_auc[model_name] = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=model_name)


for model in model_names:
    roc(model)

# Order labels by each model's auc score (best first). The handles are
# collected before the chance line is drawn, so every label is a model name.
handles, labels = plt.gca().get_legend_handles_labels()
order = sorted(range(len(labels)), key=lambda k: -model_auc[labels[k]])
ordered_handles = [handles[idx] for idx in order]
ordered_labels = [f'{labels[idx]} (AUC = {model_auc[labels[idx]]:.3f})' for idx in order]

# Draw the chance diagonal and append it to the legend explicitly; building
# the legend before this line was plotted left 'Random guess' out of it.
(chance_line,) = plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.legend(ordered_handles + [chance_line], ordered_labels + ['Random guess'], loc="lower right")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of different models')
plt.savefig('roc_code_evaluation_train_315.eps')
plt.show()

In [None]:
# Display the AUC recorded for each model.
model_auc