# Requirements embedding evaluation (315 repositories / train set)

## 1. Loading repository information

In [None]:
# Downloading repo_info_train.pkl
!pip install gdown
!gdown 13Z-ReDj4QcwlfvIgUlL6MNE5W2dcPmLr

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only run this on the repo_info_train.pkl fetched by the download cell above.
# The `with` block closes the file on exit; no explicit close() is needed.
with open("repo_info_train.pkl", "rb") as f:
    repo_info = pickle.load(f)

## 2. Downloading pre-trained and fine-tuned models

In [None]:
!pip3 install torch
!pip3 install transformers
# Downloading sentence transformers
!pip3 install sentence-transformers
# Downloading UniXcoder
!wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py
# !curl -O https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

In [None]:
import torch

# Pick the compute backend in priority order: CUDA, then MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

In [None]:
# Checkpoint identifiers, grouped by the loader used for them further down.

# Loaded through the UniXcoder wrapper class.
unixcoder_model_names = [
    "microsoft/unixcoder-base-nine",
    "microsoft/unixcoder-base",
    "microsoft/unixcoder-base-unimodal",
    "Lazyhope/unixcoder-nine-advtest",
    "Lazyhope/unixcoder-clone-detection",
    "Enoch/Unixcoder-Tuned-Code-Search-Py"
]
# Loaded through sentence_transformers.SentenceTransformer.
sentence_transformer_model_names = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/multi-qa-mpnet-base-cos-v1",
    "sentence-transformers/bert-base-nli-mean-tokens",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/all-distilroberta-v1",
    "sentence-transformers/distilbert-base-nli-mean-tokens"
]
# Loaded as a raw transformers DistilBertModel plus its tokenizer.
distil_bert_names = [
    "distilbert-base-uncased"
]

In [None]:
from unixcoder import UniXcoder
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, DistilBertModel

# Registries mapping a checkpoint name to its loaded model object(s).
unixcoder_models = {}
sentence_transformer_models = {}
distil_bert_models = {}

# Initialise unixcoder model: build each wrapper and move it to `device`.
for u_name in unixcoder_model_names:
    unixcoder_models[u_name] = UniXcoder(u_name)
    unixcoder_models[u_name].to(device)

# Initialise sentence transformer model (device is chosen at construction time).
for s_name in sentence_transformer_model_names:
    sentence_transformer_models[s_name] = SentenceTransformer(model_name_or_path=s_name, device=device)

# Initialise DistilBERT model: keep tokenizer and model together per checkpoint.
# A tokenizer has no device, so no `device` argument is passed to it.
# NOTE(review): the DistilBERT model is not moved to `device` here, unlike the
# models above — confirm that is intended before adding a `.to(device)`.
for d_name in distil_bert_names:
    distil_bert_models[d_name] = {
        "tokenizer": AutoTokenizer.from_pretrained(d_name),
        "model": DistilBertModel.from_pretrained(d_name),
    }

## 3. Generating requirements embeddings

In [None]:
# Getting requirements embeddings by UniXcoder
def get_unixcoder_embeddings(data, model):
    """Embed a single text with a UniXcoder-style model.

    Args:
        data: The text to embed; it is wrapped in a one-element list before
            tokenisation.
        model: Object exposing ``tokenize(...)`` and returning a pair when
            called on the id tensor.

    Returns:
        The second element of the pair returned by ``model``.
    """
    token_batch = model.tokenize([data], max_length=512, mode="<encoder-only>")
    id_tensor = torch.tensor(token_batch).to(device)
    _token_level, sentence_level = model(id_tensor)
    return sentence_level

# Getting requirements embeddings by DistilBERT
def get_bert_embeddings(data, bert_dict):
    """Embed text with a DistilBERT tokenizer/model pair via mean pooling.

    Args:
        data: A string or a list of strings to embed.
        bert_dict: Dict holding the ``"tokenizer"`` and ``"model"`` entries.

    Returns:
        A tensor with one pooled vector per input sequence (the mean of the
        last hidden states over that sequence's real tokens).
    """
    tokenizer = bert_dict["tokenizer"]
    model = bert_dict["model"]
    inputs = tokenizer(data, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Feed the model on whatever device it lives on, so this works whether or
    # not the model has been moved off the CPU.
    model_device = next(model.parameters()).device
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions.
        return torch.mean(hidden_states, dim=1)

    # Exclude padding positions from the average. For a single sequence the
    # mask is all ones, so this equals the plain mean.
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings

# Getting other embeddings
def get_other_embeddings(data_list):
    """Embed every entry of ``data_list`` with each registered model.

    Iterates the three module-level registries (UniXcoder, sentence
    transformer, DistilBERT) in that order and returns a dict mapping each
    model name to the tensor of embeddings it produced for ``data_list``.
    """
    results = {}

    # Unixcoder model: one call per entry, rows concatenated afterwards.
    for name, unixcoder in tqdm(unixcoder_models.items()):
        print(f" - Using {name} model - ")
        per_entry = [get_unixcoder_embeddings(entry, unixcoder) for entry in data_list]
        results[name] = torch.cat(per_entry)

    # Sentence transformer model: the whole list is encoded in a single call.
    for name, encoder in tqdm(sentence_transformer_models.items()):
        print(f" - Using {name} model - ")
        results[name] = encoder.encode(data_list, convert_to_tensor=True)

    # Using DistilBERT model: one call per entry, rows concatenated afterwards.
    for name, bert_parts in tqdm(distil_bert_models.items()):
        print(f" - Using {name} model - ")
        per_entry = [get_bert_embeddings(entry, bert_parts) for entry in data_list]
        results[name] = torch.cat(per_entry)

    return results


# Getting embeddings
def get_embeddings(input_data=None, input_mode=None):
    """Return embeddings for ``input_data`` according to ``input_mode``.

    Args:
        input_data: The data to embed. ``None`` or any empty value yields a
            zero placeholder instead of model output.
        input_mode: Which kind of data is being embedded. Only
            ``"requirements"`` is supported.

    Returns:
        A zero tensor of shape ``(768,)`` on ``device`` when there is no input
        data; otherwise the dict produced by ``get_other_embeddings``.

    Raises:
        ValueError: If ``input_data`` is non-empty and ``input_mode`` is not a
            supported mode. (Previously this case silently returned ``None``,
            which only failed later when the result was used.)
    """
    if not input_data:
        # Placeholder so repositories without data still carry a vector.
        return torch.zeros((768,), device=device)

    if input_mode != "requirements":
        raise ValueError(f"Unsupported input_mode: {input_mode!r}")

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        return get_other_embeddings(input_data)


In [None]:
from tqdm import tqdm

# Generating requirements embeddings: each repository's entry gains a
# "requirements_embeddings" key computed from its "requirements" value.
for repo_name, repo_dict in tqdm(repo_info.items()):
    print(f" - Generating requirements embeddings for {repo_name} - ")
    requirements = repo_dict["requirements"]
    repo_dict["requirements_embeddings"] = get_embeddings(
        input_data=requirements,
        input_mode="requirements",
    )

## 4. Saving requirements embeddings

In [None]:
# Move every stored embedding tensor to the CPU before saving.
save_device = "cpu"

for repo_name, repo_dict in repo_info.items():
    stored = repo_dict["requirements_embeddings"]
    if type(stored) is torch.Tensor:
        # Bare tensor (the placeholder case): replace it directly.
        repo_dict["requirements_embeddings"] = stored.to(save_device)
        continue
    # Otherwise a dict of model name -> tensor: replace each value in place.
    for model_name in list(stored):
        stored[model_name] = stored[model_name].to(save_device)


In [None]:
# Print the device index of every stored embedding tensor as a sanity check.
for repo_name, repo_dict in repo_info.items():
    stored = repo_dict["requirements_embeddings"]
    # Treat the bare-tensor case as a one-element collection so both shapes
    # of entry go through the same print.
    tensors = [stored] if type(stored) is torch.Tensor else stored.values()
    for tensor in tensors:
        print(tensor.get_device())


In [None]:
# Saving to repo_info_train_requirements_embeddings.pkl
# The `with` block closes the file on exit; no explicit close() is needed.
with open("repo_info_train_requirements_embeddings.pkl", "wb") as f:
    pickle.dump(repo_info, f)

## 5. Calculating similarity

In [None]:
from torch.nn import CosineSimilarity
from itertools import combinations
import pandas as pd

cossim = CosineSimilarity(dim=0, eps=1e-8)
res = []
num_of_repos = len(repo_info)
# Number of unordered repository pairs, used as the progress-bar total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2

models = unixcoder_model_names + sentence_transformer_model_names + distil_bert_names


def _repo_vector(requirements_embeddings, model_name):
    """Return the single vector representing a repository for one model.

    A bare tensor (the placeholder stored for repositories without
    requirements) is used as is; otherwise the per-requirement embeddings of
    ``model_name`` are averaged along dim 0.
    """
    if isinstance(requirements_embeddings, torch.Tensor):
        return requirements_embeddings
    return torch.mean(requirements_embeddings[model_name], dim=0)


# Compute each repository's vector once per model instead of once per pair:
# the mean does not depend on the other repository in the pair.
repo_vectors = {
    name: {
        model_name: _repo_vector(info["requirements_embeddings"], model_name)
        for model_name in models
    }
    for name, info in repo_info.items()
}

for repo1, repo2 in tqdm(combinations(repo_info.keys(), 2), total=num_of_rows):
    row = [repo1, repo2, repo_info[repo1]["topic"], repo_info[repo2]["topic"]]
    for model_name in models:
        similarity = cossim(repo_vectors[repo1][model_name], repo_vectors[repo2][model_name]).item()
        row.append(similarity)

    res.append(row)

# One row per repository pair, one similarity column per model.
df = pd.DataFrame(res, columns=["repo1", "repo2", "topic1", "topic2"] + models)
df

In [None]:
# Discard pairs with a missing similarity in any model column, then renumber
# the remaining rows from zero.
df = df.dropna(subset=models).reset_index(drop=True)
df

In [None]:
# Saving similarity calculation
# to_csv is called with its defaults, so the DataFrame index is written as well.
df.to_csv("requirements_embedding_evaluation_train_315.csv")

## 6. Evaluating different models

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# AUC per model name, filled in by roc().
model_auc = {}
# Positive class: both repositories of the pair share the same topic.
y_true = df['topic1'] == df['topic2']
fig, ax = plt.subplots(figsize=(12, 10))


def roc(model_name):
    """Plot the ROC curve of one model and record its AUC in ``model_auc``.

    The model's similarity column in ``df`` is used as the score and
    ``y_true`` as the ground truth.
    """
    y_score = df[model_name]
    fpr, tpr, _ = roc_curve(y_true, y_score)
    model_auc[model_name] = auc(fpr, tpr)

    ax.plot(fpr, tpr, label=model_name)


for model in models:
    roc(model)

# Order labels by each model's auc score (highest first). This is done before
# the diagonal is drawn because that line has no entry in model_auc.
handles, labels = ax.get_legend_handles_labels()
order = sorted(range(len(labels)), key=lambda k: -model_auc[labels[k]])
ordered_handles = [handles[idx] for idx in order]
ordered_labels = [f'{labels[idx]} (AUC = {model_auc[labels[idx]]:.3f})' for idx in order]

# Chance-level diagonal, appended last so it shows up in the legend too.
(random_guess_line,) = ax.plot([0, 1], [0, 1], 'k--', label='Random guess')
ax.legend(
    ordered_handles + [random_guess_line],
    ordered_labels + ['Random guess'],
    loc="lower right",
)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic of different models')
# Saving evaluation result
fig.savefig('roc_requirements_evaluation_train_315.png')
plt.show()

In [None]:
# Display the AUC recorded for each model (model name -> AUC).
model_auc