In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# tokenizer and the encoder are always loaded from the same source.
PRETRAINED_MODEL = "microsoft/codebert-base"

# Load the tokenizer first, then the encoder (same order as evaluation of the
# original tuple assignment).
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModel.from_pretrained(PRETRAINED_MODEL)

In [None]:
def get_code_embedding(code: str):
    """Embed code with the module-level `model` using mask-aware mean pooling.

    The snippet is tokenized (with truncation and padding enabled), run through
    the encoder without gradient tracking, and the per-token hidden states are
    averaged over the sequence dimension. Only positions flagged by the
    tokenizer's attention mask contribute to the average, so padding tokens
    never dilute the embedding when several snippets of different lengths are
    encoded together.

    Args:
        code: The source code to embed. A list of snippets is also accepted by
            the tokenizer call, in which case one embedding row is produced
            per snippet.

    Returns:
        A float tensor of shape [batch, hidden_size] (batch is 1 for a single
        string) holding the mean-pooled token embeddings.
    """
    inputs = tokenizer(code, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        # Expand the mask to [batch, seq_len, 1] so it broadcasts over the
        # hidden dimension; padded positions are zeroed out of the sum.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # Guard against division by zero should a row ever have an empty mask.
        token_counts = mask.sum(dim=1).clamp(min=1)
        return summed / token_counts  # [batch, hidden_size]

In [None]:
# Example snippets: code_1 and code_2 implement the same addition under
# different names, while code_3 is an unrelated print statement.
code_1 = "def add(a, b): return a + b"
code_2 = "def sum(x, y): return x + y"
code_3 = "print('Hello, World!')"

# Positive pair = the two addition functions; negative pair = addition vs. print.
ppair = (code_1, code_2)
npair = (code_1, code_3)

# Embed every snippet of each pair, keeping the pair order.
ppair_emb = tuple(get_code_embedding(snippet) for snippet in ppair)
npair_emb = tuple(get_code_embedding(snippet) for snippet in npair)

# Cosine similarity and pairwise distance between the two embeddings of each
# pair, using the default arguments of both torch functions.
p_cosine_sim = F.cosine_similarity(ppair_emb[0], ppair_emb[1])
n_cosine_sim = F.cosine_similarity(npair_emb[0], npair_emb[1])
p_pair_dist = F.pairwise_distance(ppair_emb[0], ppair_emb[1])
n_pair_dist = F.pairwise_distance(npair_emb[0], npair_emb[1])

# Report both metrics in the same layout: a header line, then the positive
# and negative pair values.
for _title, _positive, _negative in (
    ('Cosine similarity:', p_cosine_sim, n_cosine_sim),
    ('Pairwise distance:', p_pair_dist, n_pair_dist),
):
    print(_title)
    print('- positive pair:', _positive)
    print('- negative pair:', _negative)