In [1]:
import sys
sys.path.append('../src')

In [2]:
from utils import get_combined_df, tokenize, reverse_tokenize
from bm25 import search, mean_reciprocal_rank, precision_at_k, average_precision_score
# from pyserini.index.lucene import IndexReader
from pyserini.search.lucene import LuceneSearcher
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the combined dataframe for a repository
repo_dir = "../smalldata/fbr"
idx_path = "../smalldata/fbr/index_commit_tokenized"

In [4]:
df = get_combined_df(repo_dir)

In [5]:
def sample_query(df, seed=42):
    """
    Sample a query from the dataframe
    """
    sampled_commit = df.drop_duplicates(subset='commit_id').sample(1, random_state=seed).iloc[0]
    return {
        'commit_message': sampled_commit['commit_message'],
        'commit_id': sampled_commit['commit_id'],
        'commit_date': sampled_commit['commit_date'],
        'actual_files_modified': df[df['commit_id'] == sampled_commit['commit_id']]['file_path'].tolist()
    }

In [6]:
def print_search_results(query, results, showOnlyActualModified = False):
    """
    Print the search results
    """
    actual_modified_files = query['actual_files_modified']
    for i in range(len(results)):
    # print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')
    # print with repo name and file name
        obj = json.loads(results[i].raw)
        commit_date = int(obj["commit_date"])
        if showOnlyActualModified:
            if obj["file_path"] in actual_modified_files:
                print(f'{i+1:2} {obj["file_path"]:4} {results[i].score:.5f} {commit_date}')
        else:
            print(f'{i+1:2} {obj["file_path"]:4} {results[i].score:.5f} {commit_date}')


In [7]:
def evaluate(hits, actual_modified_files, k=1000):
    retrieved_files = [json.loads(hit.raw)['file_path'] for hit in hits]
    relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]

    metrics = {
        'MAP': average_precision_score(relevant, [1]*len(relevant)) if any(relevant) else 0,
        'P@10': precision_at_k(relevant, 10),
        'P@100': precision_at_k(relevant, 100),
        'P@1000': precision_at_k(relevant, 1000),
        'MRR': mean_reciprocal_rank(relevant),
        f'Recall@{k}': len(set(file for idx, file in enumerate(retrieved_files) if relevant[idx] == 1)) / len(actual_modified_files)
    }

    return {k: round(v, 4) for k, v in metrics.items()}

In [8]:
# Sample a random query from the dataframe
query = sample_query(df, seed=108)
# Print the sample query's details
query

{'commit_message': 'fix typo on inline comment (#13364)\n\n',
 'commit_id': 'e07a3cd28f3b360755d6832a01d724a3b7576bc4',
 'commit_date': 1533916266,
 'actual_files_modified': ['packages/react-reconciler/src/ReactFiberScheduler.js']}

In [9]:
searcher = LuceneSearcher(idx_path)

In [10]:
search_results = search(searcher, query['commit_message'], query['commit_date'], 1000)

In [11]:
print_search_results(query, search_results , showOnlyActualModified=True)

174 packages/react-reconciler/src/ReactFiberScheduler.js 9.77590 1533746573


In [12]:
evaluate(search_results, query['actual_files_modified'])

{'MAP': 0.0014,
 'P@10': 0.0,
 'P@100': 0.0,
 'P@1000': 0.001,
 'MRR': 0.0057,
 'Recall@1000': 1.0}

# Reranking

In [20]:
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaConfig, RobertaModel
import torch

## bert-base-uncased

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 labels: Relevant or Not relevant

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def get_bert_input(query_message, commit_message):
    """
    Convert the query and commit message into BERT input format.
    """
    return tokenizer(query_message + " [SEP] " + commit_message, return_tensors="pt", truncation=True, padding=True, max_length=512)

In [16]:
def get_relevance_score(input_data):
    """
    Use BERT to predict the relevance score.
    """
    with torch.no_grad():
        logits = model(**input_data).logits[0]
    return logits[1] - logits[0]  # Difference between scores for "relevant" and "not relevant"

In [17]:
def rerank_with_bert(query_message, search_results):
    """
    Rerank the BM25 search results using BERT.
    """
    scores = []
    for hit in search_results:
        # commit_message = json.loads(hit.raw)["commit_message"]
        commit_message =reverse_tokenize(json.loads(hit.raw)['contents'])
        bert_input = get_bert_input(query_message, commit_message)
        relevance_score = get_relevance_score(bert_input)
        scores.append(relevance_score)

    reranked_results = [hit for _, hit in sorted(zip(scores, search_results), key=lambda pair: pair[0], reverse=True)]

    return reranked_results

In [18]:
# Use BERT to rerank the BM25 search results
reranked_results = rerank_with_bert(query['commit_message'], search_results)

# Print the reranked results
print("Reranked results:")
print_search_results(query, reranked_results)

Reranked results:
 1 packages/react-dom/node-stream.js 7.26790 1498368702
 2 scripts/rollup/bundles.js 7.26790 1498368702
 3 src/node_modules/react-dom/node-stream.js 7.26790 1498368702
 4 src/node_modules/react-dom/server.js 7.26790 1498368702
 5 src/renderers/dom/ReactDOMNodeStreamEntry.js 7.26790 1498368702
 6 src/renderers/dom/ReactDOMNodeStreamRenderer.js 7.26790 1498368702
 7 src/renderers/dom/ReactDOMServerEntry.js 7.26789 1498368702
 8 src/renderers/dom/ReactDOMStringRenderer.js 7.26789 1498368702
 9 src/renderers/dom/shared/__tests__/ReactDOMServerIntegration-test.js 7.26789 1498368702
10 src/renderers/shared/server/ReactPartialRenderer.js 7.26789 1498368702
11 src/isomorphic/children/flattenChildren.js 8.20580 1499899781
12 src/renderers/__tests__/ReactChildReconciler-test.js 8.20580 1499899781
13 src/renderers/__tests__/ReactMultiChild-test.js 8.20580 1499899781
14 src/renderers/shared/fiber/ReactChildFiber.js 8.20580 1499899781
15 src/renderers/shared/stack/reconciler/React

In [19]:
print_search_results(query, reranked_results, showOnlyActualModified=True)

672 packages/react-reconciler/src/ReactFiberScheduler.js 9.77590 1533746573


In [22]:
evaluate(reranked_results, query['actual_files_modified'])

{'MAP': 0.0014,
 'P@10': 0.0,
 'P@100': 0.0,
 'P@1000': 0.001,
 'MRR': 0.0015,
 'Recall@1000': 1.0}

## codebert-base

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.to(device)
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [31]:
def get_codebert_embedding(text1, text2):
    """
    Get the embeddings from CodeBERT for a given pair of text
    """
    tokens1 = tokenizer.tokenize(text1)
    tokens2 = tokenizer.tokenize(text2)

    # Ensure the combined length doesn't exceed the model's max input length (512 tokens)
    max_tokens_for_text2 = 512 - len(tokens1) - 3  # -3 for [CLS], [SEP], and [EOS] tokens
    if len(tokens2) > max_tokens_for_text2:
        tokens2 = tokens2[:max_tokens_for_text2]

    tokens = [tokenizer.cls_token] + tokens1 + [tokenizer.sep_token] + tokens2 + [tokenizer.eos_token]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    with torch.no_grad():
        context_embeddings = model(torch.tensor(token_ids).unsqueeze(0).to(device))[0]
    # Return only the embedding for the [CLS] token
    return context_embeddings[0, 0].unsqueeze(0)

In [27]:
def rerank_results(search_results, query_text):
    """
    Rerank the search results using CodeBERT
    """
    reranked_results = []
    query_embedding = get_codebert_embedding(query_text, "")  # We're getting the embedding for just the query here

    for result in search_results:
        result_obj = json.loads(result.raw)
        cur_commit_message = reverse_tokenize(result_obj["contents"])
        result_embedding = get_codebert_embedding("", cur_commit_message)  # Getting embedding for each result
        score = torch.nn.functional.cosine_similarity(query_embedding, result_embedding, dim=-1)
        reranked_results.append((result, score.item()))

    # Sort by CodeBERT similarity score (from high to low)
    reranked_results.sort(key=lambda x: x[1], reverse=True)
    return [result[0] for result in reranked_results]  # Return results only (without scores)

In [32]:
codebert_reranked_results = rerank_results(search_results, query['commit_message'])

print("CodeBERT reranked results:")
print_search_results(query, codebert_reranked_results, showOnlyActualModified=True)

CodeBERT reranked results:
32 packages/react-reconciler/src/ReactFiberScheduler.js 9.77590 1533746573


In [33]:
evaluate(codebert_reranked_results, query['actual_files_modified'])

{'MAP': 0.0014,
 'P@10': 0.0,
 'P@100': 0.01,
 'P@1000': 0.001,
 'MRR': 0.0312,
 'Recall@1000': 1.0}