In [2]:
import sys
sys.path.append('../src')

In [117]:
import json
import os
import numpy as np
import pandas as pd
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader
from sklearn.metrics import average_precision_score, ndcg_score

from utils import count_commits, get_combined_df, tokenize, reverse_tokenize
from collections import defaultdict

In [122]:
class SearchResult:
    def __init__(self, commit_id, file_path, score, commit_date, commit_msg):
        self.commit_id = commit_id
        self.file_path = file_path
        self.score = score
        self.commit_date = commit_date
        self.commit_msg = commit_msg


    def __repr__(self):
        class_name = self.__class__.__name__
        # return f"{self.file_path} {self.score:.5f} {self.commit_date}"
        return f"{class_name}(score: {self.score:.5f}, file_path: {self.file_path!r}, commit_id: {self.commit_id!r}, commit_date: {self.commit_date})"

    def is_actual_modified(self, actual_modified_files):
        return self.file_path in actual_modified_files

    @staticmethod
    def print_results(query, search_results, show_only_actual_modified=False):
        actual_modified_files = query['actual_files_modified']
        for i, result in enumerate(search_results):
            if show_only_actual_modified and not result.is_actual_modified(actual_modified_files):
                continue
            print(f"{i+1:2} {result}")

In [123]:
class BM25Search:
    def __init__(self, index_path):
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index at {index_path} does not exist!")
        self.searcher = LuceneSearcher(index_path)
        print(f"Loaded index at {index_path}")
        print(f'Index Stats: {IndexReader(index_path).index_stats()}')
        # self.ranking_depth = ranking_depth

    def search(self, query, query_date, ranking_depth):
        # TODO maybe change this to mean returning reranking_depths total results instead of being pruned by the query date
        hits = self.searcher.search(tokenize(query), ranking_depth)
        unix_date = query_date
        filtered_hits = [
            SearchResult(hit.docid, json.loads(hit.raw)['file_path'], hit.score, int(json.loads(hit.raw)["commit_date"]), reverse_tokenize(json.loads(hit.raw)['contents']))
            for hit in hits if int(json.loads(hit.raw)["commit_date"]) < unix_date
        ]
        return filtered_hits

In [121]:
class SearchEvaluator:
    def __init__(self, metrics):
        self.metrics = metrics

    @staticmethod
    def precision_at_k(relevant, k):
        return sum(relevant[:k]) / k

    @staticmethod
    def mean_reciprocal_rank(relevant):
        for idx, value in enumerate(relevant):
            if value == 1:
                return 1 / (idx + 1)
        return 0

    def evaluate(self, search_results, actual_modified_files):
        retrieved_files = [result.file_path for result in search_results]
        relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]

        evaluations = {}
        for metric in self.metrics:
            if metric == 'MAP':
                evaluations[metric] = average_precision_score(relevant, [1]*len(relevant)) if any(relevant) else 0
            elif metric == 'MRR':
                evaluations[metric] = self.mean_reciprocal_rank(relevant)
            elif metric.startswith('P@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.precision_at_k(relevant, k)
            elif metric.startswith('Recall@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = len(
                    {
                        file
                        for idx, file in enumerate(retrieved_files)
                        if relevant[idx] == 1
                    }
                ) / len(actual_modified_files)

        return {k: round(v, 4) for k, v in evaluations.items()}

    def evaluate_file_based(self, search_results, actual_modified_files, aggregation_strategy='sump'):
        file_relevance = defaultdict(list)

        # Aggregate relevance scores for each file across all commits
        for result in search_results:
            if result.file_path in actual_modified_files:
                # file_relevance[result.file_path] += 1
                file_relevance[result.file_path].append(result.score)

        # Normalize relevance scores based on occurrences in actual modified files
        # max_relevance = max(file_relevance.values(), default=1)
        # normalized_relevance = {file: relevance / max_relevance for file, relevance in file_relevance.items()}
        # sorted_normalized_relevance = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)
        print(file_relevance)
        if aggregation_strategy == 'sump':
            aggregated_scores = {file: sum(relevance) for file, relevance in file_relevance.items()}
        elif aggregation_strategy == 'maxp':
            aggregated_scores = {file: max(relevance) for file, relevance in file_relevance.items()}
        elif aggregation_strategy == 'firstp':
            aggregated_scores = {file: relevance[0] for file, relevance in file_relevance.items()}
        elif aggregation_strategy == 'avgp':
            aggregated_scores = {file: np.mean(relevance) for file, relevance in file_relevance.items()}
        else:
            raise ValueError(f"Unknown aggregation strategy {aggregation_strategy}")

        sorted_aggregated_scores = sorted(aggregated_scores.items(), key=lambda item: item[1], reverse=True)

        print(sorted_aggregated_scores)

        evaluations = {}
        for metric in self.metrics:
            if metric.startswith('P@'):
                # Compute precision at k for files, not individual commit mentions
                k = int(metric.split('@')[1])
                # top_k_files = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)[:k]
                top_k_files = sorted_aggregated_scores[:k]
                precision_at_k = sum(1 for file, relevance in top_k_files if file in actual_modified_files) / k
                evaluations[metric] = precision_at_k
            elif metric.startswith('Recall@'):
                k = int(metric.split('@')[1])
                # top_k_files = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)[:k]
                top_k_files = sorted_aggregated_scores[:k]
                recall_at_k = sum(1 for file, relevance in top_k_files if file in actual_modified_files) / len(actual_modified_files)
                evaluations[metric] = recall_at_k
            elif metric == 'MAP':
                # Compute average precision for files, not individual commit mentions
                average_precision = 0
                num_relevant_files = 0
                for idx, (file, relevance) in enumerate(sorted_aggregated_scores):
                    if file in actual_modified_files:
                        num_relevant_files += 1
                        average_precision += num_relevant_files / (idx + 1)
                average_precision /= len(actual_modified_files)
                evaluations[metric] = average_precision
            elif metric == 'MRR':
                # Compute mean reciprocal rank for files, not individual commit mentions
                reciprocal_rank = 0
                for idx, (file, relevance) in enumerate(sorted_aggregated_scores):
                    if file in actual_modified_files:
                        reciprocal_rank = 1 / (idx + 1)
                        break
                evaluations[metric] = reciprocal_rank

        return {k: round(v, 4) for k, v in evaluations.items()}

In [23]:
index_path = '../smalldata/fbr/index_commit_tokenized'
repo_path = '../smalldata/fbr/'
K=1000

In [124]:
bm25_search = BM25Search(index_path)

In [14]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', f'Recall@{K}']

In [115]:
evaluator = SearchEvaluator(metrics)

In [97]:
class ModelEvaluator:
    def __init__(self, model, eval_model, combined_df, seed=42):
        self.model = model
        self.eval_model = eval_model
        self.combined_df = combined_df
        self.seed = seed

    def sample_commits(self, n):
        if self.combined_df.commit_id.nunique() < n:
            raise ValueError(f'Not enough commits to sample. Required: {n}, available: {self.combined_df.commit_id.nunique()}')
        return self.combined_df.drop_duplicates(subset='commit_id').sample(n=n, replace=False, random_state=self.seed)

    def evaluate_sampling(self, n=100, k=1000, output_dir='.', skip_existing=False, evaluation_strategy='commit', aggregation_strategy='sump'):
        model_name = self.model.__class__.__name__
        output_file = f"{output_dir}/{model_name}_metrics.txt"

        if skip_existing and os.path.exists(output_file):
            print(f'Output file {output_file} already exists, skipping...')
            return

        sampled_commits = self.sample_commits(n)

        results = []
        for _, row in sampled_commits.iterrows():
            search_results = self.model.search(row['commit_message'], row['commit_date'], ranking_depth=k)
            if evaluation_strategy == 'commit':
                evaluation = self.eval_model.evaluate(search_results,
                                                       self.combined_df[self.combined_df['commit_id'] == row['commit_id']]['file_path'].tolist())
            elif evaluation_strategy == 'file':
                evaluation = self.eval_model.evaluate_file_based(search_results,
                                                                  self.combined_df[self.combined_df['commit_id'] == row['commit_id']]['file_path'].tolist(), aggregation_strategy=aggregation_strategy)
            else:
                raise ValueError(f'Invalid evaluation strategy: {evaluation_strategy}')
            results.append(evaluation)

        avg_scores = {metric: round(np.mean([result[metric] for result in results]), 4) for metric in results[0]}

        os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
        with open(output_file, "w") as file:
            file.write(f"Model Name: {model_name}\n")
            file.write(f"Sample Size: {n}\n")
            file.write("Evaluation Metrics:\n")
            for key, value in avg_scores.items():
                file.write(f"{key}: {value}\n")

        return avg_scores

In [24]:
combined_df = get_combined_df(repo_path)

In [98]:
bm25_evaluator = ModelEvaluator(bm25_search, evaluator, combined_df)

In [99]:
bm25_evaluator.evaluate_sampling(n=100, k=K, output_dir='../tmp/')

{'MAP': 0.0427,
 'P@10': 0.079,
 'P@100': 0.0327,
 'P@1000': 0.0084,
 'MRR': 0.2676,
 'Recall@1000': 0.6351}

In [100]:
bm25_evaluator.evaluate_sampling(n=100, k=K, output_dir='../tmp/', evaluation_strategy='file')

{'MAP': 0.6351,
 'P@10': 0.242,
 'P@100': 0.0269,
 'P@1000': 0.0027,
 'MRR': 0.76,
 'Recall@1000': 0.6351}

In [110]:
# randomly sample a commit from the combined_df
random_commit = combined_df.sample(1).iloc[0]
random_commit

owner                                                             facebook
repo_name                                                            react
commit_date                                                     1556154069
commit_id                         64e3da286f2c1e66303d2ae1dc06276b2f866e9d
commit_message            Event API: Add `FocusScope` surface (#15487)\n\n
file_path                         packages/react-events/src/ReactEvents.js
cur_file_content         /**\n * Copyright (c) Facebook, Inc. and its a...
previous_commit_id                3f058debc29ccb05a47ac8a8d747c5a5b29a6ed3
previous_file_path                                                    <NA>
previous_file_content    /**\n * Copyright (c) Facebook, Inc. and its a...
diff                     @@ -10,8 +10,6 @@\n import {\n   REACT_EVENT_T...
status                                                            modified
is_merge_request                                                     False
file_extension           

In [134]:
# get search results for the random commit
search_results = bm25_search.search(random_commit['commit_message'], random_commit['commit_date'], ranking_depth=K)
search_results[:20]

[SearchResult(score: 13.67170, file_path: 'src/core/ReactCompositeComponent.js', commit_id: '8855d6153e252c735de0e6cc373787d22c1a467b', commit_date: 1397094653),
 SearchResult(score: 13.67170, file_path: 'src/core/ReactPropTypes.js', commit_id: '8855d6153e252c735de0e6cc373787d22c1a467b', commit_date: 1397094653),
 SearchResult(score: 13.67170, file_path: 'src/core/__tests__/ReactPropTypes-test.js', commit_id: '8855d6153e252c735de0e6cc373787d22c1a467b', commit_date: 1397094653),
 SearchResult(score: 11.86170, file_path: 'src/event/delegate/DelegateEvent.js', commit_id: '3eaed5a122a050a6a2852b9f5348685373a9a6f7', commit_date: 1370668112),
 SearchResult(score: 11.86170, file_path: 'src/event/delegate/DelegateFocusEvent.js', commit_id: '3eaed5a122a050a6a2852b9f5348685373a9a6f7', commit_date: 1370668112),
 SearchResult(score: 11.86170, file_path: 'src/event/delegate/DelegateKeyboardEvent.js', commit_id: '3eaed5a122a050a6a2852b9f5348685373a9a6f7', commit_date: 1370668112),
 SearchResult(scor

In [136]:
# evaluate the search results
evaluation = evaluator.evaluate(search_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
evaluation

{'MAP': 0.0546,
 'P@10': 0.0,
 'P@100': 0.05,
 'P@1000': 0.019,
 'MRR': 0.0119,
 'Recall@1000': 0.3478}

In [138]:
file_based_evaluation = evaluator.evaluate_file_based(search_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
file_based_evaluation

defaultdict(<class 'list'>, {'packages/react-dom/src/events/DOMEventResponderSystem.js': [9.576800346374512, 9.452381134033203, 8.900697708129883, 8.575098037719727], 'packages/shared/ReactTypes.js': [9.576796531677246, 9.452377319335938, 8.900694847106934, 8.575092315673828], 'packages/react-dom/src/events/__tests__/DOMEventResponderSystem-test.internal.js': [9.452380180358887, 8.900696754455566, 8.57509708404541], 'packages/react-reconciler/src/ReactFiberCompleteWork.js': [8.90069580078125, 8.575094223022461, 8.168195724487305], 'packages/react-events/src/ReactEvents.js': [8.575096130371094], 'packages/shared/ReactSymbols.js': [8.575093269348145, 8.168185234069824], 'packages/shared/getComponentName.js': [8.575091361999512], 'scripts/rollup/bundles.js': [7.981998920440674]})
[('packages/react-dom/src/events/DOMEventResponderSystem.js', 36.504977226257324), ('packages/shared/ReactTypes.js', 36.504961013793945), ('packages/react-dom/src/events/__tests__/DOMEventResponderSystem-test.int

{'MAP': 0.3478,
 'P@10': 0.8,
 'P@100': 0.08,
 'P@1000': 0.008,
 'MRR': 1.0,
 'Recall@1000': 0.3478}

In [116]:
# iterate over the list of aggregation strategies and evaluate each one

aggregation_strategies = ['sump', 'maxp', 'firstp', 'avgp']
for strategy in aggregation_strategies:
    file_based_evaluation = evaluator.evaluate_file_based(search_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist(), aggregation_strategy=strategy)
    print(f"{strategy}: {file_based_evaluation}")

defaultdict(<class 'list'>, {'packages/react-dom/src/events/DOMEventResponderSystem.js': [9.576800346374512, 9.452381134033203, 8.900697708129883, 8.575098037719727], 'packages/shared/ReactTypes.js': [9.576796531677246, 9.452377319335938, 8.900694847106934, 8.575092315673828], 'packages/react-dom/src/events/__tests__/DOMEventResponderSystem-test.internal.js': [9.452380180358887, 8.900696754455566, 8.57509708404541], 'packages/react-reconciler/src/ReactFiberCompleteWork.js': [8.90069580078125, 8.575094223022461, 8.168195724487305], 'packages/react-events/src/ReactEvents.js': [8.575096130371094], 'packages/shared/ReactSymbols.js': [8.575093269348145, 8.168185234069824], 'packages/shared/getComponentName.js': [8.575091361999512], 'scripts/rollup/bundles.js': [7.981998920440674]})
[('packages/react-dom/src/events/DOMEventResponderSystem.js', 36.504977226257324), ('packages/shared/ReactTypes.js', 36.504961013793945), ('packages/react-dom/src/events/__tests__/DOMEventResponderSystem-test.int

In [109]:
# do the same for the model evaluator
for strategy in aggregation_strategies:
    model_evaluation = bm25_evaluator.evaluate_sampling(n=1000, k=K, output_dir='../tmp/', evaluation_strategy='file', aggregation_strategy=strategy)
    print(f"{strategy}: {model_evaluation}")

sump: {'MAP': 0.5755, 'P@10': 0.1981, 'P@100': 0.0237, 'P@1000': 0.0025, 'MRR': 0.698, 'Recall@1000': 0.5755}
maxp: {'MAP': 0.5755, 'P@10': 0.1981, 'P@100': 0.0237, 'P@1000': 0.0025, 'MRR': 0.698, 'Recall@1000': 0.5755}
firstp: {'MAP': 0.5755, 'P@10': 0.1981, 'P@100': 0.0237, 'P@1000': 0.0025, 'MRR': 0.698, 'Recall@1000': 0.5755}
avgp: {'MAP': 0.5755, 'P@10': 0.1981, 'P@100': 0.0237, 'P@1000': 0.0025, 'MRR': 0.698, 'Recall@1000': 0.5755}


In [127]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from collections import defaultdict

class BERTReRanker:
    """
    A class for performing reranking with a BERT-based model.
    """
    def __init__(self, model_name, psgLen=128, psgStride=64, psgCnt=None, scoreAggregation='maxp', batchSize=8):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.model.eval()  # Set model to evaluation mode

        # Passage handling parameters
        self.psgLen = psgLen
        self.psgStride = psgStride
        self.psgCnt = psgCnt
        self.scoreAggregation = scoreAggregation

        if self.scoreAggregation == 'firstp':
            self.psgCnt = 1

        self.batchSize = batchSize

    def rerank(self, query, search_results):
        """
        Rerank the search results using the BERT model.

        query: The query string.
        search_results: A list of SearchResult objects.
        """
        reranked_results = []
        # todo - add batching

        # Process each SearchResult to create input for BERT
        for result in search_results:
            passages = self._split_into_passages(query, result.commit_msg)

            # Score each passage with BERT
            passage_scores = [self._score_passage(query, passage) for passage in passages]

            # Aggregate passage scores to get a single document score
            doc_score = self._aggregate_scores(passage_scores)

            # Create a new SearchResult with the updated score
            reranked_results.append((doc_score, result))

        # Sort reranked results by the new score
        reranked_results.sort(key=lambda x: x[0], reverse=True)
        return [result for _, result in reranked_results]

    def _split_into_passages(self, query, commit_msg):
        # Tokenize the query and commit message
        tokens = self.tokenizer.tokenize(query) + self.tokenizer.tokenize(commit_msg)

        # Split the tokens into passages
        passages = []
        for i in range(0, len(tokens), self.psgStride):
            passage = tokens[i:i+self.psgLen]
            passages.append(self.tokenizer.convert_tokens_to_string(passage))
            if self.psgCnt and len(passages) >= self.psgCnt:
                break
        return passages

    def _score_passage(self, query, passage):
        # Encode query and passage for BERT
        inputs = self.tokenizer.encode_plus(
            query,
            passage,
            add_special_tokens=True,
            max_length=self.psgLen,
            truncation="only_second",
            return_tensors="pt"
        )

        # Score the (query, passage) pair with BERT
        with torch.no_grad():
            outputs = self.model(**inputs)
            score = outputs.logits.squeeze().item()
        return score

    def _aggregate_scores(self, passage_scores):
        # Aggregate passage scores based on the specified strategy
        if self.scoreAggregation == 'firstp':
            return passage_scores[0]
        elif self.scoreAggregation == 'maxp':
            return max(passage_scores)
        elif self.scoreAggregation == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        elif self.scoreAggregation == 'sump':
            return sum(passage_scores)
        else:
            raise ValueError(f"Invalid score aggregation method: {self.scoreAggregation}")

In [128]:
# Assuming bm25_searcher is an instance of BM25Search
query = "Fix login page error"
query_date = 1699261235
bm25_results = bm25_search.search(query, query_date, K)

# Now rerank those results with BERT
bert_reranker = BERTReRanker(model_name="bert-base-uncased")
reranked_results = bert_reranker.rerank(query, bm25_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (791 > 512). Running this sequence through the model will result in indexing errors


In [130]:
# Print the top 10 results
reranked_results[:10]

[SearchResult(score: 2.69137, file_path: 'packages/react-reconciler/src/ReactFiberBeginWork.new.js', commit_id: '1faf9e3dd5d6492f3607d5c721055819e4106bc6', commit_date: 1601495853),
 SearchResult(score: 2.69136, file_path: 'packages/react-reconciler/src/ReactFiberBeginWork.old.js', commit_id: '1faf9e3dd5d6492f3607d5c721055819e4106bc6', commit_date: 1601495853),
 SearchResult(score: 2.69136, file_path: 'packages/react-reconciler/src/ReactFiberSuspenseComponent.new.js', commit_id: '1faf9e3dd5d6492f3607d5c721055819e4106bc6', commit_date: 1601495853),
 SearchResult(score: 2.69136, file_path: 'packages/react-reconciler/src/ReactFiberSuspenseComponent.old.js', commit_id: '1faf9e3dd5d6492f3607d5c721055819e4106bc6', commit_date: 1601495853),
 SearchResult(score: 2.69136, file_path: 'packages/react-reconciler/src/__tests__/ReactCPUSuspense-test.js', commit_id: '1faf9e3dd5d6492f3607d5c721055819e4106bc6', commit_date: 1601495853),
 SearchResult(score: 3.26617, file_path: 'packages/react-dom-bindi

In [131]:
# evaluate the reranked results
evaluation = evaluator.evaluate(reranked_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
evaluation

{'MAP': 0.016,
 'P@10': 0.0,
 'P@100': 0.0,
 'P@1000': 0.016,
 'MRR': 0.0031,
 'Recall@1000': 0.3913}

In [137]:
evaluation = evaluator.evaluate_file_based(reranked_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
evaluation

defaultdict(<class 'list'>, {'packages/react-reconciler/src/ReactFiberCompleteWork.js': [2.718986988067627, 3.8634390830993652, 3.863529920578003, 4.273975372314453], 'packages/shared/ReactSymbols.js': [3.5164780616760254], 'packages/shared/ReactTypes.js': [3.6565749645233154, 3.8633739948272705, 3.863521099090576], 'scripts/rollup/bundles.js': [4.612883567810059, 5.564578056335449, 2.7962939739227295], 'packages/react-dom/src/shared/assertValidProps.js': [5.564676284790039], 'packages/react-dom/src/events/DOMEventResponderSystem.js': [3.8635449409484863], 'packages/react-dom/src/events/__tests__/DOMEventResponderSystem-test.internal.js': [3.86354398727417], 'packages/react-events/src/FocusScope.js': [3.863539934158325], 'packages/shared/getComponentName.js': [3.8635189533233643]})
[('packages/react-reconciler/src/ReactFiberCompleteWork.js', 14.719931364059448), ('scripts/rollup/bundles.js', 12.973755598068237), ('packages/shared/ReactTypes.js', 11.383470058441162), ('packages/react-do

{'MAP': 0.3913,
 'P@10': 0.9,
 'P@100': 0.09,
 'P@1000': 0.009,
 'MRR': 1.0,
 'Recall@1000': 0.3913}

In [None]:
{'MAP': 0.0546,
 'P@10': 0.0,
 'P@100': 0.05,
 'P@1000': 0.019,
 'MRR': 0.0119,
 'Recall@1000': 0.3478}

{'MAP': 0.3478,
 'P@10': 0.8,
 'P@100': 0.08,
 'P@1000': 0.008,
 'MRR': 1.0,
 'Recall@1000': 0.3478}