In [2]:
import sys
sys.path.append('../src')

In [74]:
import json
import os
import numpy as np
import pandas as pd
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader
from sklearn.metrics import average_precision_score, ndcg_score

from utils import count_commits, get_combined_df, tokenize
from collections import defaultdict

In [65]:
class SearchResult:
    def __init__(self, commit_id, file_path, score, commit_date):
        self.commit_id = commit_id
        self.file_path = file_path
        self.score = score
        self.commit_date = commit_date


    def __repr__(self):
        class_name = self.__class__.__name__
        # return f"{self.file_path} {self.score:.5f} {self.commit_date}"
        return f"{class_name}(score: {self.score:.5f}, file_path: {self.file_path!r}, commit_id: {self.commit_id!r}, commit_date: {self.commit_date})"

    def is_actual_modified(self, actual_modified_files):
        return self.file_path in actual_modified_files

    @staticmethod
    def print_results(query, search_results, show_only_actual_modified=False):
        actual_modified_files = query['actual_files_modified']
        for i, result in enumerate(search_results):
            if show_only_actual_modified and not result.is_actual_modified(actual_modified_files):
                continue
            print(f"{i+1:2} {result}")

In [59]:
class BM25Search:
    def __init__(self, index_path):
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index at {index_path} does not exist!")
        self.searcher = LuceneSearcher(index_path)
        # self.ranking_depth = ranking_depth

    def search(self, query, query_date, ranking_depth):
        # TODO maybe change this to mean returning reranking_depths total results instead of being pruned by the query date
        hits = self.searcher.search(tokenize(query), ranking_depth)
        unix_date = query_date
        filtered_hits = [
            SearchResult(hit.docid, json.loads(hit.raw)['file_path'], hit.score, int(json.loads(hit.raw)["commit_date"]))
            for hit in hits if int(json.loads(hit.raw)["commit_date"]) < unix_date
        ]
        return filtered_hits

In [85]:


class SearchEvaluator:
    def __init__(self, metrics):
        self.metrics = metrics

    @staticmethod
    def precision_at_k(relevant, k):
        return sum(relevant[:k]) / k

    @staticmethod
    def mean_reciprocal_rank(relevant):
        for idx, value in enumerate(relevant):
            if value == 1:
                return 1 / (idx + 1)
        return 0

    def evaluate(self, search_results, actual_modified_files):
        retrieved_files = [result.file_path for result in search_results]
        relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]

        evaluations = {}
        for metric in self.metrics:
            if metric == 'MAP':
                evaluations[metric] = average_precision_score(relevant, [1]*len(relevant)) if any(relevant) else 0
            elif metric == 'MRR':
                evaluations[metric] = self.mean_reciprocal_rank(relevant)
            elif metric.startswith('P@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.precision_at_k(relevant, k)
            elif metric.startswith('Recall@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = len(
                    {
                        file
                        for idx, file in enumerate(retrieved_files)
                        if relevant[idx] == 1
                    }
                ) / len(actual_modified_files)

        return {k: round(v, 4) for k, v in evaluations.items()}

    def evaluate_file_based(self, search_results, actual_modified_files):
        file_relevance = defaultdict(int)

        # Aggregate relevance scores for each file across all commits
        for result in search_results:
            if result.file_path in actual_modified_files:
                file_relevance[result.file_path] += 1

        # Normalize relevance scores based on occurrences in actual modified files
        max_relevance = max(file_relevance.values(), default=1)
        normalized_relevance = {file: relevance / max_relevance for file, relevance in file_relevance.items()}
        sorted_normalized_relevance = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)

        evaluations = {}
        for metric in self.metrics:
            if metric.startswith('P@'):
                # Compute precision at k for files, not individual commit mentions
                k = int(metric.split('@')[1])
                # top_k_files = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)[:k]
                top_k_files = sorted_normalized_relevance[:k]
                precision_at_k = sum(1 for file, relevance in top_k_files if file in actual_modified_files) / k
                evaluations[metric] = precision_at_k
            elif metric.startswith('Recall@'):
                k = int(metric.split('@')[1])
                # top_k_files = sorted(normalized_relevance.items(), key=lambda item: item[1], reverse=True)[:k]
                top_k_files = sorted_normalized_relevance[:k]
                recall_at_k = sum(1 for file, relevance in top_k_files if file in actual_modified_files) / len(actual_modified_files)
                evaluations[metric] = recall_at_k
            elif metric == 'MAP':
                # Compute average precision for files, not individual commit mentions
                average_precision = 0
                num_relevant_files = 0
                for idx, (file, relevance) in enumerate(sorted_normalized_relevance):
                    if file in actual_modified_files:
                        num_relevant_files += 1
                        average_precision += num_relevant_files / (idx + 1)
                average_precision /= len(actual_modified_files)
                evaluations[metric] = average_precision
            elif metric == 'MRR':
                # Compute mean reciprocal rank for files, not individual commit mentions
                reciprocal_rank = 0
                for idx, (file, relevance) in enumerate(sorted_normalized_relevance):
                    if file in actual_modified_files:
                        reciprocal_rank = 1 / (idx + 1)
                        break
                evaluations[metric] = reciprocal_rank

        return {k: round(v, 4) for k, v in evaluations.items()}

In [23]:
index_path = '../smalldata/fbr/index_commit_tokenized'
repo_path = '../smalldata/fbr/'
K=1000

In [66]:
bm25_search = BM25Search(index_path)

In [14]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', f'Recall@{K}']

In [86]:
evaluator = SearchEvaluator(metrics)

In [80]:
class ModelEvaluator:
    def __init__(self, model, eval_model, combined_df, seed=42):
        self.model = model
        self.eval_model = eval_model
        self.combined_df = combined_df
        self.seed = seed

    def sample_commits(self, n):
        if self.combined_df.commit_id.nunique() < n:
            raise ValueError(f'Not enough commits to sample. Required: {n}, available: {self.combined_df.commit_id.nunique()}')
        return self.combined_df.drop_duplicates(subset='commit_id').sample(n=n, replace=False, random_state=self.seed)

    def evaluate_sampling(self, n=100, k=1000, output_dir='.', skip_existing=False, evaluation_strategy='commit'):
        model_name = self.model.__class__.__name__
        output_file = f"{output_dir}/{model_name}_metrics.txt"

        if skip_existing and os.path.exists(output_file):
            print(f'Output file {output_file} already exists, skipping...')
            return

        sampled_commits = self.sample_commits(n)

        results = []
        for _, row in sampled_commits.iterrows():
            search_results = self.model.search(row['commit_message'], row['commit_date'], ranking_depth=k)
            if evaluation_strategy == 'commit':
                evaluation = self.eval_model.evaluate(search_results,
                                                       self.combined_df[self.combined_df['commit_id'] == row['commit_id']]['file_path'].tolist())
            elif evaluation_strategy == 'file':
                evaluation = self.eval_model.evaluate_file_based(search_results,
                                                                  self.combined_df[self.combined_df['commit_id'] == row['commit_id']]['file_path'].tolist())
            else:
                raise ValueError(f'Invalid evaluation strategy: {evaluation_strategy}')
            results.append(evaluation)

        avg_scores = {metric: round(np.mean([result[metric] for result in results]), 4) for metric in results[0]}

        os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
        with open(output_file, "w") as file:
            file.write(f"Model Name: {model_name}\n")
            file.write(f"Sample Size: {n}\n")
            file.write("Evaluation Metrics:\n")
            for key, value in avg_scores.items():
                file.write(f"{key}: {value}\n")

        return avg_scores

In [24]:
combined_df = get_combined_df(repo_path)

In [89]:
bm25_evaluator = ModelEvaluator(bm25_search, evaluator, combined_df)

In [33]:
bm25_evaluator.evaluate_sampling(n=100, k=K, output_dir='../tmp/')

{'MAP': 0.0427,
 'P@10': 0.079,
 'P@100': 0.0327,
 'P@1000': 0.0084,
 'MRR': 0.2676,
 'Recall@1000': 0.6351}

In [82]:
bm25_evaluator.evaluate_sampling(n=100, k=K, output_dir='../tmp/')

{'MAP': 0.0427,
 'P@10': 0.079,
 'P@100': 0.0327,
 'P@1000': 0.0084,
 'MRR': 0.2676,
 'Recall@1000': 0.6351}

In [90]:
bm25_evaluator.evaluate_sampling(n=100, k=K, output_dir='../tmp/', evaluation_strategy='file')

{'MAP': 0.6351,
 'P@10': 0.242,
 'P@100': 0.0269,
 'P@1000': 0.0027,
 'MRR': 0.76,
 'Recall@1000': 0.6351}

In [35]:
# randomly sample a commit from the combined_df
random_commit = combined_df.sample(1).iloc[0]
random_commit

owner                                                             facebook
repo_name                                                            react
commit_date                                                     1482357845
commit_id                         a27e4f3361caf6461ef51a71855903578604ace0
commit_message           [Fiber] Make requestIdleCallback() and request...
file_path                                      scripts/jest/environment.js
cur_file_content         /* eslint-disable */\nglobal.__DEV__ = true;\n...
previous_commit_id                2a5fe4c2b021ad9a67b904aceed6bb10fe160a79
previous_file_path                                                    <NA>
previous_file_content    /* eslint-disable */\nglobal.__DEV__ = true;\n...
diff                     @@ -1,11 +1,13 @@\n /* eslint-disable */\n glo...
status                                                            modified
is_merge_request                                                     False
file_extension           

In [91]:
# get search results for the random commit
search_results = bm25_search.search(random_commit['commit_message'], random_commit['commit_date'], ranking_depth=K)
search_results[:20]

[SearchResult(score: 93.78740, file_path: 'src/renderers/art/ReactARTFiber.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', commit_date: 1481231447),
 SearchResult(score: 93.78740, file_path: 'src/renderers/dom/fiber/ReactDOMFiber.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', commit_date: 1481231447),
 SearchResult(score: 93.78740, file_path: 'src/renderers/dom/fiber/ReactDOMFiberComponent.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', commit_date: 1481231447),
 SearchResult(score: 93.78740, file_path: 'src/renderers/dom/fiber/__tests__/ReactDOMFiber-test.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', commit_date: 1481231447),
 SearchResult(score: 93.78739, file_path: 'src/renderers/dom/shared/__tests__/ReactDOMSVG-test.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', commit_date: 1481231447),
 SearchResult(score: 93.78739, file_path: 'src/renderers/noop/ReactNoop.js', commit_id: 'c87ffc0bebaabe69dfbd7b385480da614c5dc0da', c

In [92]:
# evaluate the search results
evaluation = evaluator.evaluate(search_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
evaluation

{'MAP': 0.0253,
 'P@10': 0.1,
 'P@100': 0.02,
 'P@1000': 0.002,
 'MRR': 0.25,
 'Recall@1000': 0.5}

In [87]:
file_based_evaluation = evaluator.evaluate_file_based(search_results, combined_df[combined_df['commit_id'] == random_commit['commit_id']]['file_path'].tolist())
file_based_evaluation

{'MAP': 0.5,
 'P@10': 0.1,
 'P@100': 0.01,
 'P@1000': 0.001,
 'MRR': 1.0,
 'Recall@1000': 0.5}