Goal is, given an input ranking of AggregatedCodeResult

# Changes
1. Eval code files must be split on the same function as the original train splits (need a Split Class)
2. So your eval list will contain code snippets instead of files

## before 
actual = set(f2,f4,f8)
predictions = [f1, f4, f2, f7, f8]


## now
actual = set(p2.1, p2.2, p2.3, p4.1, p4.2, p8.1)
precictions = [p1.1, p7.8, p8.3, p4.1, p2.3, ....)

In [1]:
import sys
sys.path.append('../src')
sys.path.append('src')

In [2]:
import re
import pickle
from typing import List
from utils import AggregatedSearchResult, get_combined_df, full_tokenize
from bm25_v2 import BM25Searcher
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import pandas as pd
import numpy as np
from BERTReranker_v4 import BERTReranker
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
# from CodeReranker import BERTCodeReranker

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
class SearchEvaluator:
    def __init__(self, metrics):
        self.metrics = metrics

    @staticmethod
    def precision_at_k(relevant, k):
        return sum(relevant[:k]) / k

    @staticmethod
    def mean_reciprocal_rank(relevant):
        for idx, value in enumerate(relevant):
            if value == 1:
                return 1 / (idx + 1)
        return 0

    @staticmethod
    def calculate_average_precision(relevant):
        pred_rel = [1] * len(relevant)
        relevant_documents_count = 0
        cumulative_precision = 0.0

        # We iterate through the predicted relevance scores
        for i in range(len(pred_rel)):
            # Check if the prediction at this rank is correct (i.e., if it is a relevant document)
            if pred_rel[i] == 1 and relevant[i] == 1:
                relevant_documents_count += 1
                precision_at_i = relevant_documents_count / (i + 1)
                cumulative_precision += precision_at_i

        # The average precision is the cumulative precision divided by the number of relevant documents
        average_precision = cumulative_precision / sum(relevant) if sum(relevant) > 0 else 0
        return average_precision

    # @staticmethod
    # def calculate_recall(relevant, total_modified_files, k):
    #   # Does not work for commit based approach as it can have multiple mentions of the same file across commits leading to a higher than 1 recall
    #     print(total_modified_files)
    #     print(relevant)
    #     return sum(relevant[:k]) / total_modified_files

    @staticmethod
    def calculate_recall(retrieved_files, actual_modified, relevant, k):
        # this complicated mess is required as compared to the above much simpler code to support both commit-based and file-based approaches
        # in file-based approach, this is equivalent to the above code
        # in code-based approach, duplicates could be present in retrieved_files, which is why we need to filter them out (the above code would not work in this case)

        return len({file for idx, file in enumerate(retrieved_files[:k])
                        if relevant[idx] == 1
                    }) / len(actual_modified) if len(actual_modified) > 0 else 0


    def evaluate(self, search_results, actual_modified, eval_type='file'):
        if eval_type == 'patch':
            retrieved = [result.passage for result in search_results]
        else:
            retrieved = [result.file_path for result in search_results]
        relevant = [1 if file in actual_modified else 0 for file in retrieved]

        evaluations = {}
        for metric in self.metrics:
            if metric == 'MAP':
                evaluations[metric] = self.calculate_average_precision(relevant)
            elif metric == 'MRR':
                evaluations[metric] = self.mean_reciprocal_rank(relevant)
            elif metric.startswith('P@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.precision_at_k(relevant, k)
            elif metric.startswith('R@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.calculate_recall(retrieved, actual_modified, relevant, k)

        return {k: round(v, 4) for k, v in evaluations.items()}

In [5]:
class ModelEvaluator:
    def __init__(self, model, eval_model, combined_df, seed=42):
        self.model = model
        self.eval_model = eval_model
        self.combined_df = combined_df
        self.seed = seed

    def sample_commits(self, n):
        if self.combined_df.commit_id.nunique() < n:
            raise ValueError(f'Not enough commits to sample. Required: {n}, available: {self.combined_df.commit_id.nunique()}')

        midpoint_date = np.median(self.combined_df['commit_date'])
        recent_df = self.combined_df[self.combined_df['commit_date'] > midpoint_date]

        return recent_df.drop_duplicates(subset='commit_id').sample(n=n, replace=False, random_state=self.seed)

    def evaluate_df(self, df, k=1000, aggregation_strategy=None, rerankers=None):
        results = []
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            cur_query = row['commit_message']
            search_results = self.model.pipeline(cur_query, row['commit_date'], ranking_depth=k, aggregation_method=aggregation_strategy)
            for reranker in rerankers:
                if reranker.__class__.__name__ == 'BERTCodeReranker':
                    search_results = reranker.rerank_pipeline(cur_query, search_results, row['commit_id'])
                else:
                    search_results = reranker.rerank_pipeline(cur_query, search_results)


            if 'actual_modified_files' in df.columns:
                actual_modified_files = row['actual_modified_files']
            else:
                actual_modified_files = self.combined_df[self.combined_df['commit_id'] == row['commit_id']]['file_path'].tolist()
            evaluation = self.eval_model.evaluate(search_results, actual_modified_files)
            results.append(evaluation)
        return results

    def evaluate_sampling(self, n=100, k=1000, output_file_path=None, overwrite_eval=False, aggregation_strategy=None, rerankers=None, gold_df=None): #, repo_path=None):
        # if repo_path is None:
        #     print("Repo path not provided, using current working directory")
            # repo_path = os.getcwd()
        if rerankers is None:
            rerankers = []

        if output_file_path is None:
            print("WARNING: Output file path not provided, not writing results to file")
            # output_file_path = os.path.join(repo_path, f'{self.model.__class__.__name__}_results.txt')

        # output_file_path = os.path.join(repo_path, output_file)
        model_name = self.model.__class__.__name__

        if not overwrite_eval and output_file_path and os.path.exists(output_file_path):
            print(f'Output file {output_file_path} already exists - not writing to file, set overwrite_eval flag to True for that...')
            # print the contents of the file
            # with open(output_file_path, "r") as file:
            #     print(file.read())
            # return
            output_file_path=None

        if gold_df is None:
            sampled_commits = self.sample_commits(n)
            results = self.evaluate_df(sampled_commits, k, aggregation_strategy, rerankers)
        else:
            print(f'Found gold_df, evaluating on {len(gold_df)} commits')
            print(gold_df.info())
            results = self.evaluate_df(gold_df, k, aggregation_strategy, rerankers)

        avg_scores = {metric: round(np.mean([result[metric] for result in results]), 4) for metric in results[0]}

        if output_file_path is not None:
            with open(output_file_path, "w") as file:
                file.write(f"Model Name: {model_name}\n")
                # write name of each reranker
                if len(rerankers) > 0:
                    file.write("Rerankers:\n")
                    for reranker in rerankers:
                        reranker_model_name = reranker.model.config.name_or_path
                        # replace / with _
                        reranker_model_name = reranker_model_name.replace('/', '_')
                        file.write(f"{reranker.__class__.__name__} ({reranker_model_name}) @ {reranker.rerank_depth}\n")


                file.write(f"Sample Size: {n}\n")
                file.write("Evaluation Metrics:\n")
                for key, value in avg_scores.items():
                    file.write(f"{key}: {value}\n")

            print(f'Evaluation results written to {output_file_path}')

        return avg_scores

In [6]:
class Reranker:
    def __init__(self, parameters):
        self.parameters = parameters

        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        print(f'Using device: {self.device}')
        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")
        self.aggregation_strategy = parameters['aggregation_strategy'] # how to aggregate the scores of the psg_cnt contributing_results
        self.batch_size = parameters['batch_size'] # batch size for reranking efficiently
        self.rerank_depth = parameters['rerank_depth']

    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        raise NotImplementedError

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        # else:
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def get_scores(self, dataloader, model):
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to GPU
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get scores from the model
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

In [7]:
class BERTCodeReranker:
    def __init__(self, parameters, combined_df):
        self.combined_df = combined_df
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


        self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt'] # how many contributing_results to use per file for reranking
        self.psg_stride = parameters.get('psg_stride', self.psg_len)
        self.aggregation_strategy = parameters['aggregation_strategy'] # how to aggregate the scores of the psg_cnt contributing_results
        self.batch_size = parameters['batch_size'] # batch size for reranking efficiently
        self.rerank_depth = parameters['rerank_depth']
        self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model

        print(f"Initialized Code File BERT reranker with parameters: {parameters}")


    def rerank(self, query, aggregated_results: List[AggregatedSearchResult], train_commit_id):
        """
        Rerank the BM25 aggregated search results using BERT model scores.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
        """
        # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
        # print(f'Reranking {len(aggregated_results)} results')

        self.model.eval()

        query_passage_pairs, per_result_contribution = self.split_into_query_passage_pairs(query, aggregated_results, train_commit_id)


        # for agg_result in aggregated_results:
        #     query_passage_pairs.extend(
        #         (query, result.commit_message)
        #         for result in agg_result.contributing_results[: self.psg_cnt]
        #     )

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # Create a dataloader for feeding the data to the model
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False) # shuffle=False very important for reconstructing the results back into the original order

        scores = self.get_scores(dataloader, self.model)

        score_index = 0
        # Now assign the scores to the aggregated results by mapping the scores to the contributing results
        for i, agg_result in enumerate(aggregated_results):
            # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
            assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
            end_index = score_index + per_result_contribution[i] # only use psg_cnt contributing_results
            cur_passage_scores = scores[score_index:end_index]
            score_index = end_index


            # Aggregate the scores for the current aggregated result
            agg_score = self.aggregate_scores(cur_passage_scores)
            agg_result.score = agg_score  # Assign the aggregated score

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to GPU
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get scores from the model
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        # else:
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def split_into_query_passage_pairs(self, query, aggregated_results, train_commit_id):
        # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
        def full_tokenize(s):
            return self.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()
        query_passage_pairs = []
        per_result_contribution = []
        if self.combined_df is not None:
            combined_df = self.combined_df


        for agg_result in aggregated_results:
            # agg_result.contributing_results.sort(key=lambda res: res.commit_date, reverse=True)
            # get most recent file version
            most_recent_search_result = agg_result.contributing_results[0]
            # get the file_path and commit_id
            file_path = most_recent_search_result.file_path
            # commit_id = most_recent_search_result.commit_id
            # get the file content from combined_df
            # file_content = combined_df[(combined_df['commit_id'] == commit_id) & (combined_df['file_path'] == file_path)]['cur_file_content'].values[0]

            file_content = get_file_at_commit_from_git(file_path, train_commit_id)

            # file_content = combined_df[(combined_df['commit_id'] == commit_id) & (combined_df['file_path'] == file_path)]['previous_file_content'].values[0]

            # now need to split this file content into psg_cnt passages
            # first tokenize the file content

            # warning these asserts are useless since we are using NaNs
            # assert file_content is not None, f'file_content is None for commit_id: {commit_id}, file_path: {file_path}'
            # assert file_path is not None, f'file_path is None for commit_id: {commit_id}'
            assert query is not None, 'query is None'

            # query_tokens = full_tokenize(query)
            path_tokens = full_tokenize(file_path)

            if pd.isna(file_content):
                # if file_content is NaN, then we can just set file_content to empty string
                print(f'WARNING: file_content is NaN for commit_id: {train}, file_path: {file_path}, setting file_content to empty string')
                file_content = ''

            file_tokens = full_tokenize(file_content)


            # now split the file content into psg_cnt passages
            cur_result_passages = []
            # get the input ids
            # input_ids = file_content['input_ids'].squeeze()
            # get the number of tokens in the file content
            total_tokens = len(file_tokens)

            for cur_start in range(0, total_tokens, self.psg_stride):
                cur_passage = []
                # add query tokens and path tokens
                # cur_passage.extend(query_tokens)
                cur_passage.extend(path_tokens)

                # add the file tokens
                cur_passage.extend(file_tokens[cur_start:cur_start+self.psg_len])

                # now convert cur_passage into a string
                cur_passage_decoded = self.tokenizer.decode(cur_passage)

                # add the cur_passage to cur_result_passages
                cur_result_passages.append(cur_passage_decoded)

                # if len(cur_result_passages) == self.psg_cnt:
                #     break

            # now add the query, passage pairs to query_passage_pairs
            per_result_contribution.append(len(cur_result_passages))
            query_passage_pairs.extend((query, passage) for passage in cur_result_passages)
        return query_passage_pairs, per_result_contribution

    def rerank_pipeline(self, query, aggregated_results, train_commit_id):
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results, train_commit_id)
        min_top_score = reranked_results[-1].score
        # now adjust the scores of bottom_results
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        # combine the results
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

# Loading some dummy data

In [8]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    index_path='../data/2_7/facebook_react/index_commit_tokenized',
    repo_path='../data/2_7/facebook_react', k=1000, n=100,
    model_path='microsoft/codebert-base', overwrite_cache=False,
    batch_size=32, num_epochs=10, learning_rate=5e-05,
    run_name='debug',
    notes='debug (ignore)',
    num_positives=10, num_negatives=10, train_depth=1000, num_workers=8,
    train_commits=1000, psg_cnt=25, use_gpu=True,
    rerank_depth=100, do_train=True, do_eval=True, eval_gold=True, openai_model='gpt4',
    overwrite_eval=False, sanity_check=True, debug=False,
    psg_len=350, psg_stride=250, ignore_gold_in_training=False,
    eval_folder='repr_0.1663', use_gpt_train=True,
    aggregation_strategy='sump',
    bert_best_model='../data/combined_commit_train/best_model',
    best_model_path='../data/2_7/facebook_react/models/bce/best_model'

)

metrics =['MAP', 'P@1', 'P@10', 'P@20', 'P@30', 'MRR', 'R@1', 'R@10', 'R@100', 'R@1000']
repo_path = args.repo_path
repo_name = repo_path.split('/')[-1]
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'
eval_path = os.path.join(repo_path, 'eval')
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

test_path = os.path.join('..', 'gold', 'facebook_react', 'v2_facebook_react_gpt4_gold.parquet')
# test_path = os.path.join('gold', 'facebook_react', 'v2_facebook_react_gpt4_gold.parquet')
gold_df = pd.read_parquet(test_path)
gold_df = gold_df.rename(columns={'commit_message': 'original_message', f'transformed_message_{args.openai_model}': 'commit_message'})

Loaded index at ../data/2_7/facebook_react/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [9]:
import git

In [10]:
local_path = '../repos/facebook_react'
repo = git.Repo(local_path)

In [11]:
code_reranker_params = {
        'model_name': args.model_path,
        'aggregation_strategy': 'maxp',
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': args.rerank_depth,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': BM25_AGGR_STRAT,
        'psg_len': args.psg_len,
        'psg_stride': args.psg_stride,
        'psg_cnt': args.psg_cnt,
    }

bert_params = {
        'model_name': args.model_path,
        'psg_cnt': 5,
        'aggregation_strategy': 'sump',
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': 250,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': 'sump',
    }

In [12]:
dummy_train_row = gold_df.iloc[6]
dummy_commit_id = dummy_train_row.commit_id
dummy_train_row

commit_id                         323efbc33c27a602a4aab8519f58feba1e0a216c
commit_date                                                     1512398372
original_message         Ensure value and defaultValue do not assign fu...
actual_files_modified    [packages/react-dom/src/__tests__/ReactDOMInpu...
commit_message           Input properties 'value' and 'defaultValue' ac...
Name: 6, dtype: object

In [13]:
dummy_train_query = dummy_train_row.commit_message
dummy_file_path_list = dummy_train_row.actual_files_modified

In [14]:
dummy_file_path_list

array(['packages/react-dom/src/__tests__/ReactDOMInput-test.js',
       'packages/react-dom/src/client/ReactDOMFiberInput.js',
       'packages/react-dom/src/events/ChangeEventPlugin.js',
       'packages/react-dom/src/shared/DOMProperty.js'], dtype=object)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
def split_random_chunks(file, tokenizer, stride=250, psg_len=350):
    file_tokens = full_tokenize(file, tokenizer)
    total_tokens = len(file_tokens)
    res = []
    for cur_start in range(0, total_tokens, stride):
        # get tokens for current passage
        res.append(tokenizer.decode(file_tokens[cur_start:cur_start+psg_len]))

    return res

In [16]:
dummy_file = combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{dummy_file_path_list[0]}'")['previous_file_content'].values[0]

In [17]:
dummy_file_list = [combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{x}'")['previous_file_content'].values[0] for x in dummy_file_path_list]
dummy_file_patch_list = [chunk for x in dummy_file_list for chunk in split_random_chunks(x, tokenizer)]

Token indices sequence length is longer than the specified maximum sequence length for this model (21407 > 512). Running this sequence through the model will result in indexing errors


In [18]:
dummy_diff_list = [combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{x}'")['diff'].values[0] for x in dummy_file_path_list]

In [19]:
sample_diff = dummy_diff_list[0]

In [20]:
def extract_modified_lines(diff):
    """
    Extracts line numbers of modified lines from a diff string.

    Args:
    - diff (str): The diff string in Linux diff format.

    Returns:
    - List[int]: A list of line numbers that were modified in the previous file state.
    """
    modified_lines = []

    # Regular expression to find all instances of line number indicators in the diff
    line_indicator_regex = re.compile(r'@@ -(\d+),(\d+) \+(\d+),(\d+) @@')

    for match in line_indicator_regex.finditer(diff):
        start_line = int(match.group(1))
        line_count = int(match.group(2))

        # Adding all affected lines by this change to the list
        # for i in range(line_count):
            # modified_lines.append(start_line + i)

        modified_lines.append((start_line, line_count))

    return modified_lines
    
class LineSpanPassage:
    def __init__(self, si, ei, passage):
        self.start_index = si
        self.end_index = ei
        self.passage = passage

    def __repr__(self):
        return f'LineSpanPassage(start_index={self.start_index}, end_index={self.end_index}, passage=\n{self.passage})'

def pluck_diff_sections(file_content, modified_lines, context_lines=0):
    """
    Extracts specific sections from the original file content based on modified lines
    and a specified number of context lines before and after each section.

    Args:
    - file_content (str): The content of the original file.
    - modified_lines (List[Tuple[int, int]]): A list of tuples, each representing
      the starting line number and the count of consecutive modified lines.
    - context_lines (int): The number of extra lines to include before and after
      each section of modified lines.

    Returns:
    - str: The extracted sections joined together by newlines.
    """
    lines = file_content.split('\n')
    passages = []

    for start_line, line_count in modified_lines:
        # Adjusting the start line to be zero-indexed and accounting for context lines
        start_index = max(0, start_line - 1 - context_lines)
        # Ensuring we don't go beyond the file's length
        end_index = min(len(lines), start_line - 1 + line_count + context_lines)

        # Extracting the specified section with context lines
        section = lines[start_index:end_index]
        lsp = LineSpanPassage(start_index, end_index, '\n'.join(section))
        passages.append(lsp)

    # Joining all passages with two newlines for clarity
    return passages

In [21]:
ml = extract_modified_lines(sample_diff)
print(pluck_diff_sections(dummy_file_list[0], modified_lines=ml, context_lines=5)[0])

LineSpanPassage(start_index=242, end_index=258, passage=
      expect(console.error.calls.count()).toBe(1);
      expect(console.error.calls.argsFor(0)[0]).toContain(
        'You provided a `value` prop to a form field ' +
          'without an `onChange` handler.',
      );
    }
  });

  it('distinguishes precision for extra zeroes in string number values', () => {
    spyOnDev(console, 'error');
    class Stub extends React.Component {
      state = {
        value: '3.0000',
      };
      render() {
        return <input type="number" value={this.state.value} />;)


In [22]:
# def get_file_at_commit_from_git(file, commit_id):
#     # Access the specified commit
#     commit = repo.commit(commit_id)

#     # Check if the commit has parents
#     if commit.parents:
#         # Access the first parent of the commit
#         parent_commit = commit.parents[0]

#         # Attempt to get the file content from the parent commit
#         try:
#             blob = parent_commit.tree / file
#             file_content = blob.data_stream.read().decode('utf-8')
#             return file_content
#         except KeyError:
#             # Handle the case where the file does not exist in the parent commit
#             return "The file was not present in the parent commit."
#     else:
#         # Handle the case where the specified commit is the initial commit and has no parents
#         return "The specified commit has no parents (it might be the initial commit)."


from git import Repo, exc

def get_file_at_commit_from_git(file_path, commit_id):
    # Access the specified commit
    try:
        # Initialize the Repo object        
        # The '^' symbol is used to refer to the commit immediately before the specified commit_id
        # Concatenate the commit_id with '^' and the file_path with a ':' separator
        file_content_before_commit = repo.git.show(f"{commit_id}^:{file_path}")
        
        return file_content_before_commit
    except exc.GitCommandError:
        # Return an empty string if the file does not exist in the commit
        return ""

In [23]:
# get_file_at_commit_from_git('packages/react-dom/src/__tests__/ReactDOMInput-test.js', '59763bf7f3ab3b06cd8ab5a5a83ae3dafc667aa9') == tmp

# Getting BM25 Results

In [24]:
bm25_results = bm25_searcher.pipeline(dummy_train_query, dummy_train_row['commit_date'], ranking_depth=K, aggregation_method=BM25_AGGR_STRAT)

In [25]:
evaluator.evaluate(bm25_results, dummy_file_path_list)

{'MAP': 0.0867,
 'P@1': 0.0,
 'P@10': 0.1,
 'P@20': 0.05,
 'P@30': 0.0333,
 'MRR': 0.1429,
 'R@1': 0.0,
 'R@10': 0.25,
 'R@100': 0.75,
 'R@1000': 0.75}

# Getting BERT Rerank @ 250 on top of BM25

In [26]:
bert_reranker = BERTReranker(bert_params)
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(args.bert_best_model, num_labels=1, problem_type='regression')
bert_reranker.model.to(bert_reranker.device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump'}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [27]:
bert_rerank_results = bert_reranker.rerank_pipeline(dummy_train_query, bm25_results)

In [28]:
evaluator.evaluate(bert_rerank_results, dummy_file_path_list)

{'MAP': 0.1421,
 'P@1': 0.0,
 'P@10': 0.0,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 0.0833,
 'R@1': 0.0,
 'R@10': 0.0,
 'R@100': 0.75,
 'R@1000': 0.75}

In [29]:
# def aside():
#     results = [sorted(x.contributing_results, key=lambda res: res.commit_date, reverse=True)[0] for x in bert_rerank_results]
#     files_content = [combined_df.query(f"commit_id=='{x.commit_id}' & file_path=='{x.file_path}'")['cur_file_content'].values[0] for x in results]
#     patches = [chunk for x in files_content for chunk in split_random_chunks(x, tokenizer)]
#     print(len(set(patches).intersection(set(dummy_file_patch_list))))

# aside()

# File Code Reranker

In [30]:
code_reranker_params['aggregation_strategy'] = 'maxp'
file_code_reranker = BERTCodeReranker(code_reranker_params, combined_df)
cur_best_model_path = '../data/2_7/facebook_react/models/combined_diffs/best_model'
# cur_best_model_path = '../data/2_7/facebook_react/models/X/best_model'


file_code_reranker.model = AutoModelForSequenceClassification.from_pretrained(cur_best_model_path, num_labels=1, problem_type='regression')
file_code_reranker.model.to(file_code_reranker.device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 965.97 MB
Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'aggregation_strategy': 'maxp', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 250, 'psg_cnt': 25}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [31]:
file_code_reranker_results = file_code_reranker.rerank_pipeline(dummy_train_query, bert_rerank_results, dummy_commit_id)

Token indices sequence length is longer than the specified maximum sequence length for this model (2672 > 512). Running this sequence through the model will result in indexing errors


In [32]:
evaluator.evaluate(file_code_reranker_results, dummy_file_path_list)

{'MAP': 0.187,
 'P@1': 0.0,
 'P@10': 0.2,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 0.1111,
 'R@1': 0.0,
 'R@10': 0.5,
 'R@100': 0.75,
 'R@1000': 0.75}

After latest file version (gold df iloc 6, combined_Df, sump codereranker)


{'MAP': 0.7,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}

{'MAP': 0.5536,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}


Pre latest version of file

{'MAP': 0.3869,
 'P@1': 0.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 0.5,
 'R@1': 0.0,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}

# Patch Code Reranker

## refactoring a bunch of stuff to be nice and split focused

In [33]:
from abc import ABC, abstractmethod
import random

class SplitStrategy(ABC):
    @abstractmethod
    def split(self, file_content):
        pass

In [34]:
class LineSpanPassage:
        def __init__(self, si, ei, passage):
            self.start_line = si
            self.end_line = ei
            self.passage = passage
    
        def __repr__(self):
            return f'LineSpanPassage(start_line={self.start_line}, end_line={self.end_line}, passage=\n{self.passage})'

class DiffSplitStrategy(SplitStrategy):
    def __init__(self, context_lines=5):
        super().__init__()
        self.context_lines = context_lines

    @classmethod
    def extract_modified_lines(diff):
        """
        Extracts line numbers of modified lines from a diff string.
        Args:
        - diff (str): The diff string in Linux diff format.
        Returns:
        - List[int]: A list of line numbers that were modified in the previous file state.
        """
        modified_lines = []
    
        # Regular expression to find all instances of line number indicators in the diff
        line_indicator_regex = re.compile(r'@@ -(\d+),(\d+) \+(\d+),(\d+) @@')
    
        for match in line_indicator_regex.finditer(diff):
            start_line = int(match.group(1))
            line_count = int(match.group(2))
    
            # Adding all affected lines by this change to the list
            # for i in range(line_count):
                # modified_lines.append(start_line + i)
    
            modified_lines.append((start_line, line_count))
    
        return modified_lines

    def split(self, file_content, diff, *args, **kwargs):
        modified_lines = extract_modified_lines(diff)
        lines = file_content.split('\n')
        passages = []
        for start_line, line_count in modified_lines:
            start_index = max(0, start_line - 1 - self.context_lines)
            end_index = min(len(lines), start_line - 1 + line_count + self.context_lines)
            section = lines[start_index:end_index]
            lsp = LineSpanPassage(start_index, end_index, '\n'.join(section))
            passages.append(lsp)
        return passages

In [35]:
class TokenizedPassage:
        def __init__(self, st, et, passage):
            self.start_token = st
            self.end_token = et
            self.passage = passage

    
        def __repr__(self):
            return f'TokenizedPassage(start_token_index={self.start_token}, end_token_index={self.end_token}, passage=\n{self.passage})'
            
class TokenizedSplitStrategy(SplitStrategy):

    def __init__(self, tokenizer, psg_len, psg_stride):
        super().__init__()
        self.tokenizer = tokenizer
        self.psg_len = psg_len
        self.psg_stride = psg_stride

    # def full_tokenize(self, s):
    #         return self.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()

    def full_tokenize(self, s):
        tokens = self.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()
        # Ensure tokens is always a list
        if not isinstance(tokens, list):
            tokens = [tokens]
        return tokens

    def split(self, file_content, *args, **kwargs):
        # Tokenize the entire file content
        file_tokens = self.full_tokenize(file_content)
        total_tokens = len(file_tokens)
        res = []
        for cur_start in range(0, total_tokens, self.psg_stride):
            # get tokens for current passage
            cur_psg = tokenizer.decode(file_tokens[cur_start:cur_start+self.psg_len])
            res.append(TokenizedPassage(cur_start, min(total_tokens-1, cur_start+self.psg_len-1), cur_psg))
    
        return res

In [37]:
class PassageSplitter:
    def __init__(self, strategy: SplitStrategy):
        self.strategy = strategy

    def split_passages(self, file_content, *args, **kwargs):
        return self.strategy.split(file_content=file_content, *args, **kwargs)

In [38]:
# example to understand the output
# tmp = PassageSplitter(DiffSplitStrategy())
# tmp.split_passages(file_content=dummy_file_list[0], diff=sample_diff)[0]

# tmp = PassageSplitter(TokenizedSplitStrategy(tokenizer=tokenizer, psg_len=args.psg_len, psg_stride=args.psg_stride))

# dummy_file_list[0].split('\n')[:45]
# tmp = PassageSplitter(TokenizedLineSplitStrategy(tokenizer=tokenizer, psg_len=args.psg_len, psg_stride=args.psg_stride))
# tmp.split_passages(file_content=dummy_file_list[0], diff=sample_diff)[0]

In [39]:
class PatchResult:
    def __init__(self, file_path, passage, score):
        self.file_path = file_path
        self.score = score
        self.passage = passage

    def __repr__(self):
        class_name = self.__class__.__name__
        return f'{class_name}(file_path={self.file_path}, passage={self.passage}, score={self.score})'
        
class PatchCodeReranker(Reranker):
    def __init__(self, parameters, split_strategy: SplitStrategy):
        super().__init__(parameters)

        # specific to CodeReranker type

        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model.to(self.device)
        self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model


        self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt'] # how many contributing_results to use per file for reranking
        self.psg_stride = parameters.get('psg_stride', self.psg_len)
        print(f"Initialized Patch Code Reranker with parameters: {parameters}")

        # Passage/patch splitting
        self.split_strategy = split_strategy
        self.passage_splitter = PassageSplitter(split_strategy)


    def rerank(self, query, aggregated_results: List[AggregatedSearchResult], train_commit_id):
        """
        Rerank a aggregated search result list by splitting into patches and getting model scores for each patch.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.

        Returns:
        - List[PatchResult]         [IMP: DOES NOT return file_list]
        """
        self.model.eval()
        query_passage_pairs = self.split_into_query_passage_pairs(query, aggregated_results, train_commit_id)

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning []')
            return []

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, obj.passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, file_path, obj in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # Create a dataloader for feeding the data to the model
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False) # shuffle=False very important for reconstructing the results back into the original order

        scores = self.get_scores(dataloader, self.model)

        # convert the scores to PatchResult objects
        patch_results = [PatchResult(file_path, obj, score) for (query, file_path, obj), score in zip(query_passage_pairs, scores)]

        # sort patch_results by the scores
        sorted_patch_results = sorted(patch_results, key=lambda res: res.score, reverse=True)

        return sorted_patch_results

    def split_into_query_passage_pairs(self, query, aggregated_results, train_commit_id):        
        query_passage_pairs = []
        for agg_result in aggregated_results:
            # get any file result
            most_recent_search_result = agg_result.contributing_results[0] # doesn't matter which version we take, we only care about file_path
            
            # get the file_path
            file_path = most_recent_search_result.file_path

            file_content = get_file_at_commit_from_git(file_path, train_commit_id)
            if not file_content:
                # useless file
                continue

            # warning these asserts are useless since we are using NaNs
            assert file_content is not None, f'file_content is None for commit_id: {train_commit_id}, file_path: {file_path}'
            assert file_path is not None, f'file_path is None for commit_id: {train_commit_id}'
            assert query is not None, 'query is None'

            if pd.isna(file_content):
                # if file_content is NaN, then we can just set file_content to empty string
                print(f'WARNING: file_content is NaN for commit_id: {train_commit_id}, file_path: {file_path}, setting file_content to empty string')
                file_content = ''

            cur_result_passages = self.passage_splitter.split_passages(file_content)

            query_passage_pairs.extend((query, file_path, obj) for obj in cur_result_passages)
            
        return query_passage_pairs

    def rerank_pipeline(self, query, aggregated_results, train_commit_id):
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        # bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results, train_commit_id)
        # min_top_score = reranked_results[-1].score
        # now adjust the scores of bottom_results
        # for i, result in enumerate(bottom_results):
            # result.score = min_top_score - i - 1
        # combine the results
        # reranked_results.extend(bottom_results)
        # assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

In [40]:
split_strategy = TokenizedLineSplitStrategy(tokenizer=tokenizer, psg_len=args.psg_len, psg_stride=args.psg_stride)
code_reranker = PatchCodeReranker(code_reranker_params, split_strategy)
# cur_best_model_path = 'data/2_7/facebook_react/models/X/best_model'
cur_best_model_path = '../data/2_7/facebook_react/models/combined_diffs/best_model'

code_reranker.model = AutoModelForSequenceClassification.from_pretrained(cur_best_model_path, num_labels=1, problem_type='regression')
code_reranker.model.to(code_reranker.device)

Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 965.97 MB


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized Patch Code Reranker with parameters: {'model_name': 'microsoft/codebert-base', 'aggregation_strategy': 'maxp', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 250, 'psg_cnt': 25}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [41]:
# code_reranker_results = code_reranker.rerank_pipeline(dummy_train_query, bert_rerank_results, dummy_commit_id)

In [42]:
# query_results = {}

# for i, row in tqdm(gold_df.iterrows(), total=gold_df.shape[0]):
#     qcid = row['commit_id']
#     qcdate = row['commit_date']
#     qcmsg = row['commit_message']
#     res = bm25_searcher.pipeline(qcmsg, qcdate, ranking_depth=K, aggregation_method=BM25_AGGR_STRAT)
#     res = bert_reranker.rerank_pipeline(qcmsg, res)    
#     res = code_reranker.rerank_pipeline(qcmsg, res, qcid)

#     query_results[qcid] = res

#     with open('query_results.pkl', 'wb') as file:
#         pickle.dump(query_results, file)

In [43]:
query_results = {}
with open('query_results.pkl', 'rb') as file:
    query_results = pickle.load(file)

In [44]:
file_results = []
for i, row in tqdm(gold_df.iterrows(), total=gold_df.shape[0]):
    qcid = row['commit_id']
    # qcdate = row['commit_date']
    # qcmsg = row['commit_message']
    qactual_files = row['actual_files_modified']
    # qfile_list = [combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['previous_file_content'].values[0] for x in qactual_files]

    file_results.append(evaluator.evaluate(query_results[qcid], qactual_files))

avg_file_scores = {metric: round(np.mean([result[metric] for result in file_results]), 4) for metric in file_results[0]}
avg_file_scores

100%|██████████| 100/100 [00:00<00:00, 142.56it/s]


{'MAP': 0.143,
 'P@1': 0.2,
 'P@10': 0.221,
 'P@20': 0.1995,
 'P@30': 0.1787,
 'MRR': 0.3194,
 'R@1': 0.0639,
 'R@10': 0.2758,
 'R@100': 0.4473,
 'R@1000': 0.4756}

In [45]:
# file code reranker
# {
#   "MAP.max": 0.2378,
#   "P@1.max": 0.21,
#   "P@10.max": 0.103,
#   "P@20.max": 0.074,
#   "P@30.max": 0.0563
#   "MRR.max": 0.3114,
#   "R@1.max": 0.0804,
#   "R@10.max": 0.2716,
#   "R@100.max": 0.4756,
#   "R@1000.max": 0.5452,
# }

## patch eval

In [46]:
# diff_splitter = PassageSplitter(DiffSplitStrategy())
# 
# sample_file, sample_diff = dummy_file_list[0], dummy_diff_list[0]
# ground_truth = {}
# for gt_file_path, gt_file_content, gt_diff in zip(dummy_file_path_list, dummy_file_list, dummy_diff_list):
#     patch_list = diff_splitter.split_passages(file_content=gt_file_content, diff=gt_diff)
#     ground_truth[gt_file_path] = []
#     for lsp in patch_list:
#         ground_truth[gt_file_path].append(PatchResult(file_path=gt_file_path, score=-1, passage=lsp))

In [47]:
def calculate_line_overlap(predicted_passage, ground_truth_list):
    """
    Calculate the overlap score between a predicted passage and a set of actual passages.
    
    Args:
    - actual_passages (List[LineSpanPassage]): The actual ground truth passages.
    - predicted_passage (LineSpanPassage): The predicted passage.
    
    Returns:
    - float: The best overlap score between the predicted passage and any of the actual passages.
    """
    best_overlap_score = 0
    actual_passages = [x.passage for x in ground_truth_list]
    for actual in actual_passages:
        if predicted_passage.start_line <= actual.end_line and actual.start_line <= predicted_passage.end_line:
            # Calculate overlap
            overlap_start = max(predicted_passage.start_line, actual.start_line)
            overlap_end = min(predicted_passage.end_line, actual.end_line)
            overlap = overlap_end - overlap_start + 1

            # Calculate the score as the proportion of overlap over the actual passage length
            actual_length = actual.end_line - actual.start_line + 1
            overlap_score = overlap / actual_length

            best_overlap_score = max(best_overlap_score, overlap_score)
    
    return best_overlap_score

class SearchEvaluator:
    def __init__(self, metrics):
        self.metrics = metrics

    @staticmethod
    def precision_at_k(relevant, k):
        return sum(relevant[:k]) / k

    @staticmethod
    def mean_reciprocal_rank(relevant):
        for idx, value in enumerate(relevant):
            if value == 1:
                return 1 / (idx + 1)
        return 0

    @staticmethod
    def calculate_average_precision(relevant):
        pred_rel = [1] * len(relevant)
        relevant_documents_count = 0
        cumulative_precision = 0.0

        # We iterate through the predicted relevance scores
        for i in range(len(pred_rel)):
            # Check if the prediction at this rank is correct (i.e., if it is a relevant document)
            if pred_rel[i] == 1 and relevant[i] == 1:
                relevant_documents_count += 1
                precision_at_i = relevant_documents_count / (i + 1)
                cumulative_precision += precision_at_i

        # The average precision is the cumulative precision divided by the number of relevant documents
        average_precision = cumulative_precision / sum(relevant) if sum(relevant) > 0 else 0
        return average_precision

    # @staticmethod
    # def calculate_recall(relevant, total_modified_files, k):
    #   # Does not work for commit based approach as it can have multiple mentions of the same file across commits leading to a higher than 1 recall
    #     print(total_modified_files)
    #     print(relevant)
    #     return sum(relevant[:k]) / total_modified_files

    @staticmethod
    def calculate_recall(retrieved_files, total_modified, relevant, k):
        # this complicated mess is required as compared to the above much simpler code to support both commit-based and file-based approaches
        # in file-based approach, this is equivalent to the above code
        # in code-based approach, duplicates could be present in retrieved_files, which is why we need to filter them out (the above code would not work in this case)

        return len({file for idx, file in enumerate(retrieved_files[:k])
                        if relevant[idx] == 1
                    }) / total_modified if total_modified > 0 else 0


    def evaluate(self, search_results, actual_modified, eval_type='file'):
        if eval_type == 'random_split_patch':
            retrieved = [result.passage for result in search_results]
            relevant = [1 if file in actual_modified else 0 for file in retrieved]
            total_modified = len(actual_modified)
        elif eval_type == 'file':
            retrieved = [result.file_path for result in search_results]
            relevant = [1 if file in actual_modified else 0 for file in retrieved]
            total_modified = len(actual_modified)
        elif eval_type == 'patch2':
            # search results = List[PatchResults] and actual_modified = Dict[file_path: List[PatchResults]]
            relevant = [0 for _ in range(len(search_results))]
            retrieved = [0 for i in range(len(search_results))]
            for i, pr in enumerate(search_results):
                if pr.file_path not in actual_modified:
                    continue
                # file_path matches, now compute max similarity of passages based on line numbers
                relevant[i] = calculate_line_overlap(pr.passage, actual_modified[pr.file_path])
            threshold = 0.5
            retrieved = [i if val >= threshold else 0 for i, val in enumerate(relevant)]
            relevant = [1 if i >= threshold else 0 for i in relevant]
            total_modified = sum([len(x) for k, x in actual_modified.items()])
            
        evaluations = {}
        for metric in self.metrics:
            if metric == 'MAP':
                evaluations[metric] = self.calculate_average_precision(relevant)
            elif metric == 'MRR':
                evaluations[metric] = self.mean_reciprocal_rank(relevant)
            elif metric.startswith('P@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.precision_at_k(relevant, k)
            elif metric.startswith('R@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.calculate_recall(retrieved, total_modified, relevant, k)

        return {k: round(v, 4) for k, v in evaluations.items()}



evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

In [48]:
# evaluator.evaluate(code_reranker_results, ground_truth, eval_type='patch2')

In [50]:
diff_splitter = PassageSplitter(DiffSplitStrategy())
def get_ground_truth(qfile_path_list, qfile_list, qdiff_list):
    ground_truth = {}
    for gt_file_path, gt_file_content, gt_diff in zip(qfile_path_list, qfile_list, qdiff_list):
        patch_list = diff_splitter.split_passages(file_content=gt_file_content, diff=gt_diff)
        ground_truth[gt_file_path] = []
        for lsp in patch_list:
            ground_truth[gt_file_path].append(PatchResult(file_path=gt_file_path, score=-1, passage=lsp))

    return ground_truth

patch_results = []
for i, row in tqdm(gold_df.iterrows(), total=gold_df.shape[0]):
    qcid = row['commit_id']
    # qcdate = row['commit_date']
    # qcmsg = row['commit_message']
    qfile_path_list = row['actual_files_modified']
    # qfile_list = [combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['previous_file_content'].values[0] for x in qfile_path_list]
    # qdiff_list = [combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['diff'].values[0] for x in qfile_path_list]

    qfile_list = [combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['previous_file_content'].values[0] if not pd.isna(combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['previous_file_content'].values[0]) else '' for x in qfile_path_list]
    qdiff_list = [combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['diff'].values[0] if not pd.isna(combined_df.query(f"commit_id=='{qcid}' & file_path=='{x}'")['diff'].values[0]) else '' for x in qfile_path_list]

    qground = get_ground_truth(qfile_path_list, qfile_list, qdiff_list)

    patch_results.append(evaluator.evaluate(query_results[qcid], qground, eval_type='patch2'))
    

avg_patch_scores = {metric: round(np.mean([result[metric] for result in patch_results]), 4) for metric in patch_results[0]}
avg_patch_scores

100%|██████████| 100/100 [00:10<00:00,  9.79it/s]


{'MAP': 0.082,
 'P@1': 0.07,
 'P@10': 0.061,
 'P@20': 0.0485,
 'P@30': 0.0427,
 'MRR': 0.1321,
 'R@1': 0.0044,
 'R@10': 0.0633,
 'R@100': 0.1924,
 'R@1000': 0.4068}