Goal: given an input ranking of AggregatedCodeResult objects, rerank and evaluate it at the code-snippet (patch) level instead of the file level.

# Changes
1. Eval code files must be split using the same function as the original training splits (this needs a Split class)
2. So your eval list will contain code snippets instead of files

## before 
actual = set(f2,f4,f8)
predictions = [f1, f4, f2, f7, f8]


## now
actual = set(p2.1, p2.2, p2.3, p4.1, p4.2, p8.1)
predictions = [p1.1, p7.8, p8.3, p4.1, p2.3, ...]

In [24]:
import sys
sys.path.append('../src')
sys.path.append('src')

In [2]:
from typing import List
from utils import AggregatedSearchResult, get_combined_df, full_tokenize
from bm25_v2 import BM25Searcher
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import pandas as pd
import numpy as np
from BERTReranker_v4 import BERTReranker
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
# from CodeReranker import BERTCodeReranker

In [3]:
class SearchEvaluator:
    """Compute ranking metrics for a single ranked list of search results.

    ``metrics`` is a list of metric names. :meth:`evaluate` understands
    ``'MAP'``, ``'MRR'``, ``'P@<k>'`` and ``'R@<k>'``; any other name is
    silently skipped.
    """

    def __init__(self, metrics):
        self.metrics = metrics

    @staticmethod
    def precision_at_k(relevant, k):
        """Return the share of the first ``k`` positions flagged relevant.

        The denominator is always ``k``, even if fewer than ``k`` results exist.
        """
        return sum(relevant[:k]) / k

    @staticmethod
    def mean_reciprocal_rank(relevant):
        """Return ``1 / rank`` of the first relevant position, or 0 if none."""
        for rank, flag in enumerate(relevant, start=1):
            if flag == 1:
                return 1 / rank
        return 0

    @staticmethod
    def calculate_average_precision(relevant):
        """Return the average of precision@rank over the relevant positions.

        The normaliser is the number of relevant items that were *retrieved*
        (``sum(relevant)``), not the size of the ground-truth set.
        """
        hits = 0
        cumulative_precision = 0.0
        for rank, flag in enumerate(relevant, start=1):
            if flag == 1:
                hits += 1
                cumulative_precision += hits / rank

        total_relevant = sum(relevant)
        return cumulative_precision / total_relevant if total_relevant > 0 else 0

    @staticmethod
    def calculate_recall(retrieved_files, actual_modified, relevant, k):
        """Return the fraction of ``actual_modified`` found in the top ``k``.

        Distinct relevant items are counted (via a set) so that duplicates in
        ``retrieved_files`` - possible in the commit/code-based approaches -
        cannot push recall above 1. For a duplicate-free file ranking this is
        the same as ``sum(relevant[:k]) / len(actual_modified)``.
        """
        if len(actual_modified) == 0:
            return 0
        found = {item for idx, item in enumerate(retrieved_files[:k]) if relevant[idx] == 1}
        return len(found) / len(actual_modified)

    def evaluate(self, search_results, actual_modified, eval_type='file'):
        """Score ``search_results`` against the ground truth ``actual_modified``.

        Args:
            search_results: ranked result objects; ``.passage`` is read when
                ``eval_type == 'patch'``, otherwise ``.file_path``.
            actual_modified: collection of ground-truth items (its ``len`` is
                the recall denominator).
            eval_type: ``'patch'`` for passage-level evaluation, anything else
                for file-level evaluation.

        Returns:
            Dict mapping each recognised metric name to its value rounded to
            four decimals.
        """
        attr = 'passage' if eval_type == 'patch' else 'file_path'
        retrieved = [getattr(result, attr) for result in search_results]

        # Build the lookup once: O(1) membership per result instead of a scan.
        try:
            ground_truth = set(actual_modified)
        except TypeError:
            # Unhashable ground-truth items: fall back to the original scan.
            ground_truth = actual_modified
        relevant = [1 if item in ground_truth else 0 for item in retrieved]

        evaluations = {}
        for metric in self.metrics:
            if metric == 'MAP':
                evaluations[metric] = self.calculate_average_precision(relevant)
            elif metric == 'MRR':
                evaluations[metric] = self.mean_reciprocal_rank(relevant)
            elif metric.startswith('P@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.precision_at_k(relevant, k)
            elif metric.startswith('R@'):
                k = int(metric.split('@')[1])
                evaluations[metric] = self.calculate_recall(retrieved, actual_modified, relevant, k)

        return {name: round(value, 4) for name, value in evaluations.items()}

In [4]:
class ModelEvaluator:
    """Run a retrieval model (plus optional rerankers) over commits and average the metrics.

    Args:
        model: object exposing ``pipeline(query, commit_date, ranking_depth=..., aggregation_method=...)``.
        eval_model: object exposing ``evaluate(search_results, actual_modified_files)``
            that returns a dict of metric name -> value.
        combined_df: dataframe with at least ``commit_id``, ``commit_date`` and
            ``file_path`` columns; used for sampling and as the fallback
            source of ground-truth files.
        seed: random state used when sampling commits.
    """

    # Column names under which an evaluation dataframe may carry its own
    # ground-truth file list. The gold dataframe loaded in this notebook uses
    # the second spelling.
    GOLD_FILE_COLUMNS = ('actual_modified_files', 'actual_files_modified')

    def __init__(self, model, eval_model, combined_df, seed=42):
        self.model = model
        self.eval_model = eval_model
        self.combined_df = combined_df
        self.seed = seed

    def sample_commits(self, n):
        """Sample ``n`` distinct commits dated after the median ``commit_date``.

        Raises:
            ValueError: if fewer than ``n`` distinct commits exist overall or
                in the more recent half that is actually sampled from.
        """
        available = self.combined_df.commit_id.nunique()
        if available < n:
            raise ValueError(f'Not enough commits to sample. Required: {n}, available: {available}')

        midpoint_date = np.median(self.combined_df['commit_date'])
        recent_df = self.combined_df[self.combined_df['commit_date'] > midpoint_date]
        candidates = recent_df.drop_duplicates(subset='commit_id')

        # The population is the recent half, so check it too instead of
        # letting pandas fail with a less specific message.
        if len(candidates) < n:
            raise ValueError(f'Not enough recent commits to sample. Required: {n}, available: {len(candidates)}')

        return candidates.sample(n=n, replace=False, random_state=self.seed)

    def _actual_modified_files(self, row, columns):
        """Return the ground-truth files for ``row``.

        Uses the row's own gold column when ``columns`` contains one of
        ``GOLD_FILE_COLUMNS``; otherwise looks the commit up in ``combined_df``.
        """
        for column in self.GOLD_FILE_COLUMNS:
            if column in columns:
                return row[column]
        same_commit = self.combined_df['commit_id'] == row['commit_id']
        return self.combined_df[same_commit]['file_path'].tolist()

    def evaluate_df(self, df, k=1000, aggregation_strategy=None, rerankers=None):
        """Evaluate every row of ``df`` and return one metrics dict per row.

        Each row must provide ``commit_message`` (the query), ``commit_date``
        and ``commit_id``. Rerankers are applied in order; a reranker whose
        class is named ``BERTCodeReranker`` additionally receives the commit id.
        """
        if rerankers is None:
            rerankers = []

        results = []
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            cur_query = row['commit_message']
            search_results = self.model.pipeline(cur_query, row['commit_date'], ranking_depth=k, aggregation_method=aggregation_strategy)
            for reranker in rerankers:
                if reranker.__class__.__name__ == 'BERTCodeReranker':
                    search_results = reranker.rerank_pipeline(cur_query, search_results, row['commit_id'])
                else:
                    search_results = reranker.rerank_pipeline(cur_query, search_results)

            actual_modified_files = self._actual_modified_files(row, df.columns)
            results.append(self.eval_model.evaluate(search_results, actual_modified_files))
        return results

    def evaluate_sampling(self, n=100, k=1000, output_file_path=None, overwrite_eval=False, aggregation_strategy=None, rerankers=None, gold_df=None):
        """Evaluate on ``gold_df`` if given, else on ``n`` sampled commits; return averaged metrics.

        When ``output_file_path`` is set (and either does not exist yet or
        ``overwrite_eval`` is true) a plain-text report is written there.

        Returns:
            Dict of metric name -> mean over all evaluated commits, rounded to
            four decimals. Empty when no commit was evaluated.
        """
        if rerankers is None:
            rerankers = []

        if output_file_path is None:
            print("WARNING: Output file path not provided, not writing results to file")

        model_name = self.model.__class__.__name__

        if not overwrite_eval and output_file_path and os.path.exists(output_file_path):
            print(f'Output file {output_file_path} already exists - not writing to file, set overwrite_eval flag to True for that...')
            # Still run the evaluation, just do not touch the existing file.
            output_file_path = None

        if gold_df is None:
            sampled_commits = self.sample_commits(n)
            results = self.evaluate_df(sampled_commits, k, aggregation_strategy, rerankers)
        else:
            print(f'Found gold_df, evaluating on {len(gold_df)} commits')
            print(gold_df.info())
            results = self.evaluate_df(gold_df, k, aggregation_strategy, rerankers)

        if results:
            avg_scores = {metric: round(np.mean([result[metric] for result in results]), 4) for metric in results[0]}
        else:
            avg_scores = {}

        if output_file_path is not None:
            with open(output_file_path, "w") as file:
                file.write(f"Model Name: {model_name}\n")
                if len(rerankers) > 0:
                    file.write("Rerankers:\n")
                    for reranker in rerankers:
                        # Model identifiers may contain '/', so normalise them for the report.
                        reranker_model_name = reranker.model.config.name_or_path.replace('/', '_')
                        file.write(f"{reranker.__class__.__name__} ({reranker_model_name}) @ {reranker.rerank_depth}\n")

                # Report the number of commits actually evaluated; this equals
                # n when sampling and len(gold_df) when a gold set is used.
                file.write(f"Sample Size: {len(results)}\n")
                file.write("Evaluation Metrics:\n")
                for key, value in avg_scores.items():
                    file.write(f"{key}: {value}\n")

            print(f'Evaluation results written to {output_file_path}')

        return avg_scores

In [5]:
class Reranker:
    """Shared plumbing for rerankers: device selection, batched scoring, score aggregation.

    ``parameters`` must contain ``aggregation_strategy``, ``batch_size`` and
    ``rerank_depth``; ``use_gpu`` is read only when CUDA is available.
    Subclasses implement :meth:`rerank`.
    """

    def __init__(self, parameters):
        self.parameters = parameters

        gpu_enabled = torch.cuda.is_available() and parameters['use_gpu']
        self.device = torch.device("cuda" if gpu_enabled else "cpu")
        print(f'Using device: {self.device}')
        if gpu_enabled:
            # Report which GPU is in use and how much memory is already allocated.
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")

        # How per-passage scores are combined into one score per result.
        self.aggregation_strategy = parameters['aggregation_strategy']
        # Batch size used when scoring passages.
        self.batch_size = parameters['batch_size']
        self.rerank_depth = parameters['rerank_depth']

    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        """Rerank ``aggregated_results`` for ``query``; must be overridden."""
        raise NotImplementedError

    def aggregate_scores(self, passage_scores):
        """Collapse ``passage_scores`` into one value using ``self.aggregation_strategy``.

        Returns 0.0 for an empty input. Supported strategies are ``'firstp'``,
        ``'maxp'``, ``'avgp'`` and ``'sump'``; anything else raises ValueError.
        """
        if len(passage_scores) == 0:
            return 0.0

        reducers = {
            'firstp': lambda values: values[0],
            'maxp': max,
            'avgp': lambda values: sum(values) / len(values),
            'sump': sum,
        }
        reducer = reducers.get(self.aggregation_strategy)
        if reducer is None:
            raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")
        return reducer(passage_scores)

    def get_scores(self, dataloader, model):
        """Run ``model`` over ``dataloader`` without gradients and return the logits as a flat list.

        Each batch is an ``(input_ids, attention_mask)`` pair; both are moved
        to ``self.device`` before the forward pass.
        """
        collected = []
        with torch.no_grad():
            for input_ids, attention_mask in dataloader:
                outputs = model(
                    input_ids.to(self.device),
                    token_type_ids=None,
                    attention_mask=attention_mask.to(self.device),
                )
                # Drop the trailing logit dimension so each example yields one score.
                batch_scores = outputs.logits.detach().cpu().numpy().squeeze(-1)
                collected.extend(batch_scores)
        return collected

In [6]:
class BERTCodeReranker:
    """Rerank aggregated file results by scoring passages of each file with a sequence-classification model.

    For every result the file is fetched at a given commit, cut into
    overlapping token windows, each window is scored against the query, and
    the window scores are aggregated into one score per result.

    Args:
        parameters: dict providing ``model_name``, ``use_gpu`` (read when CUDA
            is available), ``psg_len``, ``psg_cnt``, ``aggregation_strategy``,
            ``batch_size``, ``rerank_depth`` and optionally ``psg_stride``
            (defaults to ``psg_len``).
        combined_df: stored on the instance; not read by the current
            passage-splitting code, which fetches file content from git.
    """

    def __init__(self, parameters, combined_df):
        self.combined_df = combined_df
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")

        self.psg_len = parameters['psg_len']  # tokens of file content per passage
        # NOTE: psg_cnt is stored but no per-file cap is currently applied
        # when splitting (the cut-off is disabled).
        self.psg_cnt = parameters['psg_cnt']
        self.psg_stride = parameters.get('psg_stride', self.psg_len)  # step between passage starts
        self.aggregation_strategy = parameters['aggregation_strategy']  # how passage scores are combined per result
        self.batch_size = parameters['batch_size']  # batch size for scoring
        self.rerank_depth = parameters['rerank_depth']
        self.max_seq_length = self.tokenizer.model_max_length  # max sequence length for the model

        print(f"Initialized Code File BERT reranker with parameters: {parameters}")

    def rerank(self, query, aggregated_results: List[AggregatedSearchResult], train_commit_id):
        """Rescore ``aggregated_results`` in place with model scores and sort them descending.

        Args:
            query: the query string.
            aggregated_results: results from the previous stage (the caller is
                expected to have truncated them to ``rerank_depth``).
            train_commit_id: commit at which each file's content is read.

        Returns:
            The same list object, with ``score`` overwritten on every element
            and sorted by that score. If no passages could be built, the list
            is returned untouched.
        """
        self.model.eval()

        query_passage_pairs, per_result_contribution = self.split_into_query_passage_pairs(query, aggregated_results, train_commit_id)

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # shuffle=False is essential: scores are mapped back to results by position.
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        scores = self.get_scores(dataloader, self.model)

        # Hand each result the contiguous slice of scores that belongs to its passages.
        score_index = 0
        for agg_result, passage_count in zip(aggregated_results, per_result_contribution):
            end_index = score_index + passage_count
            # Check the slice end rather than the start so that a result with
            # zero passages at the tail of the list is not rejected.
            assert end_index <= len(scores), f'end_index {end_index} is greater than scores length {len(scores)}'
            agg_result.score = self.aggregate_scores(scores[score_index:end_index])
            score_index = end_index

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        """Run ``model`` over ``dataloader`` without gradients and return the logits as a flat list."""
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to the selected device
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get scores from the model
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.

        Returns 0.0 for an empty input; raises ValueError for an unknown strategy.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def split_into_query_passage_pairs(self, query, aggregated_results, train_commit_id):
        """Build ``(query, passage)`` pairs for every result.

        For each result, the file path of its first contributing result is
        used to fetch the file content at ``train_commit_id``; the content is
        tokenized and cut into windows of ``psg_len`` tokens starting every
        ``psg_stride`` tokens. Each passage is the decoded concatenation of
        the path tokens and one window.

        Returns:
            ``(query_passage_pairs, per_result_contribution)`` where the second
            list holds, per result and in order, how many passages it produced.
        """
        def tokenize_untruncated(text):
            # NOTE(review): add_special_tokens=True means special tokens end up
            # inside the decoded passage text - confirm this matches training.
            return self.tokenizer.encode_plus(text, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()

        query_passage_pairs = []
        per_result_contribution = []

        for agg_result in aggregated_results:
            # First contributing result is taken as-is (no re-sorting by date here).
            file_path = agg_result.contributing_results[0].file_path

            # get_file_at_commit_from_git is not defined in this class; it is
            # expected to be available at module level.
            file_content = get_file_at_commit_from_git(file_path, train_commit_id)

            assert query is not None, 'query is None'

            path_tokens = tokenize_untruncated(file_path)

            if pd.isna(file_content):
                # Missing content is scored as an empty file rather than aborting.
                print(f'WARNING: file_content is NaN for commit_id: {train_commit_id}, file_path: {file_path}, setting file_content to empty string')
                file_content = ''

            file_tokens = tokenize_untruncated(file_content)

            cur_result_passages = []
            for cur_start in range(0, len(file_tokens), self.psg_stride):
                # Every passage is prefixed with the path tokens.
                cur_passage = path_tokens + file_tokens[cur_start:cur_start + self.psg_len]
                cur_result_passages.append(self.tokenizer.decode(cur_passage))

            per_result_contribution.append(len(cur_result_passages))
            query_passage_pairs.extend((query, passage) for passage in cur_result_passages)
        return query_passage_pairs, per_result_contribution

    def rerank_pipeline(self, query, aggregated_results, train_commit_id):
        """Rerank the top ``rerank_depth`` results and append the rest below them.

        Results beyond ``rerank_depth`` keep their relative order and get
        scores strictly below the lowest reranked score.
        """
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results, train_commit_id)
        min_top_score = reranked_results[-1].score
        # Push the untouched tail below the reranked head, preserving its order.
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

In [7]:
class PatchResult:
    """A single scored passage (code snippet) produced by patch-level reranking."""

    def __init__(self, passage, score):
        self.passage = passage
        self.score = score

    def __repr__(self):
        # Use the runtime class name so subclasses print their own name.
        return '{}({}, {})'.format(type(self).__name__, self.passage, self.score)

In [8]:
class PatchCodeReranker(Reranker):
    """Score individual code passages against a query and return them as a flat ranked list.

    Unlike a file-level reranker, the output is a list of ``PatchResult``
    objects (one per passage), not the input result objects.

    Args:
        parameters: dict with the keys required by ``Reranker`` plus
            ``model_name``, ``psg_len``, ``psg_cnt`` and optionally
            ``psg_stride`` (defaults to ``psg_len``).
        combined_df: optional dataframe with ``commit_id``, ``file_path`` and
            ``cur_file_content`` columns. When omitted, the module-level
            ``combined_df`` is used, as before.
    """

    def __init__(self, parameters, combined_df=None):
        super().__init__(parameters)
        self.combined_df = combined_df

        # specific to CodeReranker type
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model.to(self.device)
        self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model

        self.psg_len = parameters['psg_len']  # tokens per passage
        # NOTE: psg_cnt is stored but no per-file passage cap is applied.
        self.psg_cnt = parameters['psg_cnt']
        self.psg_stride = parameters.get('psg_stride', self.psg_len)  # step between passage starts
        print(f"Initialized Code File BERT reranker with parameters: {parameters}")

    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        """
        Score every passage of every result and return them sorted by score.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.

        Returns a list of PatchResult sorted by descending score. If no
        passages could be built, ``aggregated_results`` is returned unchanged
        (note the different element type in that case).
        """
        self.model.eval()
        query_passage_pairs, _ = self.split_into_query_passage_pairs(query, aggregated_results)

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # shuffle=False is essential: scores are matched to passages by position.
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        scores = self.get_scores(dataloader, self.model)

        # Pair each passage text with its score and rank the passages.
        patch_results = [PatchResult(passage, score) for (_, passage), score in zip(query_passage_pairs, scores)]
        return sorted(patch_results, key=lambda res: res.score, reverse=True)

    def split_into_query_passage_pairs(self, query, aggregated_results):
        """Build ``(query, passage)`` pairs from the most recent version of each result's file.

        Side effect: each result's ``contributing_results`` list is sorted in
        place, newest commit first.

        Returns:
            ``(query_passage_pairs, per_result_contribution)`` where the second
            list holds, per result and in order, how many passages it produced.
        """
        source_df = self.combined_df
        if source_df is None:
            # Backward compatibility: fall back to the notebook-level dataframe.
            source_df = combined_df

        query_passage_pairs = []
        per_result_contribution = []
        for agg_result in aggregated_results:
            agg_result.contributing_results.sort(key=lambda res: res.commit_date, reverse=True)
            # get most recent file version
            most_recent_search_result = agg_result.contributing_results[0]
            file_path = most_recent_search_result.file_path
            commit_id = most_recent_search_result.commit_id
            file_content = source_df[(source_df['commit_id'] == commit_id) & (source_df['file_path'] == file_path)]['cur_file_content'].values[0]

            # These asserts only catch None; missing content normally shows up
            # as NaN and is handled right below.
            assert file_content is not None, f'file_content is None for commit_id: {commit_id}, file_path: {file_path}'
            assert file_path is not None, f'file_path is None for commit_id: {commit_id}'
            assert query is not None, 'query is None'

            if pd.isna(file_content):
                # Missing content is treated as an empty file rather than aborting.
                print(f'WARNING: file_content is NaN for commit_id: {commit_id}, file_path: {file_path}, setting file_content to empty string')
                file_content = ''

            # Chunk with this reranker's own tokenizer and window settings
            # instead of a module-level tokenizer and hard-coded defaults.
            cur_result_passages = split_random_chunks(file_content, self.tokenizer, stride=self.psg_stride, psg_len=self.psg_len)

            per_result_contribution.append(len(cur_result_passages))
            query_passage_pairs.extend((query, passage) for passage in cur_result_passages)
        return query_passage_pairs, per_result_contribution

    def rerank_pipeline(self, query, aggregated_results):
        """Rerank the passages of the top ``rerank_depth`` results.

        Results beyond ``rerank_depth`` are dropped: the output is a
        passage-level list and cannot be merged with file-level leftovers.
        """
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        return self.rerank(query, top_results)



# Loading some dummy data

In [9]:
class Args:
    """Minimal attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)

# Notebook-wide configuration, standing in for command-line arguments.
args = Args(
    index_path='../data/2_7/facebook_react/index_commit_tokenized',
    repo_path='../data/2_7/facebook_react', k=1000, n=100,
    model_path='microsoft/codebert-base', overwrite_cache=False,
    batch_size=32, num_epochs=10, learning_rate=5e-05,
    run_name='debug',
    notes='debug (ignore)',
    num_positives=10, num_negatives=10, train_depth=1000, num_workers=8,
    train_commits=1000, psg_cnt=25, use_gpu=True,
    rerank_depth=100, do_train=True, do_eval=True, eval_gold=True, openai_model='gpt4',
    overwrite_eval=False, sanity_check=True, debug=False,
    psg_len=350, psg_stride=250, ignore_gold_in_training=False,
    eval_folder='repr_0.1663', use_gpt_train=True,
    aggregation_strategy='sump',
    bert_best_model='../data/combined_commit_train/best_model',
    best_model_path='../data/2_7/facebook_react/models/bce/best_model'
)

# Metric names understood by SearchEvaluator (MAP, MRR, P@k, R@k).
metrics =['MAP', 'P@1', 'P@10', 'P@20', 'P@30', 'MRR', 'R@1', 'R@10', 'R@100', 'R@1000']
repo_path = args.repo_path
repo_name = repo_path.split('/')[-1]
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'
eval_path = os.path.join(repo_path, 'eval')
# exist_ok avoids the check-then-create race and is a no-op when the folder exists.
os.makedirs(eval_path, exist_ok=True)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

test_path = os.path.join('..', 'gold', 'facebook_react', 'v2_facebook_react_gpt4_gold.parquet')
# test_path = os.path.join('gold', 'facebook_react', 'v2_facebook_react_gpt4_gold.parquet')
gold_df = pd.read_parquet(test_path)
# Use the transformed message as the query; keep the original under a new name.
gold_df = gold_df.rename(columns={'commit_message': 'original_message', f'transformed_message_{args.openai_model}': 'commit_message'})

Loaded index at ../data/2_7/facebook_react/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [10]:
import git

In [11]:
# Open the local clone of the repository through GitPython.
local_path = '../repos/facebook_react'
repo = git.Repo(local_path)

In [12]:
# Parameters for the code rerankers defined above. Their constructors read
# model_name, aggregation_strategy, batch_size, use_gpu, rerank_depth,
# psg_len, psg_stride and psg_cnt; the remaining keys are not read by any
# class in this section.
code_reranker_params = {
        'model_name': args.model_path,
        'aggregation_strategy': 'maxp',
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': args.rerank_depth,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': BM25_AGGR_STRAT,
        'psg_len': args.psg_len,
        'psg_stride': args.psg_stride,
        'psg_cnt': args.psg_cnt,
    }

# Parameters without psg_len / psg_stride, so they do not fit the code
# rerankers above. NOTE(review): presumably meant for the imported
# BERTReranker - confirm.
bert_params = {
        'model_name': args.model_path,
        'psg_cnt': 5,
        'aggregation_strategy': 'sump',
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': 250,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': 'sump',
    }

In [13]:
# Take one gold commit (positional row 6) as a worked example and display it.
dummy_train_row = gold_df.iloc[6]
dummy_commit_id = dummy_train_row.commit_id
dummy_train_row

commit_id                         323efbc33c27a602a4aab8519f58feba1e0a216c
commit_date                                                     1512398372
original_message         Ensure value and defaultValue do not assign fu...
actual_files_modified    [packages/react-dom/src/__tests__/ReactDOMInpu...
commit_message           Input properties 'value' and 'defaultValue' ac...
Name: 6, dtype: object

In [14]:
# commit_message is the transformed message after the rename above; it serves as the query.
dummy_train_query = dummy_train_row.commit_message
# Paths recorded for this commit in the gold data.
dummy_file_path_list = dummy_train_row.actual_files_modified

In [15]:
dummy_file_path_list

array(['packages/react-dom/src/__tests__/ReactDOMInput-test.js',
       'packages/react-dom/src/client/ReactDOMFiberInput.js',
       'packages/react-dom/src/events/ChangeEventPlugin.js',
       'packages/react-dom/src/shared/DOMProperty.js'], dtype=object)

In [16]:
# Module-level tokenizer for the configured model; passed to split_random_chunks in the cells below.
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
def split_random_chunks(file, tokenizer, stride=250, psg_len=350):
    """Split `file` into overlapping passages of decoded tokens.

    Despite the name, nothing here is random: windows start at token offsets
    0, stride, 2*stride, ... and each covers up to `psg_len` tokens, so
    consecutive passages overlap whenever `psg_len > stride`. The last
    window(s) may be shorter than `psg_len`.

    Args:
        file: Content to split; passed straight to `full_tokenize`
            (presumably the file text as a string - confirm in utils).
        tokenizer: Object handed to `full_tokenize` and used for `decode`.
        stride: Distance in tokens between consecutive window starts.
        psg_len: Maximum number of tokens per passage.

    Returns:
        List of decoded passage strings, in file order. Empty when
        `full_tokenize` yields no tokens.
    """
    token_ids = full_tokenize(file, tokenizer)
    window_starts = range(0, len(token_ids), stride)
    # Slicing past the end is safe; trailing windows are simply shorter.
    return [
        tokenizer.decode(token_ids[start:start + psg_len])
        for start in window_starts
    ]

In [17]:
# `previous_file_content` of the first gold file for the example commit
# (presumably the file as it was before the commit - confirm against get_combined_df).
# `.values[0]` raises IndexError if no row matches this (commit_id, file_path) pair.
dummy_file = combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{dummy_file_path_list[0]}'")['previous_file_content'].values[0]

In [18]:
# Same lookup for every gold file: one `previous_file_content` value per path, in path order.
# One DataFrame.query call per path; the first matching row is taken.
dummy_file_list = [combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{x}'")['previous_file_content'].values[0] for x in dummy_file_path_list]

In [19]:
# Flatten the chunks of all gold files into one list: this is the patch-level gold set.
# Chunks use split_random_chunks' default stride/psg_len.
dummy_file_patch_list = [chunk for x in dummy_file_list for chunk in split_random_chunks(x, tokenizer)]

Token indices sequence length is longer than the specified maximum sequence length for this model (21407 > 512). Running this sequence through the model will result in indexing errors


In [20]:
# `diff` column value for each gold file at the example commit, in path order.
dummy_diff_list = [combined_df.query(f"commit_id=='{dummy_commit_id}' & file_path=='{x}'")['diff'].values[0] for x in dummy_file_path_list]

In [65]:
dummy_diff_list[0]

'@@ -248,6 +248,23 @@ describe(\'ReactDOMInput\', () => {\n     }\n   });\n \n+  it(\'performs a state change from "" to 0\', () => {\n+    class Stub extends React.Component {\n+      state = {\n+        value: \'\',\n+      };\n+      render() {\n+        return <input type="number" value={this.state.value} readOnly={true} />;\n+      }\n+    }\n+\n+    var stub = ReactTestUtils.renderIntoDocument(<Stub />);\n+    var node = ReactDOM.findDOMNode(stub);\n+    stub.setState({value: 0});\n+\n+    expect(node.value).toEqual(\'0\');\n+  });\n+\n   it(\'distinguishes precision for extra zeroes in string number values\', () => {\n     spyOnDev(console, \'error\');\n     class Stub extends React.Component {\n@@ -595,6 +612,7 @@ describe(\'ReactDOMInput\', () => {\n     var node = container.firstChild;\n \n     expect(node.value).toBe(\'0\');\n+    expect(node.defaultValue).toBe(\'0\');\n   });\n \n   it(\'should properly transition from 0 to an empty value\', function() {\n@@ -606,6 +624,43 

In [21]:
sample_diff = dummy_diff_list[0]

In [22]:
def get_file_at_commit_from_git(file, commit_id):
    """Return the UTF-8 text of `file` as stored in the first parent of `commit_id`.

    The commit is resolved through the notebook-level `repo` object
    (presumably a GitPython `Repo` - confirm where `repo` is created).

    NOTE: failures are reported in-band. Instead of raising, the function
    returns a human-readable message string when the commit has no parents
    or when a KeyError is raised while reading the file, so callers cannot
    tell these apart from real file content without comparing against the
    messages.

    Args:
        file: Repository-relative path of the file.
        commit_id: Identifier of the commit whose first parent is inspected.

    Returns:
        The decoded file content, or one of the two explanatory messages.
    """
    parents = repo.commit(commit_id).parents
    if not parents:
        # No parent to read from (e.g. the repository's initial commit).
        return "The specified commit has no parents (it might be the initial commit)."

    try:
        # A missing path surfaces as KeyError from the tree lookup.
        blob = parents[0].tree / file
        return blob.data_stream.read().decode('utf-8')
    except KeyError:
        return "The file was not present in the parent commit."

In [25]:
# get_file_at_commit_from_git('packages/react-dom/src/__tests__/ReactDOMInput-test.js', '59763bf7f3ab3b06cd8ab5a5a83ae3dafc667aa9') == tmp

# Getting BM25 Results

In [26]:
# First-stage retrieval for the example query, K results deep.
# The commit date is passed alongside the query (presumably to restrict the
# search to history before this commit - confirm in BM25Searcher.pipeline).
bm25_results = bm25_searcher.pipeline(dummy_train_query, dummy_train_row['commit_date'], ranking_depth=K, aggregation_method=BM25_AGGR_STRAT)

In [52]:
# bm25 will not necessarily retrieve the latest version of a file
# https://github.com/facebook/react/commits/323efbc33c27a602a4aab8519f58feba1e0a216c/packages/react-dom/src/__tests__/ReactDOMInput-test.js

def check(cmt, file_path, res):
    """Tell whether commit `cmt` contributed to the first result for `file_path`.

    Only the first entry of `res` whose `file_path` matches is inspected;
    later entries for the same path are ignored.

    Args:
        cmt: Commit id to look for.
        file_path: Path identifying the aggregated result of interest.
        res: Iterable of results exposing `file_path` and
            `contributing_results` (each item having `commit_id`).

    Returns:
        True if any contributing result of that first matching entry has
        `commit_id == cmt`; False otherwise, including when no entry matches.
    """
    for candidate in res:
        if candidate.file_path != file_path:
            continue
        # First match decides the outcome; do not look at later duplicates.
        return any(hit.commit_id == cmt for hit in candidate.contributing_results)
    return False

# Sanity check: re-run BM25 with a much deeper ranking (28000) and verify that
# commit 59763bf... is among the contributing results of the first entry for this test file.
check('59763bf7f3ab3b06cd8ab5a5a83ae3dafc667aa9', 'packages/react-dom/src/__tests__/ReactDOMInput-test.js', bm25_searcher.pipeline(dummy_train_query, dummy_train_row['commit_date'], ranking_depth=28000, aggregation_method=BM25_AGGR_STRAT))

True

In [29]:
dummy_train_query

"Input properties 'value' and 'defaultValue' accepting and assigning functions and symbols leads to improper handling and inconsistencies in numeric equality checks."

In [27]:
evaluator.evaluate(bm25_results, dummy_file_path_list)

{'MAP': 0.0867,
 'P@1': 0.0,
 'P@10': 0.1,
 'P@20': 0.05,
 'P@30': 0.0333,
 'MRR': 0.1429,
 'R@1': 0.0,
 'R@10': 0.25,
 'R@100': 0.75,
 'R@1000': 0.75}

# Getting BERT Rerank @ 250 on top of BM25

In [30]:
# Build the reranker from bert_params, then replace its model with the checkpoint
# at args.bert_best_model, loaded with a single-output regression head.
bert_reranker = BERTReranker(bert_params)
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(args.bert_best_model, num_labels=1, problem_type='regression')
# NOTE(review): the captured output warns about newly initialized classifier weights for
# microsoft/codebert-base; presumably that comes from the constructor loading the base
# model before the swap above - confirm the fine-tuned weights are the ones in use.
# Move the swapped-in model to the reranker's device; as the last expression it also displays the model.
bert_reranker.model.to(bert_reranker.device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump'}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [31]:
# Second stage: rerank the BM25 results for the example query with the BERT reranker.
bert_rerank_results = bert_reranker.rerank_pipeline(dummy_train_query, bm25_results)

In [32]:
evaluator.evaluate(bert_rerank_results, dummy_file_path_list)

{'MAP': 0.1421,
 'P@1': 0.0,
 'P@10': 0.0,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 0.0833,
 'R@1': 0.0,
 'R@10': 0.0,
 'R@100': 0.75,
 'R@1000': 0.75}

In [None]:
# def aside():
#     results = [sorted(x.contributing_results, key=lambda res: res.commit_date, reverse=True)[0] for x in bert_rerank_results]
#     files_content = [combined_df.query(f"commit_id=='{x.commit_id}' & file_path=='{x.file_path}'")['cur_file_content'].values[0] for x in results]
#     patches = [chunk for x in files_content for chunk in split_random_chunks(x, tokenizer)]
#     print(len(set(patches).intersection(set(dummy_file_patch_list))))
    
# aside()

# File Code Reranker

In [33]:
# Force 'sump' aggregation for this run. This mutates the shared code_reranker_params dict,
# so the setting persists for any later cell that reuses it.
code_reranker_params['aggregation_strategy'] = 'sump'
# NOTE(review): the BERTCodeReranker import is commented out in the import cell at the top
# of the notebook - it must be defined or imported elsewhere; confirm.
file_code_reranker = BERTCodeReranker(code_reranker_params, combined_df)
# Checkpoint to evaluate; the commented line below is an alternative checkpoint kept for quick switching.
cur_best_model_path = '../data/2_7/facebook_react/models/combined_diffs/best_model'
# cur_best_model_path = '../data/2_7/facebook_react/models/X/best_model'


# Replace the reranker's model with the chosen checkpoint (single-output regression head)
# and move it to the reranker's device; the last expression also displays the model.
file_code_reranker.model = AutoModelForSequenceClassification.from_pretrained(cur_best_model_path, num_labels=1, problem_type='regression')
file_code_reranker.model.to(file_code_reranker.device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 965.97 MB
Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 250, 'psg_cnt': 25}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [34]:
# Third stage: rerank the BERT-reranked results with the file-level code reranker.
# Unlike the BERT reranker call, this pipeline also receives the example commit id.
file_code_reranker_results = file_code_reranker.rerank_pipeline(dummy_train_query, bert_rerank_results, dummy_commit_id)

Token indices sequence length is longer than the specified maximum sequence length for this model (2673 > 512). Running this sequence through the model will result in indexing errors


In [35]:
evaluator.evaluate(file_code_reranker_results, dummy_file_path_list)

{'MAP': 0.7,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}

After switching to the latest file version (gold_df.iloc[6], combined_df, sump code reranker):


{'MAP': 0.7,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}

{'MAP': 0.5536,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}


Before switching to the latest file version:

{'MAP': 0.3869,
 'P@1': 0.0,
 'P@10': 0.3,
 'P@20': 0.15,
 'P@30': 0.1,
 'MRR': 0.5,
 'R@1': 0.0,
 'R@10': 0.75,
 'R@100': 0.75,
 'R@1000': 0.75}

In [None]:
# results = [sorted(x.contributing_results, key=lambda res: res.commit_date, reverse=True)[0] for x in file_code_reranker_results]
# def aside():
#     tmp = [results[1]]
#     print(tmp)
#     # print(dummy_train_row.commit_date >= results[1].commit_date)
#     files_content = [combined_df.query(f"commit_id=='{x.commit_id}' & file_path=='{x.file_path}'")['cur_file_content'].values[0] for x in tmp]
#     return files_content[0]
#     # patches = [chunk for x in files_content for chunk in split_random_chunks(x, tokenizer)]
#     # print(len(set(patches).intersection(set(dummy_file_patch_list))))
    
# tmp = aside()

In [None]:
# Evaluate BM25 followed by the BERT reranker over gold_df, without writing an output file.
# The trailing bare name displays the returned value.
bert_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=BM25_AGGR_STRAT, rerankers=[bert_reranker], gold_df=gold_df, overwrite_eval=False)
bert_eval

In [None]:
# Same evaluation as the previous cell, restricted to the first 10 rows of gold_df for a quick check.
model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=BM25_AGGR_STRAT, rerankers=[bert_reranker], gold_df=gold_df.head(10), overwrite_eval=False)

In [None]:
# output of code rerank on 10 gold

In [None]:
# Rerankers for the full pipeline: BERT reranker first, then the file-level code reranker
# (presumably applied in list order - confirm in evaluate_sampling).
rerankers = [bert_reranker, file_code_reranker]

# Evaluate the whole stack over gold_df; the trailing bare name displays the returned value.
fixed_file_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=BM25_AGGR_STRAT, rerankers=rerankers, gold_df=gold_df, overwrite_eval=False)
fixed_file_eval

In [None]:
100
bert
{'MAP': 0.1967,
 'P@1': 0.18,
 'P@10': 0.089,
 'P@20': 0.0665,
 'P@30': 0.053,
 'MRR': 0.2788,
 'R@1': 0.0614,
 'R@10': 0.2212,
 'R@100': 0.4599,
 'R@1000': 0.5801}


10 commits
maxp 
{'MAP': 0.2364,
 'P@1': 0.1,
 'P@10': 0.17,
 'P@20': 0.125,
 'P@30': 0.1,
 'MRR': 0.2842,
 'R@1': 0.0062,
 'R@10': 0.3876,
 'R@100': 0.6463,
 'R@1000': 0.674}

# Patch Code Reranker

In [53]:
# Patch-level reranker built from the same code_reranker_params dict
# (including any in-place changes made to it by earlier cells).
code_reranker = PatchCodeReranker(code_reranker_params)
# Checkpoint path relative to the notebook's directory; the commented variant omits the leading '../'.
# cur_best_model_path = 'data/2_7/facebook_react/models/X/best_model'
cur_best_model_path = '../data/2_7/facebook_react/models/X/best_model'

# Replace the reranker's model with the chosen checkpoint (single-output regression head)
# and move it to the reranker's device; the last expression also displays the model.
code_reranker.model = AutoModelForSequenceClassification.from_pretrained(cur_best_model_path, num_labels=1, problem_type='regression')
code_reranker.model.to(code_reranker.device)

Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 965.97 MB


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 250, 'psg_cnt': 25}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [54]:
# Rerank at patch level on top of the BERT-reranked results; the displayed output shows PatchResult items.
code_reranker_results = code_reranker.rerank_pipeline(dummy_train_query, bert_rerank_results)

In [55]:
code_reranker_results

[PatchResult(Operations.deleteValueForProperty(stubNode, 'title');
       expect(stubNode.getAttribute('title')).toBe(null);
       // JSDOM does not behave correctly for attributes/properties
       //expect(stubNode.title).toBe('');
     });
 
     it('should not remove attributes for special properties', () => {
       stubNode = document.createElement('input');
       ReactDOMComponentTree.precacheNode(stubInstance, stubNode);
 
       stubNode.setAttribute('value', 'foo');
 
       DOMPropertyOperations.deleteValueForProperty(stubNode, 'value');
       // JSDOM does not behave correctly for attributes/properties
       //expect(stubNode.getAttribute('value')).toBe('foo');
       expect(stubNode.value).toBe('');
     });
 
     it('should not leave all options selected when deleting multiple', () => {
       stubNode = document.createElement('select');
       ReactDOMComponentTree.precacheNode(stubInstance, stubNode);
 
       stubNode.multiple = true;
       stubNode.appendChild(d

In [57]:
dummy_file_patch_list[0]

"/**\n * Copyright (c) 2013-present, Facebook, Inc.\n *\n * This source code is licensed under the MIT license found in the\n * LICENSE file in the root directory of this source tree.\n *\n * @emails react-core\n */\n\n'use strict';\n\nvar emptyFunction = require('fbjs/lib/emptyFunction');\n\ndescribe('ReactDOMInput', () => {\n  var React;\n  var ReactDOM;\n  var ReactDOMServer;\n  var ReactTestUtils;\n  var setUntrackedValue;\n\n  function normalizeCodeLocInfo(str) {\n    return str && str.replace(/\\(at.+?:\\d+\\)/g, '(at **)');\n  }\n\n  function dispatchEventOnNode(node, type) {\n    node.dispatchEvent(new Event(type, {bubbles: true, cancelable: true}));\n  }\n\n  beforeEach(() => {\n    jest.resetModules();\n\n    setUntrackedValue = Object.getOwnPropertyDescriptor(\n      HTMLInputElement.prototype,\n      'value',\n    ).set;\n\n    React = require('react');\n    ReactDOM = require('react-dom');\n    ReactDOMServer = require('react-dom/server');\n    ReactTestUtils = require('re

In [60]:
code_reranker_results[0].passage

"Operations.deleteValueForProperty(stubNode, 'title');\n      expect(stubNode.getAttribute('title')).toBe(null);\n      // JSDOM does not behave correctly for attributes/properties\n      //expect(stubNode.title).toBe('');\n    });\n\n    it('should not remove attributes for special properties', () => {\n      stubNode = document.createElement('input');\n      ReactDOMComponentTree.precacheNode(stubInstance, stubNode);\n\n      stubNode.setAttribute('value', 'foo');\n\n      DOMPropertyOperations.deleteValueForProperty(stubNode, 'value');\n      // JSDOM does not behave correctly for attributes/properties\n      //expect(stubNode.getAttribute('value')).toBe('foo');\n      expect(stubNode.value).toBe('');\n    });\n\n    it('should not leave all options selected when deleting multiple', () => {\n      stubNode = document.createElement('select');\n      ReactDOMComponentTree.precacheNode(stubInstance, stubNode);\n\n      stubNode.multiple = true;\n      stubNode.appendChild(document.crea

In [None]:
code_reranker_results[0].passage in dummy_file_patch_list

In [None]:
# Patch-level evaluation: the gold set is the list of chunks from the gold files rather than file paths.
# Presumably eval_type='patch' compares result passages to these chunk strings - confirm in SearchEvaluator.evaluate.
evaluator.evaluate(code_reranker_results, dummy_file_patch_list, eval_type='patch')

In [None]:
{'MAP': 0.018,
 'P@1': 0.0,
 'P@10': 0.0,
 'P@20': 0.0,
 'P@30': 0.0,
 'MRR': 0.0085,
 'R@1': 0.0,
 'R@10': 0.0,
 'R@100': 0.0,
 'R@1000': 0.186}

In [None]:
# Number of distinct reranked passages whose text exactly equals one of the gold chunks.
len({result.passage for result in code_reranker_results} & set(dummy_file_patch_list))