In [1]:
import sys
sys.path.append('src')

In [29]:
import argparse
import os
import sys
from typing import List

import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
from utils import (
    AggregatedSearchResult,
    get_combined_df,
    prepare_triplet_data_from_df,
    sanity_check_triplets,
    set_seed,
    tokenize,
    get_recent_df
)
from BERTReranker_v4 import BERTReranker
from CodeReranker import BERTCodeReranker
# set seed
set_seed(42)

In [3]:
# print torch devices available
print('Available devices: ', torch.cuda.device_count())
print('Current cuda device: ', torch.cuda.current_device())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Available devices:  1
Current cuda device:  0
Quadro RTX 6000


In [19]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    index_path='smalldata/ftr/index_commit_tokenized', repo_path='smalldata/ftr', k=1000, n=100, no_bm25=True, model_path='microsoft/codebert-base', overwrite_cache=False, batch_size=32, num_epochs=10, learning_rate=5e-05, num_positives=10, num_negatives=10, train_depth=1000, num_workers=8, train_commits=1000, psg_cnt=5, aggregation_strategy='sump', use_gpu=True, rerank_depth=250, do_train=True, do_eval=True, eval_gold=True, openai_model='gpt4', overwrite_eval=False, sanity_check_triplets=False, debug=False, eval_before_training=False, do_combined_train=False, repo_paths=None, best_model_path=None
)

In [13]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']
repo_path = args.repo_path
repo_name = repo_path.split('/')[-1]
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'

In [7]:
eval_path = os.path.join(repo_path, 'eval')
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at smalldata/ftr/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [9]:
bm25_output_path = os.path.join(eval_path, f'bm25_baseline_N{n}_K{K}_metrics.txt')
print(f'BM25 output path: {bm25_output_path}')

bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=bm25_output_path, aggregation_strategy=BM25_AGGR_STRAT)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

BM25 output path: smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt


100%|██████████| 100/100 [00:23<00:00,  4.25it/s]

Evaluation results written to smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt
BM25 Baseline Evaluation
{'MAP': 0.1542, 'P@10': 0.087, 'P@100': 0.0267, 'P@1000': 0.0041, 'MRR': 0.2133, 'Recall@100': 0.5077, 'Recall@1000': 0.6845}





In [30]:
bert_params = {
        'model_name': args.model_path,
        'psg_cnt': args.psg_cnt,
        'aggregation_strategy': args.aggregation_strategy,
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': args.rerank_depth,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': BM25_AGGR_STRAT,
    }

In [20]:
params = {
        'model_name': args.model_path,
        'psg_cnt': 25,
        'aggregation_strategy': args.aggregation_strategy,
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': 100,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': BM25_AGGR_STRAT,
        'psg_len': 250,
        'psg_stride': 200,
    }

### Prepare training data

In [22]:
recent_df = get_recent_df(combined_df, repo_name=repo_name, ignore_gold_in_training=True)

        # Step 6: randomly sample 1500 rows from recent_df
recent_df = recent_df.sample(params['train_commits'])

Preparing training data...
Number of commits after midpoint date: 5804
Number of commits after filtering by commit message length: 1543
Gold commit file gold/ftr/ftr_gpt4_gold_commit_ids.txt does not exist, but ignore_gold_in_training is set to True, so continuing...


In [18]:
# prepare data first
if not os.path.exists(os.path.join(repo_path, 'cache')):
    os.makedirs(os.path.join(repo_path, 'cache'))
triplet_cache = os.path.join(repo_path, 'cache', 'triplet_data_cache.pkl')
diff_cache = os.path.join(repo_path, 'cache', 'diff_data.parquet')

In [14]:
def aside():
    tokenizer = AutoTokenizer.from_pretrained(params['model_name'])
    def tokenize(x):
        # tokenize with no max length
        return tokenizer.encode(x, add_special_tokens=True, truncation=False, max_length=None)
    combined_df.info()
    # print the average number of words in commit_message column

    # sample 100 rows from combined_df
    # sample_df = combined_df.sample(100, random_state=52)
    # sample_df = combined_df[:10000]

    avg_words = sample_df['commit_message'].str.split().str.len().mean()
    print(f'Average number of words in commit message (whitespace): {avg_words}')
    avg_words = sample_df['commit_message'].apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of words in commit message (AutoTokenizer): {avg_words}')

    # print approx number of tokens in passed to bert which is 2 * avg_words * 1.5
    approx_tokens = 2 * avg_words * 1.5
    print(f'Approx number of tokens passed to bert: {approx_tokens}')

    # print remaining number of tokens in bert (max is 512)
    print(f'Approx number of tokens remaining for code: {512 - approx_tokens}')

    # print average number of code tokens in diff column by using tokenize function but only on the non-null diff values
    avg_code_tokens = sample_df['diff'].dropna().apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of code tokens in diff column: {avg_code_tokens}')

    avg_file_tokens = sample_df['cur_file_content'].dropna().apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of code tokens in cur_file_content column: {avg_file_tokens}')

aside()

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73765 entries, 0 to 73764
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   owner                  73765 non-null  string
 1   repo_name              73765 non-null  string
 2   commit_date            73765 non-null  int64 
 3   commit_id              73765 non-null  string
 4   commit_message         73765 non-null  string
 5   file_path              73765 non-null  string
 6   previous_commit_id     73765 non-null  string
 7   previous_file_content  73765 non-null  string
 8   cur_file_content       73765 non-null  string
 9   diff                   58037 non-null  string
 10  status                 73765 non-null  object
 11  is_merge_request       73765 non-null  bool  
 12  file_extension         73765 non-null  object
dtypes: bool(1), int64(1), object(2), string(9)
memory usage: 6.8+ MB
Average number of words in commit message (whitespace): 

In [19]:
def test_code_prep(df, searcher, search_depth, num_positives, num_negatives):
    code_data = []
    print(f'Preparing code data from dataframe of size: {len(df)} with search_depth: {search_depth}')
    # for _, row in df.iterrows():
    total_positives, total_negatives = 0, 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        cur_positives = 0
        cur_negatives = 0
        commit_message = row['commit_message']
        actual_files_modified = row['actual_files_modified']

        agg_search_results = searcher.pipeline(commit_message, row['commit_date'], search_depth, 'sump', sort_contributing_result_by_date=True)

        for agg_result in agg_search_results:
            most_recent_search_result = agg_result.contributing_results[0]
            file_path = most_recent_search_result.file_path
            commit_id = most_recent_search_result.commit_id

            if file_path in actual_files_modified and cur_positives < num_positives:
                # this is a positive sample
                code_data.append((commit_message, file_path, commit_id, 1))
                cur_positives += 1
                total_positives += 1
            elif file_path not in actual_files_modified and cur_negatives < num_negatives:
                # this is a negative sample
                code_data.append((commit_message, file_path, commit_id, 0))
                cur_negatives += 1
                total_negatives += 1

            if cur_positives == num_positives and cur_negatives == num_negatives:
                break



        # go from top to bottom, first num_positives non-0 scores are positive samples and the next num_negatives are negative samples
        # for agg_result in agg_search_results:
        #     cur_commit_msg = agg_result.contributing_results[0].commit_message
        #     if cur_positives < num_positives and agg_result.score > 0:
        #         # meaning there is at least one file in the agg_result that is in actual_files_modified
        #         # pos_commits.append(agg_result)
        #         data.append((commit_message, cur_commit_msg, 1))
        #         cur_positives += 1
        #         pos_commit_ids.add(agg_result.commit_id)
        #     elif cur_negatives < num_negatives:
        #         # neg_commits.append(agg_result)
        #         data.append((commit_message, cur_commit_msg, 0))
        #         cur_negatives += 1
        #         neg_commit_ids.add(agg_result.commit_id)
        #     if cur_positives == num_positives and cur_negatives == num_negatives:
        #         break

        # assert len(pos_commit_ids.intersection(neg_commit_ids)) == 0, 'Positive and negative commit ids should not intersect'
        # print(f"Total positives: {cur_positives}, Total negatives: {cur_negatives}")
        # total_positives += cur_positives
        # total_negatives += cur_negatives

    # convert to pandas dataframe
    # data = pd.DataFrame(data, columns=['query', 'passage', 'label'])
    code_df = pd.DataFrame(code_data, columns=['query', 'file_path', 'commit_id', 'label'])
    # print distribution of labels
    print(f"Total positives: {total_positives}, Total negatives: {total_negatives}")
    # print percentage of positives and negatives
    denom = total_positives + total_negatives
    print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
    return code_df

In [34]:
code_df = test_code_prep(recent_df, bm25_searcher, params['train_depth'], params['num_positives'], params['num_negatives'])

Preparing code data from dataframe of size: 1500 with search_depth: 1000


100%|██████████| 1500/1500 [06:53<00:00,  3.63it/s]

Total positives: 5753, Total negatives: 15000
Percentage of positives: 0.2772129330699176, Percentage of negatives: 0.7227870669300824





In [92]:
code_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20753 entries, 0 to 20752
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      20753 non-null  object
 1   file_path  20753 non-null  object
 2   commit_id  20753 non-null  object
 3   label      20753 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 648.7+ KB


In [94]:
def process_diff_data(diff_data, df):
    # given diff_data, we want to use commit_id and file_path to get the diff from the df

    # first we need to get the diff from the df
    # we can use the commit_id and file_path to get the diff
    res_df = []
    null_rows = 0
    # for _, row in diff_data.iterrows():
    for _, row in tqdm(diff_data.iterrows(), total=len(diff_data)):
        commit_id = row['commit_id']
        file_path = row['file_path']
        # get the diff from the df
        diff = df[(df['commit_id'] == commit_id) & (df['file_path'] == file_path)]['cur_file_content']
        # check if diff is NA/NaN
        if diff.isnull().values.any():
            # if it is, then we can just skip this row
            null_rows += 1
            continue
        diff = diff.values[0]

        res_df.append((commit_id, file_path, row['query'], diff, row['label']))

    res_df = pd.DataFrame(res_df, columns=['commit_id', 'file_path', 'query', 'passage', 'label'])
    # make query and passage into strings and label into int
    res_df['query'] = res_df['query'].astype(str)
    res_df['passage'] = res_df['passage'].astype(str)
    res_df['label'] = res_df['label'].astype(int)
    print(f"Number of null rows: {null_rows}")
    return res_df

processed_diff_data = process_diff_data(code_df, combined_df)

100%|██████████| 20753/20753 [04:16<00:00, 80.97it/s]

Number of null rows: 0





In [95]:
processed_diff_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20753 entries, 0 to 20752
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   commit_id  20753 non-null  object
 1   file_path  20753 non-null  object
 2   query      20753 non-null  object
 3   passage    20753 non-null  object
 4   label      20753 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 810.8+ KB


In [97]:
def sanity_check_code(data):
    problems = 0
    for i, row in tqdm(data.iterrows(), total=len(data)):
        try:
            if row['label'] == 0:
                assert data[(data['query'] == row['query']) & (data['commit_id'] == row['commit_id']) & (data['file_path'] == row['file_path'])]['label'].values[0] == 0
            else:
                assert data[(data['query'] == row['query']) & (data['commit_id'] == row['commit_id']) & (data['file_path'] == row['file_path'])]['label'].values[0] == 1
        except AssertionError:
            print(f"Assertion failed at index {i}: {row}")
            # break  # Optional: break after the first failure, remove if you want to see all failures
            # remove the row with label 0

            if row['label'] == 0:
                problems += 1
                # data.drop(i, inplace=True)
                data = data.drop(i)
                # print(f"Dropped row at index {i}")

    print(f"Total number of problems in sanity check of training data: {problems}")
    return data

In [23]:
processed_code_df = pd.read_parquet(os.path.join(repo_path, 'cache', 'code_data.parquet'))

In [32]:
bert_reranker = BERTReranker(params)
save_model_name = params['model_name'].replace('/', '_')
# repo_name = 'facebook_react'
bert_best_model_path = os.path.join('2_7/facebook_react', 'models', f"{save_model_name}_bertrr_gpt_train", 'best_model')
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(bert_best_model_path)
bert_reranker.model.to(bert_reranker.device)
rerankers = [bert_reranker]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 953.46 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 25, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 250, 'psg_stride': 200}


## Reranking with training

In [38]:
# class BERTCodeReranker:
#     def __init__(self, parameters):
#         self.parameters = parameters
#         self.model_name = parameters['model_name']
#         self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
#         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
#         self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
#         self.model.to(self.device)

#         print(f'Using device: {self.device}')

#         # print GPU info
#         if torch.cuda.is_available() and parameters['use_gpu']:
#             print(f"Using GPU: {torch.cuda.get_device_name(0)}")
#             print(f'GPU Device Count: {torch.cuda.device_count()}')
#             print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


#         self.psg_len = parameters['psg_len']
#         self.psg_cnt = parameters['psg_cnt'] # how many contributing_results to use per file for reranking
#         self.psg_stride = parameters.get('psg_stride', self.psg_len)
#         self.aggregation_strategy = parameters['aggregation_strategy'] # how to aggregate the scores of the psg_cnt contributing_results
#         self.batch_size = parameters['batch_size'] # batch size for reranking efficiently
#         self.rerank_depth = parameters['rerank_depth']
#         self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model

#         print(f"Initialized Code File BERT reranker with parameters: {parameters}")


#     def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
#         """
#         Rerank the BM25 aggregated search results using BERT model scores.

#         query: The issue query string.
#         aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
#         """
#         # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
#         # print(f'Reranking {len(aggregated_results)} results')

#         self.model.eval()

#         query_passage_pairs, per_result_contribution = self.split_into_query_passage_pairs(query, aggregated_results)


#         # for agg_result in aggregated_results:
#         #     query_passage_pairs.extend(
#         #         (query, result.commit_message)
#         #         for result in agg_result.contributing_results[: self.psg_cnt]
#         #     )

#         if not query_passage_pairs:
#             print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
#             print(query, aggregated_results, self.psg_cnt)
#             return aggregated_results

#         # tokenize the query passage pairs
#         encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

#         # create tensors for the input ids, attention masks
#         input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
#         attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

#         # Create a dataloader for feeding the data to the model
#         dataset = TensorDataset(input_ids, attention_masks)
#         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False) # shuffle=False very important for reconstructing the results back into the original order

#         scores = self.get_scores(dataloader, self.model)

#         score_index = 0
#         # Now assign the scores to the aggregated results by mapping the scores to the contributing results
#         for i, agg_result in enumerate(aggregated_results):
#             # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
#             assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
#             end_index = score_index + per_result_contribution[i] # only use psg_cnt contributing_results
#             cur_passage_scores = scores[score_index:end_index]
#             score_index = end_index


#             # Aggregate the scores for the current aggregated result
#             agg_score = self.aggregate_scores(cur_passage_scores)
#             agg_result.score = agg_score  # Assign the aggregated score

#         assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

#         # Sort by the new aggregated score
#         aggregated_results.sort(key=lambda res: res.score, reverse=True)

#         return aggregated_results

#     def get_scores(self, dataloader, model):
#         scores = []
#         with torch.no_grad():
#             for batch in dataloader:
#                 # Unpack the batch and move it to GPU
#                 b_input_ids, b_attention_mask = batch
#                 b_input_ids = b_input_ids.to(self.device)
#                 b_attention_mask = b_attention_mask.to(self.device)

#                 # Get scores from the model
#                 outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
#                 scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
#         return scores

#     def aggregate_scores(self, passage_scores):
#         """
#         Aggregate passage scores based on the specified strategy.
#         """
#         if len(passage_scores) == 0:
#             return 0.0

#         if self.aggregation_strategy == 'firstp':
#             return passage_scores[0]
#         if self.aggregation_strategy == 'maxp':
#             return max(passage_scores)
#         if self.aggregation_strategy == 'avgp':
#             return sum(passage_scores) / len(passage_scores)
#         if self.aggregation_strategy == 'sump':
#             return sum(passage_scores)
#         # else:
#         raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")


#     def split_into_query_passage_pairs(self, query, aggregated_results):
#         # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
#         def full_tokenize(s):
#             return self.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()
#         query_passage_pairs = []
#         per_result_contribution = []
#         for agg_result in aggregated_results:
#             agg_result.contributing_results.sort(key=lambda res: res.commit_date, reverse=True)
#             # get most recent file version
#             most_recent_search_result = agg_result.contributing_results[0]
#             # get the file_path and commit_id
#             file_path = most_recent_search_result.file_path
#             commit_id = most_recent_search_result.commit_id
#             # get the file content from combined_df
#             file_content = combined_df[(combined_df['commit_id'] == commit_id) & (combined_df['file_path'] == file_path)]['cur_file_content'].values[0]

#             # assert file_content is not None, f'file_content is None for commit_id: {commit_id} and file_path: {file_path}'

#             # now need to split this file content into psg_cnt passages
#             # first tokenize the file content
#             # check if file_content is pd.NA
#             query_tokens = full_tokenize(query)
#             path_tokens = full_tokenize(file_path)
#             if pd.isna(file_content):
#                 # query_passage_pairs.extend((query, file_path))
#                 # per_result_contribution.append(1)
#                 # continue
#                 file_content = ''
#             file_tokens = full_tokenize(file_content)



#             # now split the file content into psg_cnt passages
#             cur_result_passages = []
#             # get the input ids
#             # input_ids = file_content['input_ids'].squeeze()
#             # get the number of tokens in the file content
#             total_tokens = len(file_tokens)

#             for cur_start in range(0, total_tokens, self.psg_stride):
#                 cur_passage = []
#                 # add query tokens and path tokens
#                 cur_passage.extend(query_tokens)
#                 cur_passage.extend(path_tokens)

#                 # add the file tokens
#                 cur_passage.extend(file_tokens[cur_start:cur_start+self.psg_len])

#                 # now convert cur_passage into a string
#                 cur_passage_decoded = self.tokenizer.decode(cur_passage)

#                 # add the cur_passage to cur_result_passages
#                 cur_result_passages.append(cur_passage_decoded)

#                 if len(cur_result_passages) == self.psg_cnt:
#                     break

#             # now add the query, passage pairs to query_passage_pairs
#             per_result_contribution.append(len(cur_result_passages))
#             query_passage_pairs.extend((query, passage) for passage in cur_result_passages)
#         return query_passage_pairs, per_result_contribution

#     def rerank_pipeline(self, query, aggregated_results):
#         if len(aggregated_results) == 0:
#             return aggregated_results
#         top_results = aggregated_results[:self.rerank_depth]
#         bottom_results = aggregated_results[self.rerank_depth:]
#         reranked_results = self.rerank(query, top_results)
#         min_top_score = reranked_results[-1].score
#         # now adjust the scores of bottom_results
#         for i, result in enumerate(bottom_results):
#             result.score = min_top_score - i - 1
#         # combine the results
#         reranked_results.extend(bottom_results)
#         assert(len(reranked_results) == len(aggregated_results))
#         return reranked_results



In [34]:
code_reranker = BERTCodeReranker(params)
# code_reranker.rerank_depth = 100
# rerankers = [bert_reranker, code_reranker]
rerankers = [code_reranker]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 953.46 MB
Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 25, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 250, 'psg_stride': 200}


In [145]:
from calendar import c


def prep_line(line):
        return line.rstrip().lstrip()

def parse_diff(diff):
    return [
        line[1:] if line.startswith('+') else line
        for line in diff.split('\n')
        if not (line.startswith('-') or len(line) == 0 or (line.startswith('@@') and line.count('@@') > 1))
        and len(prep_line(line)) > 0
    ]

def prepare_code_triplets(diff_data, code_reranker, cache_file, overwrite=False):

    if cache_file and os.path.exists(cache_file) and not overwrite:
        print(f"Loading data from cache file: {cache_file}")
        # with open(cache_file, 'rb') as file:
        #     return pickle.load(file)
        return pd.read_parquet(cache_file)

    def full_tokenize(s):
        return code_reranker.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()



    def count_matching_lines(passage_lines, diff_lines):
        # Create a 2D array to store the lengths of the longest common subsequences
        dp = [[0] * (len(diff_lines) + 1) for _ in range(len(passage_lines) + 1)]

        # Fill the dp array
        for i in range(1, len(passage_lines) + 1):
            for j in range(1, len(diff_lines) + 1):
                if prep_line(passage_lines[i - 1]) == prep_line(diff_lines[j - 1]):
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[-1][-1]

    triplets = []

    for _, row in tqdm(diff_data.iterrows(), total=len(diff_data)):
        file_tokens = full_tokenize(row['passage'])
        total_tokens = len(file_tokens)
        cur_diff = combined_df[(combined_df['commit_id'] == row['commit_id']) & (combined_df['file_path'] == row['file_path'])]['diff'].values[0]
        if pd.isna(cur_diff):
            # if diff is NA/NaN
            continue
        cur_diff_lines = parse_diff(cur_diff)
        cur_psg_cnt = 0
        cur_triplets = []
        for cur_start in range(0, total_tokens, code_reranker.psg_stride):
            cur_passage = []

            cur_passage.extend(file_tokens[cur_start:cur_start+code_reranker.psg_len])

            # now convert cur_passage into a string
            cur_passage_decoded = code_reranker.tokenizer.decode(cur_passage)

            cur_passage_lines = cur_passage_decoded.split('\n')

            # check if there are lines matching the diff lines
            # if there are, then we can add this directly to the triplets
            # common_lines = set(cur_passage_lines).intersection(set(cur_diff_lines))
            common_line_count = count_matching_lines(cur_passage_lines, cur_diff_lines)

            # add the cur_passage to cur_result_passages
            cur_triplets.append((common_line_count, (row['query'], row['file_path'], cur_passage_decoded, row['label'])))

        # sort the cur_triplets by the number of common lines
        cur_triplets.sort(key=lambda x: x[0], reverse=True)
        # now add the top code_reranker.psg_cnt to triplets
        for triplet in cur_triplets[:code_reranker.psg_cnt]:
            # print(f"Found {triplet[0]} matching lines for diff in cur_passage at index")
            triplets.append(triplet[1])


    # convert to pandas dataframe
    triplets = pd.DataFrame(triplets, columns=['query', 'file_path', 'passage', 'label'])
    if cache_file:
        # with open(cache_file, 'wb') as file:
        #     pickle.dump(triplets, file)
        #     print(f"Saved data to cache file: {cache_file}")
        print(f"Saving data to cache file: {cache_file}")
        triplets.to_parquet(cache_file)
    return triplets

In [57]:
def prepare_code_triplets(diff_data, code_reranker, cache_file, overwrite=False):
    # given diff_data, the passage column is way too long. We need to split it into passages of length psg_len with stride psg_stride
    # then we can create triplets from that

    # diff_data has columns: commit_id, file_path, query, passage, label

    # if cache_file and os.path.exists(cache_file) and not overwrite:
    #     print(f"Loading data from cache file: {cache_file}")
    #     # with open(cache_file, 'rb') as file:
    #     #     return pickle.load(file)
    #     return pd.read_parquet(cache_file)

    def full_tokenize(s):
        return code_reranker.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()

    triplets = []

    for _, row in tqdm(diff_data.iterrows(), total=len(diff_data)):
        file_tokens = full_tokenize(row['passage'])
        total_tokens = len(file_tokens)
        # cur_diff = combined_df[(combined_df['commit_id'] == row['commit_id']) & (combined_df['file_path'] == row['file_path'])]['diff'].values[0]
        cur_psg_cnt = 0
        for cur_start in range(0, total_tokens, code_reranker.psg_stride):
            cur_passage = []

            cur_passage.extend(file_tokens[cur_start:cur_start+code_reranker.psg_len])

            # now convert cur_passage into a string
            cur_passage_decoded = code_reranker.tokenizer.decode(cur_passage)


            # add the cur_passage to cur_result_passages
            triplets.append((row['query'], row['file_path'], cur_passage_decoded, row['label']))

            cur_psg_cnt += 1

            if cur_psg_cnt == code_reranker.psg_cnt:
                break
        break

    # convert to pandas dataframe
    triplets = pd.DataFrame(triplets, columns=['query', 'file_path', 'passage', 'label'])
    # Write data to cache file
    # if cache_file:
    #     # with open(cache_file, 'wb') as file:
    #     #     pickle.dump(triplets, file)
    #     #     print(f"Saved data to cache file: {cache_file}")
    #     print(f"Saving data to cache file: {cache_file}")
    #     triplets.to_parquet(cache_file)
    return triplets

In [149]:
triplet_cache = 'smalldata/ftr/cache/diff_code_triplets.parquet'

In [143]:
processed_code_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20753 entries, 0 to 20752
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   commit_id  20753 non-null  object
 1   file_path  20753 non-null  object
 2   query      20753 non-null  object
 3   passage    20753 non-null  object
 4   label      20753 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 810.8+ KB


In [147]:
triplets = prepare_code_triplets(processed_code_df.head(5000), code_reranker, triplet_cache, overwrite=True)

100%|██████████| 5000/5000 [58:23<00:00,  1.43it/s]   


Saving data to cache file: smalldata/ftr/cache/diff_code_triplets.pkl


In [148]:
triplets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97267 entries, 0 to 97266
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      97267 non-null  object
 1   file_path  97267 non-null  object
 2   passage    97267 non-null  object
 3   label      97267 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 3.0+ MB


In [150]:
triplets.to_parquet(triplet_cache)

In [127]:
# distribution of labels (number of 0s and 1s)
triplets['label'].value_counts()

label
0    176
1     25
Name: count, dtype: int64

In [119]:
# def aside():
#     df = prepare_code_triplets(processed_code_df, code_reranker, None, False)
#     df.info()
#     print(df.head())

# aside()

In [64]:
triplets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 666907 to 359581
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   query      100000 non-null  object
 1   file_path  100000 non-null  object
 2   passage    100000 non-null  object
 3   label      100000 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


In [115]:
def aside2():
    tokenizer = AutoTokenizer.from_pretrained(params['model_name'])
    def tokenize(x):
        # tokenize with no max length
        return tokenizer.encode(x, add_special_tokens=False, truncation=False, max_length=None)
    # print the average number of words in commit_message column

    # sample 100 rows from combined_df
    # sample_df = combined_df.sample(100, random_state=52)
    # sample_df = combined_df[:10000]
    sample_df = diff_data
    avg_words = sample_df['query'].apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of tokens in query: {avg_words}')

    avg_path = sample_df['file_path'].apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of tokens in path: {avg_path}')


    # print remaining number of tokens in bert (max is 512)
    print(f'Avg number of tokens remaining for code: {512 - avg_words - avg_path}')

    avg_file_tokens = sample_df['passage'].dropna().apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of code tokens in cur_file_content column: {avg_file_tokens}')

aside2()

Token indices sequence length is longer than the specified maximum sequence length for this model (632 > 512). Running this sequence through the model will result in indexing errors


Average number of tokens in query: 237.96949838577555
Average number of tokens in path: 18.898038837758396
Avg number of tokens remaining for code: 255.13246277646607
Average number of code tokens in cur_file_content column: 17368.505324531394


In [None]:
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=None, aggregation_strategy=params['aggregation_strategy']))
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=[bert_reranker], aggregation_strategy=params['aggregation_strategy']))
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:02<00:00,  3.72it/s]


{'MAP': 0.1859, 'P@10': 0.08, 'P@100': 0.028, 'P@1000': 0.0043, 'MRR': 0.2331, 'Recall@100': 0.4443, 'Recall@1000': 0.5752}


100%|██████████| 10/10 [00:56<00:00,  5.62s/it]


{'MAP': 0.1676, 'P@10': 0.08, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.2541, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}


100%|██████████| 10/10 [01:36<00:00,  9.63s/it]

{'MAP': 0.1038, 'P@10': 0.07, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.1732, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}





In [173]:
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:49<00:00, 10.91s/it]

{'MAP': 0.2113, 'P@10': 0.12, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.3317, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}





In [48]:
print(model_evaluator.evaluate_sampling(n=100, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [17:34<00:00, 10.55s/it]

{'MAP': 0.1807, 'P@10': 0.117, 'P@100': 0.0295, 'P@1000': 0.0041, 'MRR': 0.2545, 'Recall@100': 0.527, 'Recall@1000': 0.6845}





In [14]:
save_model_name = params['model_name'].replace('/', '_')
# hf_output_dir = os.path.join('smalldata', 'ftr', f'100k_code_{save_model_name}_model_output')
hf_output_dir = os.path.join(repo_path, 'models', f'code_{save_model_name}_model_output')
save_model_name, hf_output_dir

('microsoft_codebert-base',
 '2_7/apache_kafka/models/code_microsoft_codebert-base_model_output')

In [69]:
def do_training(triplet_data, reranker, hf_output_dir, args):
    def tokenize_hf(example):
        len(example)
        return reranker.tokenizer(example['query'], example['passage'], truncation=True, padding='max_length', max_length=reranker.max_seq_length, return_tensors='pt', add_special_tokens=True)


    # triplet_data = triplet_data.sample(1000, random_state=42)
    print('Training the model...')
    print('Label distribution:')
    print(triplet_data['label'].value_counts())

    # merge columns file_path and passage into one column called passage
    triplet_data['passage'] = triplet_data['file_path'] + ' ' + triplet_data['passage']

    # if args.sanity_check:
    #     print('Running sanity check on training data...')
    #     triplet_data = sanity_check(triplet_data)
    # Step 7: convert triplet_data to HuggingFace Dataset
    # convert triplet_data to HuggingFace Dataset
    triplet_data['label'] = triplet_data['label'].astype(float)
    train_df, val_df = train_test_split(triplet_data, test_size=0.2, random_state=42, stratify=triplet_data['label'])
    train_hf_dataset = HFDataset.from_pandas(train_df, split='train') # type: ignore
    val_hf_dataset = HFDataset.from_pandas(val_df, split='validation') # type: ignore
    # Step 8: tokenize the data
    tokenized_train_dataset = train_hf_dataset.map(tokenize_hf, batched=True)
    tokenized_val_dataset = val_hf_dataset.map(tokenize_hf, batched=True)

    # Step 9: set format for pytorch
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(['query', 'passage', 'file_path'])
    tokenized_val_dataset = tokenized_val_dataset.remove_columns(['query', 'passage', 'file_path'])

    # rename label column to labels
    tokenized_train_dataset = tokenized_train_dataset.rename_column('label', 'labels')
    tokenized_val_dataset = tokenized_val_dataset.rename_column('label', 'labels')

    # set format to pytorch
    tokenized_train_dataset = tokenized_train_dataset.with_format('torch')
    tokenized_val_dataset = tokenized_val_dataset.with_format('torch')
    print('Training dataset features:')
    print(tokenized_train_dataset.features)

    # Step 10: set up training arguments
    train_args = TrainingArguments(
        output_dir=hf_output_dir,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=args.num_epochs,
        metric_for_best_model='eval_loss',
        load_best_model_at_end=True,
        save_total_limit=2,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        logging_steps=100,
        fp16=True,
        dataloader_num_workers=args.num_workers,
        )

    # small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(100))
    # small_val_dataset = tokenized_val_dataset.shuffle(seed=42).select(range(100))

    # if args.debug:
    #     print('Running in debug mode, using small datasets')
    #     tokenized_train_dataset = small_train_dataset
    #     tokenized_val_dataset = small_val_dataset

    # Step 11: set up trainer
    trainer = Trainer(
        model = reranker.model,
        args = train_args,
        train_dataset = tokenized_train_dataset, # type: ignore
        eval_dataset = tokenized_val_dataset, # type: ignore
        # compute_metrics=compute_metrics,
    )

    # Step 12: train the model
    trainer.train()

    # Step 13: save the model
    best_model_path = os.path.join(hf_output_dir, 'best_model')
    trainer.save_model(best_model_path)
    print(f'Saved model to {best_model_path}')
    print('Training complete')

In [67]:
args.sanity_check = False

In [70]:
do_training(triplets, code_reranker, hf_output_dir, args)

Training the model...
Label distribution:
label
0.0    76703
1.0    23297
Name: count, dtype: int64


Map: 100%|██████████| 80000/80000 [00:41<00:00, 1934.68 examples/s]
Map: 100%|██████████| 20000/20000 [00:10<00:00, 1918.61 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training dataset features:
{'labels': Value(dtype='float64', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,0.0771,0.064053
2,0.0396,0.034442
3,0.0235,0.023169


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [40]:
code_reranker.model = AutoModelForSequenceClassification.from_pretrained(os.path.join(repo_path, 'models', f"code_{save_model_name}_model_output", 'best_model'))
code_reranker.psg_cnt = 25
code_reranker.model.to(code_reranker.device)
rerankers = [bert_reranker, code_reranker]

In [41]:
model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], rerankers=rerankers, overwrite_eval=args.overwrite_eval)



  0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (98775 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [1:15:51<00:00, 45.52s/it]


{'MAP': 0.2703,
 'P@10': 0.119,
 'P@100': 0.034,
 'P@1000': 0.0053,
 'MRR': 0.4074,
 'Recall@100': 0.5517,
 'Recall@1000': 0.7426}

## Evaluate on Gold

In [48]:
gold_df.iloc[0]['transformed_message_gpt3']

'There are several issues that need to be addressed to enhance the user experience:\n\n1. The modals currently do not have a shadow, affecting visibility and overall aesthetic of the UI.\n2. The default setting of "collapse new nodes" option is currently enabled, which may not be the most user-friendly approach.\n3. The label "Collapse newly added components by default" may confuse users, a clearer phrasing would help understanding.\n4. The CSS media query for the settings popup is currently not optimized for smaller sizes, resulting in labels being hidden.\n5. The "Inspect the matching DOM element" button is present in standalone mode, despite not serving any functional purpose.\n6. There is a size issue with the settings icon, it\'s currently at 20x20 viewbox instead of the intended 24x24.\n7. There is a bug where "window.addEventListener" and "window.removeEventListener" are not defined in Hermes, causing operation failure.'

In [49]:
gold_dir = os.path.join('gold', 'facebook_react')
gold_data_path = os.path.join(gold_dir, f'{repo_name}_{args.openai_model}_gold.csv')
print(f'Model: {args.openai_model}')
gold_df = pd.read_csv(gold_data_path)
assert gold_df[f'transformed_message_{args.openai_model}'].notnull().all()
# rename commit_message to original_message
gold_df = gold_df.rename(columns={'commit_message': 'original_message'})
# rename transformed_message to commit_message
gold_df = gold_df.rename(columns={f'transformed_message_{args.openai_model}': 'commit_message'})
print(f'Found gold data for {repo_name} with shape {gold_df.shape} at {gold_data_path}')
print(gold_df.info())

Model: gpt4
Found gold data for facebook_react with shape (100, 5) at gold/facebook_react/facebook_react_gpt4_gold.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              100 non-null    object
 1   commit_date            100 non-null    int64 
 2   original_message       100 non-null    object
 3   actual_files_modified  100 non-null    object
 4   commit_message         100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB
None


In [50]:
print('Running BM25 on gold data...')
# bm25_gold_output_path = os.path.join(eval_path, f'bm25_v2_{args.openai_model}_gold_metrics.txt')
bm25_gold_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['bm25_aggr_strategy'], gold_df=gold_df, overwrite_eval=args.overwrite_eval)
print("BM25 Gold Evaluation")
print(bm25_gold_eval)

Running BM25 on gold data...
Found gold_df, evaluating on 100 commits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              100 non-null    object
 1   commit_date            100 non-null    int64 
 2   original_message       100 non-null    object
 3   actual_files_modified  100 non-null    object
 4   commit_message         100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB
None


100%|██████████| 100/100 [00:28<00:00,  3.54it/s]

BM25 Gold Evaluation
{'MAP': 0.1223, 'P@10': 0.055, 'P@100': 0.0186, 'P@1000': 0.0026, 'MRR': 0.1908, 'Recall@100': 0.3617, 'Recall@1000': 0.5439}





In [51]:
rerankers

[<BERTReranker_v4.BERTReranker at 0x7ef810f3bc40>,
 <__main__.BERTCodeReranker at 0x7ef93dff1f40>]

In [72]:
print('Running BERT on gold data...')
# bert_gold_output_path = os.path.join(eval_path, f'bert_v2_{args.openai_model}_gold.txt')
bert_gold_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], rerankers=rerankers, gold_df=gold_df, overwrite_eval=args.overwrite_eval)

print("BERT Gold Evaluation")
print(bert_gold_eval)

Running BERT on gold data...
Found gold_df, evaluating on 100 commits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              100 non-null    object
 1   commit_date            100 non-null    int64 
 2   original_message       100 non-null    object
 3   actual_files_modified  100 non-null    object
 4   commit_message         100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB
None


  0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2127 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [52:50<00:00, 31.71s/it]

BERT Gold Evaluation
{'MAP': 0.1821, 'P@10': 0.085, 'P@100': 0.0186, 'P@1000': 0.0026, 'MRR': 0.2484, 'Recall@100': 0.3617, 'Recall@1000': 0.5439}





In [None]:
0.062 MAP

In [77]:
diff_data['commit_id'].nunique()

831

In [78]:
# find intersection of commit ids between diff_data and gold_df
diff_commit_ids = set(diff_data['commit_id'].unique())
gold_commit_ids = set(gold_df['commit_id'].unique())

print(f"Number of commit ids in diff_data: {len(diff_commit_ids)}")
print(f"Number of commit ids in gold_df: {len(gold_commit_ids)}")

print(f"Number of commit ids in both diff_data and gold_df: {len(diff_commit_ids.intersection(gold_commit_ids))}")

Number of commit ids in diff_data: 831
Number of commit ids in gold_df: 100
Number of commit ids in both diff_data and gold_df: 33


In [105]:
def fix_old_parquet():
    repo_name = 'angular_angular'
    repo_path = os.path.join('gold', repo_name)
    # csv_file = os.path.join(repo_path, f'v2_{repo_name}_gpt4_gold.csv')
    parquet_file = os.path.join(repo_path, f'v2_{repo_name}_gpt4_gold.parquet')

    # if there is a column called transformed_message_gpt3, then we need to fix the parquet file by renaming it to transformed_message_gpt4

    parquet_df = pd.read_parquet(parquet_file)
    print(parquet_df.info())

    if 'transformed_message_gpt3' in parquet_df.columns:
        print('Found transformed_message_gpt3 column in parquet file')
        # rename it to transformed_message_gpt4
        parquet_df = parquet_df.rename(columns={'transformed_message_gpt3': 'transformed_message_gpt4'})
        print(parquet_df.info())
        # now save it back to the parquet file
        # parquet_df.to_parquet(parquet_file)
        print('Saved parquet file')

#     v2_csv_file = os.path.join(repo_path, f'v2_{repo_name}_gpt4_gold.csv')
#     v2_parquet_file = os.path.join(repo_path, f'v2_{repo_name}_gpt4_gold.parquet')

#     csv_df = pd.read_csv(csv_file)
#     parquet_df = pd.read_parquet(parquet_file)

#     v2_csv_df = pd.read_csv(v2_csv_file)
#     v2_parquet_df = pd.read_parquet(v2_parquet_file)

#     # ensure commit ids in both csv and parquet are the same
#     csv_commit_ids = set(csv_df['commit_id'].unique())
#     parquet_commit_ids = set(parquet_df['commit_id'].unique())

#     print(f'Common commit ids: {len(csv_commit_ids.intersection(parquet_commit_ids))}')

#     assert csv_commit_ids == parquet_commit_ids, 'Commit ids in csv and parquet are not the same'

#     # ensure commit ids in both v2 csv and v2 parquet are the same
#     v2_csv_commit_ids = set(v2_csv_df['commit_id'].unique())
#     v2_parquet_commit_ids = set(v2_parquet_df['commit_id'].unique())

#     print(f'Common commit ids: {len(v2_csv_commit_ids.intersection(v2_parquet_commit_ids))}')

#     assert v2_csv_commit_ids == v2_parquet_commit_ids, 'Commit ids in v2 csv and v2 parquet are not the same'

#     # ensure commit ids in both csv and v2 csv are the same

#     print(f'Common commit ids: {len(csv_commit_ids.intersection(v2_csv_commit_ids))}')

#     assert csv_commit_ids == v2_csv_commit_ids, 'Commit ids in csv and v2 csv are not the same'

#     # only now store all commit ids in in a file called facebook_react_gpt4_gold_commit_ids.txt with each commit id on a new line

#     with open(os.path.join(repo_path, f'{repo_name}_gpt4_gold_commit_ids.txt'), 'w') as f:
#         for commit_id in csv_df['commit_id'].unique():
#             f.write(f'{commit_id}\n')


# get_gold_commits()
fix_old_parquet()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   commit_id                 100 non-null    string
 1   commit_date               100 non-null    int64 
 2   commit_message            100 non-null    string
 3   actual_files_modified     100 non-null    object
 4   transformed_message_gpt4  100 non-null    object
dtypes: int64(1), object(2), string(2)
memory usage: 4.0+ KB
None
