In [1]:
import sys
sys.path.append('src')

In [2]:
import argparse
import os
import sys
from typing import List

import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
from utils import (
    AggregatedSearchResult,
    get_combined_df,
    prepare_triplet_data_from_df,
    sanity_check_triplets,
    set_seed,
    tokenize
)
from BERTReranker_v4 import BERTReranker
# set seed
set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# print torch devices available
print('Available devices: ', torch.cuda.device_count())
print('Current cuda device: ', torch.cuda.current_device())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Available devices:  1
Current cuda device:  0
Quadro RTX 6000


In [4]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    index_path='smalldata/ftr/index_commit_tokenized', repo_path='smalldata/ftr', k=1000, n=100, no_bm25=True, model_path='microsoft/graphcodebert-base', overwrite_cache=False, batch_size=32, num_epochs=10, learning_rate=5e-05, num_positives=10, num_negatives=10, train_depth=1000, num_workers=8, train_commits=1500, psg_cnt=5, aggregation_strategy='sump', use_gpu=True, rerank_depth=250, do_train=True, do_eval=True, eval_gold=True, openai_model='gpt4', overwrite_eval=False, sanity_check_triplets=False, debug=False, eval_before_training=False, do_combined_train=False, repo_paths=None, best_model_path=None
)

In [5]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']
repo_path = args.repo_path
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'

In [6]:
eval_path = os.path.join(repo_path, 'eval')
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at smalldata/ftr/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [7]:
bm25_output_path = os.path.join(eval_path, f'bm25_baseline_N{n}_K{K}_metrics.txt')
print(f'BM25 output path: {bm25_output_path}')

bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=bm25_output_path, aggregation_strategy=BM25_AGGR_STRAT)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

BM25 output path: smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt
Output file smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt already exists, set overwrite_eval flag to False, skipping...
Model Name: BM25Searcher
Sample Size: 100
Evaluation Metrics:
MAP: 0.1542
P@10: 0.087
P@100: 0.0267
P@1000: 0.0041
MRR: 0.2133
Recall@100: 0.5077
Recall@1000: 0.6845

BM25 Baseline Evaluation
None


In [8]:
params = {
        'model_name': args.model_path,
        'psg_cnt': args.psg_cnt,
        'aggregation_strategy': args.aggregation_strategy,
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': args.rerank_depth,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': BM25_AGGR_STRAT,
    }

### Prepare training data

In [9]:
# Prepare the data for training
print('Preparing training data...')
# Step 1: Filter out only the columns we need
filtered_df = combined_df[['commit_date', 'commit_message', 'commit_id', 'file_path', 'diff']]

# Step 2: Group by commit_id
grouped_df = filtered_df.groupby(['commit_id', 'commit_date', 'commit_message'])['file_path'].apply(list).reset_index()
grouped_df.rename(columns={'file_path': 'actual_files_modified'}, inplace=True)

# Step 3: Determine midpoint and filter dataframe
midpoint_date = np.median(grouped_df['commit_date'])
recent_df = grouped_df[grouped_df['commit_date'] > midpoint_date]
print(f'Number of commits after midpoint date: {len(recent_df)}')

# Step 4: Filter out commits with less than average length commit messages
average_commit_len = recent_df['commit_message'].str.split().str.len().mean()
# filter out commits with less than average length
recent_df = recent_df[recent_df['commit_message'].str.split().str.len() > average_commit_len] # type: ignore
print(f'Number of commits after filtering by commit message length: {len(recent_df)}')

# Step 5: randomly sample 1500 rows from recent_df
recent_df = recent_df.sample(params['train_commits'])
print(f'Number of commits after sampling: {len(recent_df)}')

Preparing training data...
Number of commits after midpoint date: 5804
Number of commits after filtering by commit message length: 1543
Number of commits after sampling: 1500


In [10]:
# prepare data first
if not os.path.exists(os.path.join(repo_path, 'cache')):
    os.makedirs(os.path.join(repo_path, 'cache'))
triplet_cache = os.path.join(repo_path, 'cache', 'triplet_data_cache.pkl')
diff_cache = os.path.join(repo_path, 'cache', 'diff_data.parquet')

In [79]:
def aside():
    tokenizer = AutoTokenizer.from_pretrained(params['model_name'])
    def tokenize(x):
        # tokenize with no max length
        return tokenizer.encode(x, add_special_tokens=True, truncation=False, max_length=None)
    combined_df.info()
    # print the average number of words in commit_message column

    # sample 100 rows from combined_df
    # sample_df = combined_df.sample(100, random_state=52)
    sample_df = combined_df

    avg_words = sample_df['commit_message'].str.split().str.len().mean()
    print(f'Average number of words in commit message (whitespace): {avg_words}')
    avg_words = sample_df['commit_message'].apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of words in commit message (AutoTokenizer): {avg_words}')

    # print approx number of tokens in passed to bert which is 2 * avg_words * 1.5
    approx_tokens = 2 * avg_words * 1.5
    print(f'Approx number of tokens passed to bert: {approx_tokens}')

    # print remaining number of tokens in bert (max is 512)
    print(f'Approx number of tokens remaining for code: {512 - approx_tokens}')

    # print average number of code tokens in diff column by using tokenize function but only on the non-null diff values
    avg_code_tokens = sample_df['diff'].dropna().apply(lambda x: len(tokenize(x))).mean()
    print(f'Average number of code tokens in diff column: {avg_code_tokens}')

aside()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73765 entries, 0 to 73764
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   owner                  73765 non-null  string
 1   repo_name              73765 non-null  string
 2   commit_date            73765 non-null  int64 
 3   commit_id              73765 non-null  string
 4   commit_message         73765 non-null  string
 5   file_path              73765 non-null  string
 6   previous_commit_id     73765 non-null  string
 7   previous_file_content  73765 non-null  string
 8   cur_file_content       73765 non-null  string
 9   diff                   58037 non-null  string
 10  status                 73765 non-null  object
 11  is_merge_request       73765 non-null  bool  
 12  file_extension         73765 non-null  object
dtypes: bool(1), int64(1), object(2), string(9)
memory usage: 6.8+ MB


Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


Average number of words in commit message (whitespace): 60.66661695926252
Average number of words in commit message (AutoTokenizer): 105.2017759099844
Approx number of tokens passed to bert: 315.6053277299532
Approx number of tokens remaining for code: 196.39467227004678
Average number of code tokens in diff column: 775.7933559625756


In [None]:
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Average number of words in commit message (whitespace): 60.66661695926252
Average number of words in commit message (AutoTokenizer): 105.2017759099844
Approx number of tokens passed to bert: 315.6053277299532
Approx number of tokens remaining for code: 196.39467227004678
Average number of code tokens in diff column: 775.7933559625756

In [26]:
# def temp_prep(df, searcher, search_depth, num_positives, num_negatives):

#     data = []
#     print(f'Preparing data from dataframe of size: {len(df)} with search_depth: {search_depth}')
#     total_positives, total_negatives = 0, 0
#     for _, row in df.iterrows():
#     # for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):

#         cur_positives = 0
#         cur_negatives = 0
#         pos_commit_ids = set()
#         neg_commit_ids = set()
#         commit_message = row['commit_message']
#         actual_files_modified = row['actual_files_modified']

#         agg_search_results = searcher.pipeline(commit_message, row['commit_date'], search_depth, 'sump', aggregate_on='commit')

#         # for each agg_result, find out how many files it has edited are in actual_files_modified and sort by score

#         for agg_result in agg_search_results:
#             agg_result_files = set([result.file_path for result in agg_result.contributing_results])
#             intersection = agg_result_files.intersection(actual_files_modified)
#             # TODO maybe try this for training
#             agg_result.score = len(intersection) / len(agg_result_files) # how focused the commit is
#             # agg_result.score = len(intersection)

#         agg_search_results.sort(key=lambda res: res.score, reverse=True)

#         # go from top to bottom, first num_positives non-0 scores are positive samples and the next num_negatives are negative samples
#         for agg_result in agg_search_results:
#             cur_commit_msg = agg_result.contributing_results[0].commit_message
#             if cur_positives < num_positives and agg_result.score > 0:
#                 # meaning there is at least one file in the agg_result that is in actual_files_modified
#                 # pos_commits.append(agg_result)
#                 data.append((commit_message, cur_commit_msg, 1))
#                 cur_positives += 1
#                 pos_commit_ids.add(agg_result.commit_id)
#             elif cur_negatives < num_negatives:
#                 # neg_commits.append(agg_result)
#                 data.append((commit_message, cur_commit_msg, 0))
#                 cur_negatives += 1
#                 neg_commit_ids.add(agg_result.commit_id)
#             if cur_positives == num_positives and cur_negatives == num_negatives:
#                 break

#         assert len(pos_commit_ids.intersection(neg_commit_ids)) == 0, 'Positive and negative commit ids should not intersect'
#         # print(f"Total positives: {cur_positives}, Total negatives: {cur_negatives}")
#         total_positives += cur_positives
#         total_negatives += cur_negatives

#     # # Write data to cache file
#     # with open(cache_file, 'wb') as file:
#     #     pickle.dump(data, file)
#     #     print(f"Saved data to cache file: {cache_file}")


#     # print percentage of positives and negatives
#     denom = total_positives + total_negatives
#     print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
#     return data

Preparing data from dataframe of size: 2 with search_depth: 1000
Percentage of positives: 0.2, Percentage of negatives: 0.8


In [143]:
small_df = recent_df.sample(100, random_state=55)

In [12]:
def test_prep(df, searcher, search_depth, num_positives, num_negatives, use_diff=False):
    data = []
    diff_data = []
    print(f'Preparing data from dataframe of size: {len(df)} with search_depth: {search_depth}')
    # for _, row in df.iterrows():
    total_positives, total_negatives = 0, 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        cur_positives = 0
        cur_negatives = 0

        cur_diff_positives = 0
        cur_diff_negatives = 0
        pos_commit_ids = set()
        neg_commit_ids = set()
        commit_message = row['commit_message']
        actual_files_modified = row['actual_files_modified']

        agg_search_results = searcher.pipeline(commit_message, row['commit_date'], search_depth, 'sump', aggregate_on='commit')

        # for each agg_result, find out how many files it has edited are in actual_files_modified and sort by score

        for agg_result in agg_search_results:
            agg_result_files = set([result.file_path for result in agg_result.contributing_results])
            intersection = agg_result_files.intersection(actual_files_modified)
            # TODO maybe try this for training
            # agg_result.score = len(intersection) / len(agg_result_files) # how focused the commit is
            agg_result.score = len(intersection) / len(agg_result_files) # how focused the commit is
            # agg_result.score = math.log(cur_score+1)
            # agg_result.score = len(intersection)

        agg_search_results.sort(key=lambda res: res.score, reverse=True)

        if use_diff:
            for agg_result in agg_search_results:
                # now we want to get diffs for each file in agg_result which has multiple contributing results (files)
                # agg_result.contributing_results is a list of SearchResult objects
                # each SearchResult object has a file_path attribute
                # just get the first contributing result for now
                # TODO: use diff_cnt instead of just the first contributing result
                for contributing_result in agg_result.contributing_results:
                    # contributing_result = agg_result.contributing_results[0]
                    # get the just the file path and commit id
                    file_path = contributing_result.file_path
                    commit_id = contributing_result.commit_id
                    if file_path in actual_files_modified and cur_diff_positives < num_positives:
                        # this is a positive sample
                        diff_data.append((commit_message, file_path, commit_id, 1))
                        cur_diff_positives += 1
                    elif file_path not in actual_files_modified and cur_diff_negatives < num_negatives:
                        # this is a negative sample
                        diff_data.append((commit_message, file_path, commit_id, 0))
                        cur_diff_negatives += 1

                if cur_diff_positives == num_positives and cur_diff_negatives == num_negatives:
                    break


        # go from top to bottom, first num_positives non-0 scores are positive samples and the next num_negatives are negative samples
        for agg_result in agg_search_results:
            cur_commit_msg = agg_result.contributing_results[0].commit_message
            if cur_positives < num_positives and agg_result.score > 0:
                # meaning there is at least one file in the agg_result that is in actual_files_modified
                # pos_commits.append(agg_result)
                data.append((commit_message, cur_commit_msg, 1))
                cur_positives += 1
                pos_commit_ids.add(agg_result.commit_id)
            elif cur_negatives < num_negatives:
                # neg_commits.append(agg_result)
                data.append((commit_message, cur_commit_msg, 0))
                cur_negatives += 1
                neg_commit_ids.add(agg_result.commit_id)
            if cur_positives == num_positives and cur_negatives == num_negatives:
                break

        assert len(pos_commit_ids.intersection(neg_commit_ids)) == 0, 'Positive and negative commit ids should not intersect'
        # print(f"Total positives: {cur_positives}, Total negatives: {cur_negatives}")
        total_positives += cur_positives
        total_negatives += cur_negatives

    # convert to pandas dataframe
    data = pd.DataFrame(data, columns=['query', 'passage', 'label'])
    diff_data = pd.DataFrame(diff_data, columns=['query', 'file_path', 'commit_id', 'label'])
    # print distribution of labels
    print(f"Total positives: {total_positives}, Total negatives: {total_negatives}")
    # print percentage of positives and negatives
    denom = total_positives + total_negatives
    print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
    if use_diff:
        return data, diff_data
    return data

In [13]:
test_data, diff_data = test_prep(recent_df, bm25_searcher, params['train_depth'], params['num_positives'], params['num_negatives'], use_diff=True)

Preparing data from dataframe of size: 1500 with search_depth: 1000


 25%|██▌       | 381/1500 [01:39<03:48,  4.91it/s]

In [26]:
diff_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23975 entries, 0 to 23974
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      23975 non-null  object
 1   file_path  23975 non-null  object
 2   commit_id  23975 non-null  object
 3   label      23975 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 749.3+ KB


In [None]:
diff_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      54 non-null     object
 1   file_path  54 non-null     object
 2   commit_id  54 non-null     object
 3   label      54 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.8+ KB


In [27]:
def process_diff_data(diff_data, df):
    # given diff_data, we want to use commit_id and file_path to get the diff from the df

    # first we need to get the diff from the df
    # we can use the commit_id and file_path to get the diff
    res_df = []
    # for _, row in diff_data.iterrows():
    for _, row in tqdm(diff_data.iterrows(), total=len(diff_data)):
        commit_id = row['commit_id']
        file_path = row['file_path']
        # get the diff from the df
        diff = df[(df['commit_id'] == commit_id) & (df['file_path'] == file_path)]['diff']
        # check if diff is NA/NaN
        if diff.isnull().values.any():
            # if it is, then we can just skip this row
            continue
        diff = diff.values[0]

        res_df.append((commit_id, file_path, row['query'], diff, row['label']))

    res_df = pd.DataFrame(res_df, columns=['commit_id', 'file_path', 'query', 'passage', 'label'])
    # make query and passage into strings and label into int
    res_df['query'] = res_df['query'].astype(str)
    res_df['passage'] = res_df['passage'].astype(str)
    res_df['label'] = res_df['label'].astype(int)
    return res_df

processed_diff_data = process_diff_data(diff_data, combined_df)

100%|██████████| 23975/23975 [04:17<00:00, 93.11it/s]


In [28]:
processed_diff_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20096 entries, 0 to 20095
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   commit_id  20096 non-null  object
 1   file_path  20096 non-null  object
 2   query      20096 non-null  object
 3   passage    20096 non-null  object
 4   label      20096 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 785.1+ KB


In [155]:
processed_diff_data.iloc[0]['passage']



In [156]:
processed_diff_data.head()

Unnamed: 0,commit_id,file_path,query,passage,label
0,566259567773a0af56e6c19da615e6802d73f834,packages/react-dom/src/client/ReactDOMFiberCom...,Change warning() to automatically inject the s...,"@@ -8,7 +8,10 @@\n */\n \n // TODO: direct im...",1
1,566259567773a0af56e6c19da615e6802d73f834,packages/react-dom/src/client/ReactDOMFiberInp...,Change warning() to automatically inject the s...,"@@ -8,7 +8,10 @@\n */\n \n // TODO: direct im...",1
2,566259567773a0af56e6c19da615e6802d73f834,packages/react-dom/src/client/ReactDOMFiberSel...,Change warning() to automatically inject the s...,"@@ -8,16 +8,14 @@\n */\n \n // TODO: direct i...",1
3,566259567773a0af56e6c19da615e6802d73f834,packages/react-reconciler/src/ReactCapturedVal...,Change warning() to automatically inject the s...,"@@ -9,7 +9,7 @@\n \n import type {Fiber} from ...",0
4,013b7ad117834cbb99b4fc0a3d08fdb8622597c9,packages/react-reconciler/src/ReactFiberWorkLo...,Unify `use` and `renderDidSuspendDelayIfPossib...,"@@ -200,13 +200,14 @@ const LegacyUnbatchedCon...",1


In [101]:
# find number of rows where passage has <NA>\nName in it
combined_df[combined_df['commit_id'] == '1e3383a41154cb32d8d6b78b2451ee4dabfcb973' & ('packages/react-devtools-shared/src/__tests__' in combined_df['file_path'])]

pandas._libs.missing.NAType

In [29]:
# processed_diff_data.head(1000)
processed_diff_data.to_parquet(os.path.join(repo_path, 'cache', 'diff_data.parquet'))

In [15]:
# triplet_data = prepare_triplet_data_from_df(recent_df, bm25_searcher, search_depth=params['train_depth'], num_positives=params['num_positives'], num_negatives=params['num_negatives'], cache_file=triplet_cache, overwrite=args.overwrite_cache)

# # triplet_data = test_data

In [33]:
diff_data = pd.read_parquet(os.path.join(repo_path, 'cache', 'diff_data.parquet'))
# drop columns that we don't need aka commit_id and file_path
diff_data.drop(columns=['commit_id', 'file_path'], inplace=True)
diff_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20096 entries, 0 to 20095
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   query    20096 non-null  object
 1   passage  20096 non-null  object
 2   label    20096 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 471.1+ KB


In [34]:
# see distribution of labels
diff_data['label'].value_counts()

label
0    11877
1     8219
Name: count, dtype: int64

In [160]:
diff_data.iloc[0]['passage']



In [35]:
bert_reranker = BERTReranker(params)
save_model_name = params['model_name'].replace('/', '_')
repo_name = 'facebook_react'
bert_best_model_path = os.path.join('2_7', repo_name, f"{save_model_name}_model_output", 'best_model')
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(bert_best_model_path)
bert_reranker.model.to(bert_reranker.device)
rerankers = [bert_reranker]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 1442.70 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/graphcodebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1500, 'bm25_aggr_strategy': 'sump'}


## Reranking with training

In [36]:
class BERTCodeReranker:
    def __init__(self, parameters):
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


        # self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt'] # how many contributing_results to use per file for reranking
        # self.psg_stride = parameters.get('psg_stride', self.psg_len)
        self.aggregation_strategy = parameters['aggregation_strategy'] # how to aggregate the scores of the psg_cnt contributing_results
        self.batch_size = parameters['batch_size'] # batch size for reranking efficiently
        self.rerank_depth = parameters['rerank_depth']
        self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model

        print(f"Initialized BERT reranker with parameters: {parameters}")


    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        """
        Rerank the BM25 aggregated search results using BERT model scores.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
        """
        # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
        # print(f'Reranking {len(aggregated_results)} results')

        self.model.eval()

        # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
        # TODO change this to be for diffs instead of commit_messages
        query_passage_pairs = []
        for agg_result in aggregated_results:
            # get the top psg_cnt contributing_results
            contributing_results = agg_result.contributing_results[: self.psg_cnt]
            # for each contributing_result, get the file_path and commit_id
            # TODO maybe limit this with a diff_cnt
            for contributing_result in contributing_results:
                file_path = contributing_result.file_path
                commit_id = contributing_result.commit_id
                # get the diff from the df
                diff = combined_df[(combined_df['commit_id'] == commit_id) & (combined_df['file_path'] == file_path)]['diff']
                assert diff.shape[0] == 1, f"diff should only have one row, but has {diff.shape[0]} rows"
                diff = str(diff.iloc[0])
                query_passage_pairs.append((query, diff))


        # for agg_result in aggregated_results:
        #     query_passage_pairs.extend(
        #         (query, result.commit_message)
        #         for result in agg_result.contributing_results[: self.psg_cnt]
        #     )

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # Create a dataloader for feeding the data to the model
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False) # shuffle=False very important for reconstructing the results back into the original order

        scores = self.get_scores(dataloader, self.model)

        score_index = 0
        # Now assign the scores to the aggregated results by mapping the scores to the contributing results
        for agg_result in aggregated_results:
            # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
            assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
            end_index = score_index + len(agg_result.contributing_results[: self.psg_cnt]) # only use psg_cnt contributing_results
            cur_passage_scores = scores[score_index:end_index]
            score_index = end_index


            # Aggregate the scores for the current aggregated result
            agg_score = self.aggregate_scores(cur_passage_scores)
            agg_result.score = agg_score  # Assign the aggregated score

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to GPU
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get scores from the model
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        # else:
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def rerank_pipeline(self, query, aggregated_results):
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results)
        min_top_score = reranked_results[-1].score
        # now adjust the scores of bottom_results
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        # combine the results
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

In [37]:
code_reranker = BERTCodeReranker(params)
code_reranker.rerank_depth = 100
rerankers = [bert_reranker, code_reranker]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 1443.45 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/graphcodebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1500, 'bm25_aggr_strategy': 'sump'}


In [114]:
# evaluate on just 1 sample

tmp_results = model_evaluator.evaluate_sampling(n=1, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy'])



100%|██████████| 1/1 [00:08<00:00,  8.74s/it]


In [None]:
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=None, aggregation_strategy=params['aggregation_strategy']))
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=[bert_reranker], aggregation_strategy=params['aggregation_strategy']))
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:02<00:00,  3.72it/s]


{'MAP': 0.1859, 'P@10': 0.08, 'P@100': 0.028, 'P@1000': 0.0043, 'MRR': 0.2331, 'Recall@100': 0.4443, 'Recall@1000': 0.5752}


100%|██████████| 10/10 [00:56<00:00,  5.62s/it]


{'MAP': 0.1676, 'P@10': 0.08, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.2541, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}


100%|██████████| 10/10 [01:36<00:00,  9.63s/it]

{'MAP': 0.1038, 'P@10': 0.07, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.1732, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}





In [173]:
print(model_evaluator.evaluate_sampling(n=10, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:49<00:00, 10.91s/it]

{'MAP': 0.2113, 'P@10': 0.12, 'P@100': 0.038, 'P@1000': 0.0043, 'MRR': 0.3317, 'Recall@100': 0.5199, 'Recall@1000': 0.5752}





In [48]:
print(model_evaluator.evaluate_sampling(n=100, k=1000, output_file_path=None, rerankers=rerankers, aggregation_strategy=params['aggregation_strategy']))



  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [17:34<00:00, 10.55s/it]

{'MAP': 0.1807, 'P@10': 0.117, 'P@100': 0.0295, 'P@1000': 0.0041, 'MRR': 0.2545, 'Recall@100': 0.527, 'Recall@1000': 0.6845}





In [38]:
def do_training(triplet_data, reranker, hf_output_dir, args):
    def tokenize_hf(example):
        return reranker.tokenizer(example['query'], example['passage'], truncation=True, padding='max_length', max_length=reranker.max_seq_length, return_tensors='pt', add_special_tokens=True)
    print('Training the model...')
    print('Label distribution:')
    print(triplet_data['label'].value_counts())

    if args.sanity_check:
        print('Running sanity check on training data...')
        triplet_data = sanity_check(triplet_data)

    # Step 7: convert triplet_data to HuggingFace Dataset
    # convert triplet_data to HuggingFace Dataset
    triplet_data['label'] = triplet_data['label'].astype(float)
    train_df, val_df = train_test_split(triplet_data, test_size=0.2, random_state=42, stratify=triplet_data['label'])
    train_hf_dataset = HFDataset.from_pandas(train_df, split='train') # type: ignore
    val_hf_dataset = HFDataset.from_pandas(val_df, split='validation') # type: ignore

    # Step 8: tokenize the data
    tokenized_train_dataset = train_hf_dataset.map(tokenize_hf, batched=True)
    tokenized_val_dataset = val_hf_dataset.map(tokenize_hf, batched=True)

    # Step 9: set format for pytorch
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(['query', 'passage'])
    tokenized_val_dataset = tokenized_val_dataset.remove_columns(['query', 'passage'])

    # rename label column to labels
    tokenized_train_dataset = tokenized_train_dataset.rename_column('label', 'labels')
    tokenized_val_dataset = tokenized_val_dataset.rename_column('label', 'labels')

    # set format to pytorch
    tokenized_train_dataset = tokenized_train_dataset.with_format('torch')
    tokenized_val_dataset = tokenized_val_dataset.with_format('torch')
    print('Training dataset features:')
    print(tokenized_train_dataset.features)

    # Step 10: set up training arguments
    train_args = TrainingArguments(
        output_dir=hf_output_dir,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=args.num_epochs,
        metric_for_best_model='eval_loss',
        load_best_model_at_end=True,
        save_total_limit=2,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        logging_steps=100,
        fp16=True,
        dataloader_num_workers=args.num_workers,
        )

    small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(100))
    small_val_dataset = tokenized_val_dataset.shuffle(seed=42).select(range(100))

    if args.debug:
        print('Running in debug mode, using small datasets')
        tokenized_train_dataset = small_train_dataset
        tokenized_val_dataset = small_val_dataset

    # Step 11: set up trainer
    trainer = Trainer(
        model = reranker.model,
        args = train_args,
        train_dataset = tokenized_train_dataset, # type: ignore
        eval_dataset = tokenized_val_dataset, # type: ignore
        # compute_metrics=compute_metrics,
    )

    # Step 12: train the model
    trainer.train()

    # Step 13: save the model
    best_model_path = os.path.join(hf_output_dir, 'best_model')
    trainer.save_model(best_model_path)
    print(f'Saved model to {best_model_path}')
    print('Training complete')

In [119]:
repo_path

'smalldata/ftr'

In [39]:
args.sanity_check = False

In [40]:
def sanity_check_triplets(data):
    """
    Perform a sanity check on the triplets data.

    Args:
        data: The input data containing triplets.

    Returns:
        The sanitized data after removing problematic rows.

    Examples:
        >>> data = pd.DataFrame({'query': ['apple', 'banana', 'apple'], 'passage': ['red fruit', 'yellow fruit', 'red fruit'], 'label': [0, 1, 0]})
        >>> sanity_check_triplets(data)
        Assertion failed at index 0: query      apple
        passage    red fruit
        label             0
        Name: 0, dtype: object
        Dropped row at index 0
        Total number of problems in sanity check of training data: 1
        # Output: DataFrame without the problematic row
    """
    problems = 0
    for i, row in tqdm(data.iterrows(), total=len(data)):
        try:
            if row['label'] == 0:
                assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 0
            else:
                assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 1
        except AssertionError:
            print(f"Assertion failed at index {i}: {row}")
            # break  # Optional: break after the first failure, remove if you want to see all failures
            # remove the row with label 0

            if row['label'] == 0:
                problems += 1
                data.drop(i, inplace=True)
                print(f"Dropped row at index {i}")

    print(f"Total number of problems in sanity check of training data: {problems}")
    return data

In [41]:
diff_data = sanity_check_triplets(diff_data)

  4%|▍         | 775/20096 [00:02<00:52, 368.03it/s]

Assertion failed at index 712: query      Land forked reconciler changes (#24817)\n\nThi...
passage    @@ -44,6 +44,7 @@ import {\n   SyncLane,\n   N...
label                                                      1
Name: 712, dtype: object
Assertion failed at index 714: query      Land forked reconciler changes (#24817)\n\nThi...
passage    @@ -453,27 +453,26 @@ export function includes...
label                                                      1
Name: 714, dtype: object


  5%|▍         | 923/20096 [00:02<00:52, 365.72it/s]

Assertion failed at index 863: query      Move update scheduling to microtask (#26512)\n...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 863, dtype: object
Dropped row at index 863
Assertion failed at index 864: query      Move update scheduling to microtask (#26512)\n...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 864, dtype: object
Dropped row at index 864


 13%|█▎        | 2664/20096 [00:07<00:47, 368.39it/s]

Assertion failed at index 2607: query      Diff properties in the commit phase instead of...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 2607, dtype: object
Dropped row at index 2607
Assertion failed at index 2608: query      Diff properties in the commit phase instead of...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 2608, dtype: object
Dropped row at index 2608


 17%|█▋        | 3382/20096 [00:09<00:45, 368.96it/s]

Assertion failed at index 3334: query      Add a feature flag to disable legacy context (...
passage    @@ -23,6 +23,7 @@ export const replayFailedUni...
label                                                      1
Name: 3334, dtype: object


 17%|█▋        | 3456/20096 [00:09<00:45, 367.99it/s]

Assertion failed at index 3415: query      offscreen double invoke effects (#19523)\n\nTh...
passage    @@ -45,6 +45,7 @@ export const disableTextarea...
label                                                      0
Name: 3415, dtype: object
Dropped row at index 3415


 24%|██▍       | 4893/20096 [00:13<00:41, 369.15it/s]

Assertion failed at index 4835: query      Bugfix: Effect clean up when deleting suspende...
passage    @@ -2082,9 +2082,18 @@ function updateSuspense...
label                                                      0
Name: 4835, dtype: object
Dropped row at index 4835


 33%|███▎      | 6674/20096 [00:18<00:36, 368.83it/s]

Assertion failed at index 6600: query      Apply #20778 to new fork, too (#20782)\n\n* Ap...
passage    @@ -34,6 +34,7 @@ import {\n   disableSchedule...
label                                                      0
Name: 6600, dtype: object
Dropped row at index 6600


 36%|███▌      | 7275/20096 [00:19<00:34, 369.67it/s]

Assertion failed at index 7201: query      [Flight] use opaque config for flight in `dom-...
passage    @@ -10,3 +10,4 @@\n export * from 'react-clien...
label                                                      1
Name: 7201, dtype: object
Assertion failed at index 7207: query      [Flight] use opaque config for flight in `dom-...
passage    @@ -9,3 +9,4 @@\n \n export * from '../ReactFl...
label                                                      1
Name: 7207, dtype: object


 52%|█████▏    | 10390/20096 [00:28<00:26, 368.29it/s]

Assertion failed at index 10321: query      [Flight] Taint APIs (#27445)\n\nThis lets a re...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 10321, dtype: object
Dropped row at index 10321
Assertion failed at index 10322: query      [Flight] Taint APIs (#27445)\n\nThis lets a re...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 10322, dtype: object
Dropped row at index 10322


 54%|█████▍    | 10844/20096 [00:29<00:25, 369.20it/s]

Assertion failed at index 10776: query      Move unstable_scheduleHydration to ReactDOMHyd...
passage    @@ -11,6 +11,7 @@ export {\n   __SECRET_INTERN...
label                                                      0
Name: 10776, dtype: object
Dropped row at index 10776


 65%|██████▍   | 12971/20096 [00:35<00:19, 369.81it/s]

Assertion failed at index 12905: query      Track nearest Suspense handler on stack (#2458...
passage    @@ -7,6 +7,7 @@\n  * @flow\n  */\n \n+import t...
label                                                      0
Name: 12905, dtype: object
Dropped row at index 12905


 80%|███████▉  | 16007/20096 [00:43<00:11, 369.57it/s]

Assertion failed at index 15936: query      Always skip unmounted/unmounting error boundar...
passage    @@ -45,6 +45,7 @@ export const disableTextarea...
label                                                      0
Name: 15936, dtype: object
Dropped row at index 15936


 92%|█████████▏| 18588/20096 [00:50<00:04, 367.32it/s]

Assertion failed at index 18546: query      Don't prerender siblings of suspended componen...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 18546, dtype: object
Dropped row at index 18546
Assertion failed at index 18547: query      Don't prerender siblings of suspended componen...
passage    @@ -57,6 +57,7 @@ export const enableUseRefAcc...
label                                                      0
Name: 18547, dtype: object
Dropped row at index 18547


100%|██████████| 20096/20096 [00:54<00:00, 370.01it/s]

Total number of problems in sanity check of training data: 14





In [42]:
# print passage of first row
print(diff_data.iloc[0]['passage'])

@@ -8,7 +8,10 @@
  */
 
 // TODO: direct imports like some-package/src/* are bad. Fix me.
-import ReactDebugCurrentFiber from 'react-reconciler/src/ReactDebugCurrentFiber';
+import {
+  getCurrentFiberOwnerNameInDevOrNull,
+  getCurrentFiberStackInDevOrNull,
+} from 'react-reconciler/src/ReactCurrentFiber';
 import {registrationNameModules} from 'events/EventPluginRegistry';
 
@@ -45,10 +48,6 @@ import {validateProperties as validateARIAProperties} from '../shared/ReactDOMIn
 import {validateProperties as validateInputProperties} from '../shared/ReactDOMNullInputValuePropHook';
 import {validateProperties as validateUnknownProperties} from '../shared/ReactDOMUnknownPropertyHook';
 
-const {
-  getCurrentFiberOwnerName,
-  getCurrentFiberStackAddendum,
-} = ReactDebugCurrentFiber;
 let didWarnInvalidHydration = false;
 let didWarnShadyDOM = false;
 
@@ -62,7 +61,7 @@ const HTML = '__html';
 
 const {html: HTML_NAMESPACE} = Namespaces;
 
-let getStack = () => '';
+let getStackInDevOrNull

In [43]:
# print length of passages in diff_data
diff_data['passage'].str.len().describe()

count     20082.000000
mean       4010.196644
std       10365.837954
min         112.000000
25%         563.000000
50%        1376.000000
75%        4167.000000
max      321990.000000
Name: passage, dtype: float64

In [44]:
do_training(diff_data, code_reranker, os.path.join(repo_path, f"code_{save_model_name}_model_output"), args)

Training the model...
Label distribution:
label
0    11863
1     8219
Name: count, dtype: int64


Map: 100%|██████████| 16065/16065 [00:20<00:00, 775.49 examples/s]
Map: 100%|██████████| 4017/4017 [00:05<00:00, 775.57 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training dataset features:
{'labels': Value(dtype='float64', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,0.2023,0.198104
2,0.167,0.166507
3,0.1348,0.162599
4,0.1005,0.161759
5,0.0683,0.162765
6,0.0517,0.176837


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [47]:
code_reranker.model = AutoModelForSequenceClassification.from_pretrained(os.path.join(repo_path, f"code_{save_model_name}_model_output", 'checkpoint-3018'))
code_reranker.model.to(code_reranker.device)
rerankers = [bert_reranker, code_reranker]

## Evaluate on Gold

In [12]:
import pandas as pd

In [13]:
bert_reranker = BERTReranker(params)
rerankers = [bert_reranker]
save_model_name = params['model_name'].replace('/', '_')
hf_output_dir = os.path.join(repo_path, f'{save_model_name}_model_output')
best_model_path = os.path.join(hf_output_dir, 'best_model')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_len': 400, 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 16, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 3, 'mlp_lr': 0.001, 'bert_lr': 5e-05, 'hidden_dim': 128, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'weight_decay': 0.01, 'dropout_prob': 0.5, 'train_commits': 1500}


In [16]:
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(best_model_path).to(bert_reranker.device)
rerankers = [bert_reranker]

In [15]:
model_evaluator.evaluate_sampling(n=10, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'])



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:02<00:00,  3.50it/s]


{'MAP': 0.1859,
 'P@10': 0.08,
 'P@100': 0.028,
 'P@1000': 0.0043,
 'MRR': 0.2331,
 'Recall@100': 0.4443,
 'Recall@1000': 0.5752}

In [12]:
model_evaluator.evaluate_sampling(n=10, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], rerankers=rerankers)



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:57<00:00,  5.79s/it]


{'MAP': 0.0137,
 'P@10': 0.0,
 'P@100': 0.007,
 'P@1000': 0.0043,
 'MRR': 0.0059,
 'Recall@100': 0.0598,
 'Recall@1000': 0.5752}

In [17]:
model_evaluator.evaluate_sampling(n=10, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], rerankers=rerankers)



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:54<00:00,  5.49s/it]


{'MAP': 0.1497,
 'P@10': 0.13,
 'P@100': 0.033,
 'P@1000': 0.0043,
 'MRR': 0.1727,
 'Recall@100': 0.4681,
 'Recall@1000': 0.5752}

In [17]:
repo_name = 'facebook_react'
oai_model = 'gpt-3'

gold_file_path = os.path.join('gold', repo_name, f'{repo_name}_{oai_model}_gold.parquet')

In [18]:
gold_df = pd.read_parquet(gold_file_path)

# rename the column transformed_message_gpt3 to transformed_message_{oai_model}
gold_df = gold_df.rename(columns={'transformed_message_gpt3': f'transformed_message_{oai_model}'})
# rename commit_message to original_message
gold_df = gold_df.rename(columns={'commit_message': 'original_message'})
# rename transformed_message to commit_message
gold_df = gold_df.rename(columns={f'transformed_message_{oai_model}': 'commit_message'})
gold_df

Unnamed: 0,commit_id,commit_date,original_message,actual_files_modified,commit_message
0,af1b039bdd5a8b5def5d51acad00b79e9b7b377c,1586481094,ESLint rule to forbid cross fork imports (#185...,"[.eslintrc.js, scripts/eslint-rules/__tests__/...","When syncing changes across implementations, i..."
1,5aa0c5671fdddc46092d46420fff84a82df558ac,1623102438,Fix Issue with Undefined Lazy Imports By Refac...,[packages/react-reconciler/src/__tests__/React...,"When lazy importing, there is an issue with un..."
2,af08b5cbcaf4d3e3ad965a9165e41688733a7771,1509740372,Release script follow-up work after 16.1.0-bet...,[scripts/release/build-commands/add-git-tag.js...,When using the release script after the 16.1.0...
3,24dbe851e8a3a3a5233654183fd80b0d64b99295,1576610956,fix(dev-tools): fix show correct displayName w...,[packages/react-devtools-shared/src/backend/re...,"When using `React.forwardRef()`, the displayNa..."
4,ddc4b65cfe17b3f08ff9f18f8804ff5b663788c8,1586291681,Clear finished discrete updates during commit ...,[packages/react-reconciler/src/ReactFiberWorkL...,If a root is finished with a priority lower th...
...,...,...,...,...,...
95,05a55a4b09b7b7c8f63778fb8252a001ca66f8d7,1642620847,Fix change events for custom elements (#22938)...,[packages/react-dom/src/__tests__/DOMPropertyO...,"When using custom elements, there may be issue..."
96,27b5699694f20220e0448f0ba3eb6bfa0d3a64ed,1644619917,Simplify cache pool contexts (#23280) The `po...,[packages/react-reconciler/src/ReactFiberCache...,Reading from `pooledCache` variable to track c...
97,09916479219a61ae86d2ec8ce159a161337b9007,1613595642,Use setImmediate when available over MessageCh...,[packages/scheduler/src/SchedulerFeatureFlags....,"When available, it is preferable to use setImm..."
98,c826dc50de288758a0b783b2fd37b40a3b512fc4,1681936268,Add (Client) Functions as Form Actions (#26674...,"[fixtures/flight/src/Button.js, fixtures/fligh...",When using `<form action={...}>` or `<button f...


In [22]:
model_evaluator.evaluate_sampling(k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], gold_df=gold_df)

Found gold_df, evaluating on 100 commits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              100 non-null    string
 1   commit_date            100 non-null    int64 
 2   original_message       100 non-null    string
 3   actual_files_modified  100 non-null    object
 4   commit_message         100 non-null    object
dtypes: int64(1), object(2), string(2)
memory usage: 4.0+ KB
None


100%|██████████| 100/100 [00:26<00:00,  3.72it/s]


{'MAP': 0.1002,
 'P@10': 0.044,
 'P@100': 0.0192,
 'P@1000': 0.003,
 'MRR': 0.163,
 'Recall@100': 0.3575,
 'Recall@1000': 0.5623}

In [23]:
model_evaluator.evaluate_sampling(k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], gold_df=gold_df, rerankers=rerankers)

Found gold_df, evaluating on 100 commits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              100 non-null    string
 1   commit_date            100 non-null    int64 
 2   original_message       100 non-null    string
 3   actual_files_modified  100 non-null    object
 4   commit_message         100 non-null    object
dtypes: int64(1), object(2), string(2)
memory usage: 4.0+ KB
None


100%|██████████| 100/100 [09:19<00:00,  5.60s/it]


{'MAP': 0.119,
 'P@10': 0.062,
 'P@100': 0.0201,
 'P@1000': 0.003,
 'MRR': 0.1854,
 'Recall@100': 0.3989,
 'Recall@1000': 0.5623}