In [1]:
import sys
sys.path.append('../src')

In [2]:
import pandas as pd
import os

from utils import get_combined_df
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
# Directory holding the cached parquet files that later cells read with pd.read_parquet.
path = '../data/2_7/facebook_react/cache_hope'
# Repository data root handed to get_combined_df in the next cell.
main_path = '../data/2_7/facebook_react'

In [39]:
# Load the combined dataframe for the repository; later code filters it by
# 'commit_id' / 'file_path' and reads its 'diff' and 'cur_file_content' columns.
combined_df = get_combined_df(main_path)

In [76]:
# Load the cached file-level dataframe from the cache directory.
code_df = pd.read_parquet(os.path.join(path, 'code_df.parquet'))

In [80]:
# Number of distinct SR_file_path values among rows labelled 1.
len(set(code_df[code_df['label'] == 1].SR_file_path.tolist()))

533

In [10]:
# Settings for the code-file reranker: everything is read from `args`
# except the BM25 aggregation strategy, which is fixed to 'sump'.
params = dict(
    model_name=args.model_path,
    psg_cnt=args.psg_cnt,
    aggregation_strategy=args.aggregation_strategy,
    batch_size=args.batch_size,
    use_gpu=args.use_gpu,
    rerank_depth=args.rerank_depth,
    num_epochs=args.num_epochs,
    lr=args.learning_rate,
    num_positives=args.num_positives,
    num_negatives=args.num_negatives,
    train_depth=args.train_depth,
    num_workers=args.num_workers,
    train_commits=args.train_commits,
    bm25_aggr_strategy='sump',
    psg_len=args.psg_len,
    psg_stride=args.psg_stride,
)

code_reranker = BERTCodeReranker(params)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 25, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 300}


In [42]:
def prepare_code_triplets(code_df, code_reranker, cache_file, combined_df, overwrite=False):
    """Build (query, file_path, passage, label) rows from file contents and their diffs.

    Each file in ``code_df`` is tokenized and cut into windows of
    ``code_reranker.psg_len`` tokens, one window every ``code_reranker.psg_stride``
    tokens. A window is scored by the length of the longest common subsequence
    between its lines and the kept lines of the diff recorded in ``combined_df``
    for the same commit and file (lines are compared after stripping surrounding
    whitespace). The ``code_reranker.psg_cnt`` best-scoring windows of every row
    are emitted; ties keep file order.

    Rows are skipped when ``combined_df`` has no entry for the (commit, file) pair
    or when the recorded diff is NA.

    Args:
        code_df: DataFrame with columns ``SR_file_content``, ``SR_commit_id``,
            ``SR_file_path``, ``train_query`` and ``label``.
        code_reranker: Object exposing ``tokenizer`` (``encode_plus`` / ``decode``),
            ``psg_len``, ``psg_stride`` and ``psg_cnt``.
        cache_file: Parquet path used to load and store the result; a falsy value
            disables caching.
        combined_df: DataFrame with columns ``commit_id``, ``file_path`` and ``diff``.
        overwrite: When True, ignore an existing cache file and rebuild.

    Returns:
        DataFrame with columns ``query``, ``file_path``, ``passage`` and ``label``.
    """
    if cache_file and os.path.exists(cache_file) and not overwrite:
        print(f"Loading data from cache file: {cache_file}")
        return pd.read_parquet(cache_file)

    # Announce the rebuild only once we know the cache is not being used.
    print(f'Preparing code triplets from scratch for {len(code_df)} diffs with psg_len: {code_reranker.psg_len}, psg_stride: {code_reranker.psg_stride}, psg_cnt: {code_reranker.psg_cnt}')

    def prep_line(line):
        """Strip surrounding whitespace so lines compare independently of indentation."""
        return line.strip()

    def parse_diff(diff):
        """Return the added and context lines of a diff, added lines without their '+'.

        Removed lines, empty lines, hunk headers and lines whose stripped length is
        2 or less are dropped.
        """
        return [
            line[1:] if line.startswith('+') else line
            for line in diff.split('\n')
            if not (line.startswith('-') or len(line) == 0 or (line.startswith('@@') and line.count('@@') > 1))
            and len(prep_line(line)) > 2
        ]

    def full_tokenize(s):
        """Tokenize ``s`` without truncation and return the token ids as a list."""
        return code_reranker.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()

    def count_matching_lines(passage_lines, diff_lines):
        """Length of the longest common subsequence of the two line lists."""
        # Normalise once up front instead of inside the quadratic loop.
        passage_norm = [prep_line(line) for line in passage_lines]
        diff_norm = [prep_line(line) for line in diff_lines]

        # dp[i][j] = LCS length of the first i passage lines and first j diff lines.
        dp = [[0] * (len(diff_norm) + 1) for _ in range(len(passage_norm) + 1)]
        for i, passage_line in enumerate(passage_norm, start=1):
            for j, diff_line in enumerate(diff_norm, start=1):
                if passage_line == diff_line:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[-1][-1]

    triplets = []
    rows_without_diff_entry = 0

    for _, row in tqdm(code_df.iterrows(), total=len(code_df)):
        diff_matches = combined_df.loc[
            (combined_df['commit_id'] == row['SR_commit_id']) & (combined_df['file_path'] == row['SR_file_path']),
            'diff',
        ]
        if diff_matches.empty:
            # No record for this (commit, file) pair: nothing to align against.
            rows_without_diff_entry += 1
            continue

        cur_diff = diff_matches.iloc[0]
        if pd.isna(cur_diff):
            # The diff is NA, e.g. when the commit removed or renamed the file.
            continue

        # Tokenize only rows that will actually produce passages.
        file_tokens = full_tokenize(row['SR_file_content'])
        total_tokens = len(file_tokens)

        cur_diff_lines = parse_diff(cur_diff)
        cur_triplets = []
        for cur_start in range(0, total_tokens, code_reranker.psg_stride):
            cur_passage = file_tokens[cur_start:cur_start + code_reranker.psg_len]
            cur_passage_decoded = code_reranker.tokenizer.decode(cur_passage)

            # Ignore lines whose stripped length is 2 or less (braces, blanks, ...).
            cur_passage_lines = [
                line for line in cur_passage_decoded.split('\n') if len(prep_line(line)) > 2
            ]

            common_line_count = count_matching_lines(cur_passage_lines, cur_diff_lines)
            cur_triplets.append((common_line_count, (row['train_query'], row['SR_file_path'], cur_passage_decoded, row['label'])))

        # Best-overlapping passages first; the sort is stable, so ties keep file order.
        cur_triplets.sort(key=lambda x: x[0], reverse=True)

        for _, triplet in cur_triplets[:code_reranker.psg_cnt]:
            triplets.append(triplet)

    if rows_without_diff_entry:
        print(f'WARNING: skipped {rows_without_diff_entry} rows with no matching (commit_id, file_path) entry in combined_df')

    triplets = pd.DataFrame(triplets, columns=['query', 'file_path', 'passage', 'label'])
    if cache_file:
        print(f"Saving data to cache file: {cache_file}")
        triplets.to_parquet(cache_file)
    return triplets

In [58]:
# Reload the passage-level triplets from the cache directory instead of rebuilding them.
# Presumably the parquet file was produced by the commented-out call below — TODO confirm.
# triplets_df = prepare_code_triplets(code_df, code_reranker, None, combined_df)
triplets_df = pd.read_parquet(os.path.join(path, 'diff_code_triplets.parquet'))

In [59]:
# Label distribution of the passage-level triplets.
triplets_df['label'].value_counts()

label
0    42722
1    11820
Name: count, dtype: int64

In [66]:
# Load the training parquet; the next cell reads its `actual_files_modified` column.
train_df = pd.read_parquet('../gold/facebook_react/v2_facebook_react_gpt4_train.parquet')

In [74]:
# Flatten the per-row collections of modified files into one list (duplicates kept).
res = [
    modified_file
    for modified_files in train_df.actual_files_modified.tolist()
    for modified_file in modified_files
]

print(len(res))

2847


In [75]:
# Number of distinct entries in `res`.
len(set(res))

1085

In [64]:
# Number of distinct file paths that have at least one triplet labelled 1.
len(triplets_df[triplets_df['label'] == 1]['file_path'].unique())

322

In [56]:
# Dump every non-empty passage stored for ReactFiberScheduler.js, each followed
# by a row of 100 asterisks.
scheduler_rows = triplets_df['file_path'] == 'packages/react-reconciler/src/ReactFiberScheduler.js'
for p in triplets_df.loc[scheduler_rows, 'passage'].values:
    if len(p) == 0:
        continue
    print(p)
    print('*' * 100)

function resolveLocksOnRoot(root: FiberRoot, expirationTime: ExpirationTime) {
  const firstBatch = root.firstBatch;
  if (
    firstBatch !== null &&
    firstBatch._defer &&
    firstBatch._expirationTime >= expirationTime
  ) {
    scheduleCallback(NormalPriority, () => {
      firstBatch._onComplete();
      return null;
    });
    return true;
  } else {
    return false;
  }
}
****************************************************************************************************
function prepareFreshStack(root, expirationTime) {
  root.finishedWork = null;
  root.finishedExpirationTime = NoWork;

  const timeoutHandle = root.timeoutHandle;
  if (timeoutHandle !== noTimeout) {
    // The root previous suspended and scheduled a timeout to commit a fallback
    // state. Now that we have additional work, cancel the timeout.
    root.timeoutHandle = noTimeout;
    // $FlowFixMe Complains noTimeout is not a TimeoutID, despite the check above
    cancelTimeout(timeoutHandle);
  }

  if (

In [28]:
# Label distribution of the file-level dataframe.
code_df.label.value_counts()

label
0    5000
1    1750
Name: count, dtype: int64

In [29]:
# NOTE(review): `sub_df` is not assigned in any cell visible in this notebook;
# this cell relies on hidden kernel state and fails on a fresh kernel.
sub_df.label.value_counts()

label
0    225310
1     61222
Name: count, dtype: int64

In [24]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook;
# this cell relies on hidden kernel state.
df.head()

Unnamed: 0,commit_id,file_path,query,passage,label
0,cc24d0ea56b0538d1ac61dc09faedd70ced5bb47,packages/react-reconciler/src/ReactFiberSchedu...,"Malformed data types (`commitDetails`, `intera...","/**\n * Copyright (c) Facebook, Inc. and its a...",0
1,c22b94f14a809abb376f07a53f36860a7c6a342e,src/renderers/shared/fiber/ReactChildFiber.js,"Malformed data types (`commitDetails`, `intera...","/**\n * Copyright 2013-present, Facebook, Inc....",0
2,1f74eca9937ad6f19b6291d21edfb8747bae88ca,src/renderers/dom/fiber/__tests__/ReactDOMFibe...,"Malformed data types (`commitDetails`, `intera...","/**\n * Copyright 2013-present, Facebook, Inc....",0
3,cc24d0ea56b0538d1ac61dc09faedd70ced5bb47,packages/react-reconciler/src/ReactFiberRoot.js,"Malformed data types (`commitDetails`, `intera...","/**\n * Copyright (c) Facebook, Inc. and its a...",0
4,9bd4d1fae21a6521c185cb114a15ca5dc74d6d9b,packages/react-reconciler/src/ReactFiberUnwind...,"Malformed data types (`commitDetails`, `intera...","/**\n * Copyright (c) 2013-present, Facebook, ...",0


In [30]:
data = df
# Sanity check over the WHOLE frame: rows that share the same
# (query, commit_id, file_path) key must all carry the same label.
# A vectorised groupby avoids both a per-row scan of the frame and the risk of
# validating only the first row.
labels_per_key = data.groupby(['query', 'commit_id', 'file_path'])['label'].nunique()
conflicting_keys = labels_per_key[labels_per_key > 1]
assert conflicting_keys.empty, (
    f'{len(conflicting_keys)} (query, commit_id, file_path) keys have conflicting labels'
)


  0%|          | 0/6750 [00:00<?, ?it/s]


In [20]:
# Preview of `df`. NOTE(review): `df` is not assigned in any visible cell.
df.head()

Unnamed: 0,query,file_path,passage,label
0,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,flushPassiveEffects();\n return null;...,0
1,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,Time;\n if (childUpdateExpirationTime > n...,0
2,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,{\n let msUntilTimeout = computeMsUn...,0
3,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,root. Don't need to schedule a ping because\n...,0
4,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,if (workInProgress!== null) {\n // Ther...,0


In [16]:
# Column dtypes, non-null counts and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286532 entries, 0 to 286531
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   query      286532 non-null  object
 1   file_path  286532 non-null  object
 2   passage    286532 non-null  object
 3   label      286532 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 8.7+ MB


In [19]:
# Label distribution of `df`.
df.label.value_counts()

label
0    225310
1     61222
Name: count, dtype: int64

In [18]:
# Show the first 39 rows of `df`.
df.head(39)

Unnamed: 0,query,file_path,passage,label
0,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,flushPassiveEffects();\n return null;...,0
1,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,Time;\n if (childUpdateExpirationTime > n...,0
2,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,{\n let msUntilTimeout = computeMsUn...,0
3,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,root. Don't need to schedule a ping because\n...,0
4,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,if (workInProgress!== null) {\n // Ther...,0
5,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,);\n return null;\n ...,0
6,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,"function resolveLocksOnRoot(root: FiberRoot, e...",0
7,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,", try rendering\n // at the lower prior...",0
8,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,{\n ReactStrictModeWarnings.discardPending...,0
9,"Malformed data types (`commitDetails`, `intera...",packages/react-reconciler/src/ReactFiberSchedu...,"flushRoot(root: FiberRoot, expirationTime: Ex...",0


In [12]:
# Distribution of the label column.
# NOTE(review): `df` is not assigned in any visible cell, and the counts recorded
# under this cell differ from the other `df.label.value_counts()` cell, so `df`
# was bound to a different frame when each of them ran.
df['label'].value_counts()


label
0    5000
1    1750
Name: count, dtype: int64

In [7]:
class BERTCodeReranker:
    """Second-stage reranker that scores source-file passages against a query.

    For each aggregated search result, the most recent contributing file version
    is looked up in a dataframe of commits and cut into token windows. Every
    window is scored by a single-output sequence-classification model. The window
    scores of a file are combined with the configured aggregation strategy, and
    the results are sorted by that combined score.
    """

    def __init__(self, parameters, combined_df=None):
        """Load the model and tokenizer and read the reranking settings.

        Args:
            parameters: Dict with keys ``model_name``, ``use_gpu``, ``psg_len``,
                ``psg_cnt``, ``aggregation_strategy``, ``batch_size`` and
                ``rerank_depth``. ``psg_stride`` is optional and defaults to
                ``psg_len``.
            combined_df: Optional dataframe with ``commit_id``, ``file_path`` and
                ``cur_file_content`` columns, used to fetch file contents. When
                omitted, the module-level ``combined_df`` is looked up at rerank
                time.
        """
        self.parameters = parameters
        self.combined_df = combined_df
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")

        self.psg_len = parameters['psg_len']  # tokens of file content per passage
        self.psg_cnt = parameters['psg_cnt']  # max passages scored per file
        self.psg_stride = parameters.get('psg_stride', self.psg_len)  # step between passage starts
        self.aggregation_strategy = parameters['aggregation_strategy']  # how passage scores are combined per file
        self.batch_size = parameters['batch_size']  # batch size used when scoring
        self.rerank_depth = parameters['rerank_depth']  # how many top results rerank_pipeline rescored
        self.max_seq_length = self.tokenizer.model_max_length  # max sequence length for the model

        print(f"Initialized Code File BERT reranker with parameters: {parameters}")

    def rerank(self, query, aggregated_results):
        """Rescore and re-sort aggregated search results with the model.

        Args:
            query: The issue query string.
            aggregated_results: List of aggregated search results; each needs
                ``contributing_results`` and receives a new ``score``. The list
                is sorted in place.

        Returns:
            ``aggregated_results`` sorted by the new score, highest first. If no
            passages could be built, the list is returned unchanged.
        """
        # Local import: the notebook's import cell does not bring these names in.
        from torch.utils.data import DataLoader, TensorDataset

        self.model.eval()

        query_passage_pairs, per_result_contribution = self.split_into_query_passage_pairs(query, aggregated_results)

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # Tokenize every (query, passage) pair, padded/truncated to the model maximum.
        encoded_pairs = [
            self.tokenizer.encode_plus([pair_query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True)
            for pair_query, passage in query_passage_pairs
        ]

        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0)  # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0)  # type: ignore

        dataset = TensorDataset(input_ids, attention_masks)
        # shuffle=False is essential: scores are mapped back to results by position.
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        scores = self.get_scores(dataloader, self.model)

        score_index = 0
        # Hand each result the slice of scores that belongs to its passages.
        for i, agg_result in enumerate(aggregated_results):
            assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
            end_index = score_index + per_result_contribution[i]
            cur_passage_scores = scores[score_index:end_index]
            score_index = end_index

            agg_result.score = self.aggregate_scores(cur_passage_scores)

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        """Run ``model`` over ``dataloader`` without gradients and collect one score per row.

        Args:
            dataloader: Yields ``(input_ids, attention_mask)`` batches.
            model: Model whose output exposes ``logits`` with a trailing dimension of 1.

        Returns:
            Flat list of scores in dataloader order.
        """
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to the configured device.
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

    def aggregate_scores(self, passage_scores):
        """Combine the passage scores of one file into a single score.

        Supported strategies: ``firstp`` (first passage), ``maxp`` (maximum),
        ``avgp`` (mean) and ``sump`` (sum). An empty input yields ``0.0``.

        Raises:
            ValueError: If ``self.aggregation_strategy`` is not one of the above.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def split_into_query_passage_pairs(self, query, aggregated_results):
        """Build at most ``psg_cnt`` (query, passage) pairs per aggregated result.

        For each result, its contributing results are sorted in place by
        ``commit_date`` (newest first). The newest one's file content is fetched
        from the commit dataframe and windowed into ``psg_len``-token passages,
        one every ``psg_stride`` tokens. Each passage is prefixed with the
        tokenized file path.

        Returns:
            Tuple ``(query_passage_pairs, per_result_contribution)``. The second
            item holds, per aggregated result and in the same order, the number
            of pairs that result contributed.
        """
        def full_tokenize(s):
            return self.tokenizer.encode_plus(s, max_length=None, truncation=False, return_tensors='pt', add_special_tokens=True, return_attention_mask=False, return_token_type_ids=False)['input_ids'].squeeze().tolist()

        content_df = getattr(self, 'combined_df', None)
        if content_df is None:
            # Backward-compatible fallback to the notebook-level dataframe.
            content_df = combined_df

        query_passage_pairs = []
        per_result_contribution = []
        for agg_result in aggregated_results:
            agg_result.contributing_results.sort(key=lambda res: res.commit_date, reverse=True)
            # Most recent file version of this result.
            most_recent_search_result = agg_result.contributing_results[0]
            file_path = most_recent_search_result.file_path
            commit_id = most_recent_search_result.commit_id
            file_content = content_df[(content_df['commit_id'] == commit_id) & (content_df['file_path'] == file_path)]['cur_file_content'].values[0]

            # These asserts do not catch missing content stored as NaN; that case
            # is handled explicitly below.
            assert file_content is not None, f'file_content is None for commit_id: {commit_id}, file_path: {file_path}'
            assert file_path is not None, f'file_path is None for commit_id: {commit_id}'
            assert query is not None, f'query is None'

            path_tokens = full_tokenize(file_path)

            if pd.isna(file_content):
                print(f'WARNING: file_content is NaN for commit_id: {commit_id}, file_path: {file_path}, setting file_content to empty string')
                file_content = ''

            file_tokens = full_tokenize(file_content)
            total_tokens = len(file_tokens)

            cur_result_passages = []
            for cur_start in range(0, total_tokens, self.psg_stride):
                # The query is not prepended here; rerank() pairs it with the
                # passage when encoding.
                cur_passage = []
                cur_passage.extend(path_tokens)
                cur_passage.extend(file_tokens[cur_start:cur_start + self.psg_len])

                cur_result_passages.append(self.tokenizer.decode(cur_passage))

                if len(cur_result_passages) == self.psg_cnt:
                    break

            per_result_contribution.append(len(cur_result_passages))
            query_passage_pairs.extend((query, passage) for passage in cur_result_passages)
        return query_passage_pairs, per_result_contribution

    def rerank_pipeline(self, query, aggregated_results):
        """Rerank the top ``rerank_depth`` results and keep the rest below them.

        Results beyond ``rerank_depth`` keep their relative order; their scores
        are overwritten so each sits strictly below the lowest reranked score.

        Returns:
            New list with the reranked head followed by the untouched tail, or
            the input list itself when it is empty.
        """
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results)
        min_top_score = reranked_results[-1].score
        # Push the tail below the reranked head while preserving its order.
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

In [9]:
class Args:
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)


# Settings read by the parameter dictionaries and setup cells of this notebook.
args = Args(
    index_path='../smalldata/ftr/index_commit_tokenized',
    repo_path='../smalldata/ftr',
    k=1000,
    n=100,
    model_path='microsoft/codebert-base',
    overwrite_cache=False,
    batch_size=32,
    num_epochs=10,
    learning_rate=5e-05,
    num_positives=10,
    num_negatives=10,
    train_depth=1000,
    num_workers=8,
    train_commits=1000,
    psg_cnt=25,
    aggregation_strategy='sump',
    use_gpu=True,
    rerank_depth=100,
    do_train=True,
    do_eval=True,
    eval_gold=True,
    openai_model='gpt4',
    overwrite_eval=False,
    sanity_check=True,
    debug=False,
    best_model_path=None,
    bert_best_model='../data/combined_commit_train/best_model',
    psg_len=350,
    psg_stride=300,
    ignore_gold_in_training=False,
    eval_folder='code_rerank',
    use_gpt_train=True,
)

In [5]:
# Evaluation setup: metric names, paths taken from `args`, the BM25 searcher and
# the evaluators, plus the gold query set.
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']
repo_path = args.repo_path
repo_name = repo_path.split('/')[-1]
index_path = args.index_path
K = args.k
n = args.n
# NOTE(review): this rebinds `combined_df`, which an earlier cell loads from
# `main_path`; which frame is live depends on cell execution order.
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'
eval_path = os.path.join(repo_path, 'eval')
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(eval_path, exist_ok=True)

# NOTE(review): BM25Searcher, SearchEvaluator and ModelEvaluator are not imported
# in any cell visible in this notebook.
bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

test_path = os.path.join('..', 'gold', 'facebook_react', 'v2_facebook_react_gpt4_gold.parquet')
gold_df = pd.read_parquet(test_path)

Loaded index at ../smalldata/ftr/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [6]:
# BM25-only baseline: no rerankers are passed, so this is the first-stage quality
# against which the reranked runs are compared.
bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=BM25_AGGR_STRAT)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)



  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:23<00:00,  4.33it/s]

BM25 Baseline Evaluation
{'MAP': 0.1542, 'P@10': 0.087, 'P@100': 0.0267, 'P@1000': 0.0041, 'MRR': 0.2133, 'Recall@100': 0.5077, 'Recall@1000': 0.6845}





In [7]:
# Settings for the commit-message reranker. Compared with `args`, psg_cnt and
# rerank_depth are overridden (5 and 250) and no passage length/stride is set.
bert_params = dict(
    model_name=args.model_path,
    psg_cnt=5,
    aggregation_strategy=args.aggregation_strategy,
    batch_size=args.batch_size,
    use_gpu=args.use_gpu,
    rerank_depth=250,
    num_epochs=args.num_epochs,
    lr=args.learning_rate,
    num_positives=args.num_positives,
    num_negatives=args.num_negatives,
    train_depth=args.train_depth,
    num_workers=args.num_workers,
    train_commits=args.train_commits,
    bm25_aggr_strategy='sump',
)

In [8]:
# NOTE(review): `bert_model_path` is not used in this cell; the weights below are
# loaded from `args.bert_best_model`, which points at a different relative path.
bert_model_path = os.path.join('data', 'combined_commit_train', 'best_model')
# NOTE(review): BERTReranker is not imported in any cell visible in this notebook.
bert_reranker = BERTReranker(bert_params)
# Swap in the checkpoint at args.bert_best_model and move it to the reranker's device.
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained(args.bert_best_model, num_labels=1, problem_type='regression')
# As the last expression of the cell, this also displays the model repr.
bert_reranker.model.to(bert_reranker.device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump'}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 953.46 MB
Initialized Code File BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 25, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 100, 'num_epochs': 10, 'lr': 5e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'sump', 'psg_len': 350, 'psg_stride': 250}


In [10]:
# Rerankers handed to the evaluator: the commit-message reranker first, then the code reranker.
rerankers = [bert_reranker, code_reranker]

In [14]:
# Evaluate the reranker chain on the gold set.
# NOTE(review): `gold_df.iloc[1:2]` restricts the run to a single gold row, which
# looks like a debugging slice — confirm before reporting the numbers.
bert_gold_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], rerankers=rerankers, gold_df=gold_df.iloc[1:2])

print("BERT Gold Evaluation")
print(bert_gold_eval)

Found gold_df, evaluating on 1 commits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 1 to 1
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   commit_id                 1 non-null      string
 1   commit_date               1 non-null      int64 
 2   commit_message            1 non-null      string
 3   actual_files_modified     1 non-null      object
 4   transformed_message_gpt4  1 non-null      object
dtypes: int64(1), object(2), string(2)
memory usage: 172.0+ bytes
None


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# Small five-row sample of the combined dataframe for inspection.
subdf = combined_df.head(5)

In [17]:
# Display the sample.
subdf.head()

Unnamed: 0,owner,repo_name,commit_date,commit_id,commit_message,file_path,previous_commit_id,previous_file_content,cur_file_content,diff,status,is_merge_request,file_extension
0,facebook,react,1696522497,dddfe688206dafa5646550d351eb9a8e9c53654a,pull implementations from the right react-dom ...,packages/react-dom/server-rendering-stub.js,546178f9109424f6a0176ea8702a7620c4417569,"/**  * Copyright (c) Meta Platforms, Inc. and ...","/**  * Copyright (c) Meta Platforms, Inc. and ...","@@ -30,7 +30,10 @@ export {  } from './src/ser...",modified,False,js
1,facebook,react,1696521194,546178f9109424f6a0176ea8702a7620c4417569,`react-dom/server-rendering-stub`: restore exp...,packages/react-dom/server-rendering-stub.js,16619f106ab5ba8e6aca19d55be46cce22e4a7ff,"/**  * Copyright (c) Meta Platforms, Inc. and ...","/**  * Copyright (c) Meta Platforms, Inc. and ...","@@ -28,3 +28,30 @@ export {  useFormState,  ...",modified,False,js
2,facebook,react,1696452492,0fba3ecf73900a1b54ed6d3b0617462ac92d2fef,[Fizz] Reset error component stack and fix err...,packages/react-dom/src/__tests__/ReactDOMFizzS...,6f132439578ee11e04b41a278df51c52b0dc8563,"/**  * Copyright (c) Meta Platforms, Inc. and ...","/**  * Copyright (c) Meta Platforms, Inc. and ...","@@ -981,4 +981,149 @@ describe('ReactDOMFizzSt...",modified,False,js
3,facebook,react,1696452492,0fba3ecf73900a1b54ed6d3b0617462ac92d2fef,[Fizz] Reset error component stack and fix err...,packages/react-server/src/ReactFizzServer.js,6f132439578ee11e04b41a278df51c52b0dc8563,"/**  * Copyright (c) Meta Platforms, Inc. and ...","/**  * Copyright (c) Meta Platforms, Inc. and ...","@@ -1110,7 +1110,6 @@ function replaySuspenseB...",modified,False,js
4,facebook,react,1696450581,6f132439578ee11e04b41a278df51c52b0dc8563,Move ReactCurrentDispatcher back to shared int...,packages/react-server/src/ReactFlightServer.js,ca237d6f0ab986e799f192224d3066f76d66b73b,"/**  * Copyright (c) Meta Platforms, Inc. and ...","/**  * Copyright (c) Meta Platforms, Inc. and ...","@@ -108,6 +108,7 @@ import {  } from 'shared/R...",modified,False,js


In [22]:
# Take the current file content of the first sampled row.
code = subdf.iloc[0].cur_file_content