In [1]:
import sys
sys.path.append('src')

In [2]:
import os

# import pickle
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.init as init
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
from utils import (
    AggregatedSearchResult,
    TripletDataset,
    get_combined_df,
    prepare_triplet_data_from_df,
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
set_seed(42)

In [4]:
# print torch devices available
print('Available devices: ', torch.cuda.device_count())
print('Current cuda device: ', torch.cuda.current_device())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Available devices:  1
Current cuda device:  0
Quadro RTX 6000


In [47]:
# V1
# # class BERTReranker:
#     def __init__(self, parameters):
#         self.parameters = parameters
#         self.model_name = parameters['model_name']
#         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
#         # self.model = AutoModel.from_pretrained(self.model_name, num_labels=1)
#         self.model = AutoModel.from_pretrained(self.model_name)
#         # self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1)
#         self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
#         self.model.to(self.device)

#         print(f'Using device: {self.device}')

#         if torch.cuda.is_available() and parameters['use_gpu']:
#             # print GPU info
#             print(f"Using GPU: {torch.cuda.get_device_name(0)}")
#             print(f'GPU Device Count: {torch.cuda.device_count()}')
#             print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


#         self.psg_len = parameters['psg_len']
#         self.psg_cnt = parameters['psg_cnt']
#         # self.psg_stride = parameters.get('psg_stride', self.psg_len)
#         self.aggregation_strategy = parameters['aggregation_strategy']
#         self.batch_size = parameters['batch_size']
#         # self.max_title_len = parameters.get('max_title_len', 0)
#         # self.use_title = self.max_title_len > 0
#         self.rerank_depth = parameters['rerank_depth']
#         # self.max_seq_length = parameters.get('max_seq_length', 512)
#         self.max_seq_length = self.tokenizer.model_max_length

#         print(f"Initialized BERT reranker with parameters: {parameters}")

#         # input_dim = parameters['INPUT_DIM']  # Default BERT hidden size
#         # hidden_dim = parameters['HIDDEN_DIM']   # Example hidden size
#         # output_dim = parameters['OUTPUT_DIM']  # We want a single score as output

#         self.mlp = MLP(self.model.config.hidden_size, parameters['hidden_dim'], 1, parameters['dropout_prob']).to(self.device)

#     def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
#         """
#         Rerank the BM25 aggregated search results using BERT model scores.

#         query: The issue query string.
#         aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
#         """
#         # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
#         # print(f'Reranking {len(aggregated_results)} results')

#         # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
#         query_passage_pairs = []
#         for agg_result in aggregated_results:
#             query_passage_pairs.extend(
#                 (query, result.commit_msg)
#                 for result in agg_result.contributing_results[: self.psg_cnt]
#             )

#         if not query_passage_pairs:
#             print('WARNING: No query passage pairs to rerank')
#             print(query, aggregated_results, self.psg_cnt)
#             return aggregated_results

#         # tokenize the query passage pairs
#         encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

#         # create tensors for the input ids, attention masks
#         input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
#         attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

#         # Create a dataloader for feeding the data to the model
#         dataset = TensorDataset(input_ids, attention_masks)
#         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

#         scores = self.get_scores(dataloader, self.model)

#         score_index = 0
#         # Now assign the scores to the aggregated results by mapping the scores to the contributing results
#         for agg_result in aggregated_results:
#             # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
#             assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
#             end_index = score_index + len(agg_result.contributing_results[: self.psg_cnt])
#             cur_passage_scores = scores[score_index:end_index]
#             score_index = end_index


#             # Aggregate the scores for the current aggregated result
#             agg_score = self.aggregate_scores(cur_passage_scores)
#             agg_result.score = agg_score  # Assign the aggregated score

#         assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

#         # Sort by the new aggregated score
#         aggregated_results.sort(key=lambda res: res.score, reverse=True)

#         return aggregated_results

#     def get_scores(self, dataloader, model):
#         scores = []
#         with torch.no_grad():
#             for batch in dataloader:
#                 # Unpack the batch and move it to GPU
#                 b_input_ids, b_attention_mask = batch
#                 b_input_ids = b_input_ids.to(self.device)
#                 b_attention_mask = b_attention_mask.to(self.device)

#                 # Get the pooled output from BERT's [CLS] token
#                 # pooled_output = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output

#                 cls_output = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

#                 # # Pass the pooled output through the MLP to get the scores
#                 # logits = self.mlp(pooled_output).squeeze(-1) # type: ignore
#                 logits = self.mlp(cls_output).squeeze(-1) # type: ignore

#                 # # Collect the scores (detach them from the computation graph and move to CPU)
#                 scores.extend(logits.detach().cpu().numpy())


#                 # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
#                 # logits = outputs.logits
#                 # scores.extend(logits.detach().cpu().numpy().squeeze(-1))

#         return scores

#     def aggregate_scores(self, passage_scores):
#         """
#         Aggregate passage scores based on the specified strategy.
#         """
#         if len(passage_scores) == 0:
#             return 0.0


#         if self.aggregation_strategy == 'firstp':
#             return passage_scores[0]
#         if self.aggregation_strategy == 'maxp':
#             return max(passage_scores)
#         if self.aggregation_strategy == 'avgp':
#             return sum(passage_scores) / len(passage_scores)
#         if self.aggregation_strategy == 'sump':
#             return sum(passage_scores)
#         # else:
#         raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

#     def rerank_pipeline(self, query, aggregated_results):
#         if len(aggregated_results) == 0:
#             return aggregated_results
#         top_results = aggregated_results[:self.rerank_depth]
#         bottom_results = aggregated_results[self.rerank_depth:]
#         reranked_results = self.rerank(query, top_results)
#         min_top_score = reranked_results[-1].score
#         # now adjust the scores of bottom_results
#         for i, result in enumerate(bottom_results):
#             result.score = min_top_score - i - 1
#         # combine the results
#         reranked_results.extend(bottom_results)
#         assert(len(reranked_results) == len(aggregated_results))
#         return reranked_results

In [5]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    # repo_path='smalldata/ftr',
    # index_path='smalldata/ftr/index_commit_tokenized/',
    repo_path='2_7/apache_spark/',
    index_path='2_7/apache_spark/index_commit_tokenized/',
    k=1000,
    n=100,
    overwrite_cache=False,
    freeze_bert=False,
)

In [6]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']
repo_path = args.repo_path
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'

In [7]:
eval_path = os.path.join(repo_path, 'eval')
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at 2_7/apache_spark/index_commit_tokenized/
Index Stats: {'total_terms': 58140950, 'documents': 188006, 'non_empty_documents': 188006, 'unique_terms': 24952}


In [10]:
bm25_output_path = os.path.join(eval_path, f'bm25_baseline_N{n}_K{K}_metrics.txt')
print(f'BM25 output path: {bm25_output_path}')

bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=bm25_output_path, aggregation_strategy=BM25_AGGR_STRAT, repo_path=repo_path)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

BM25 output path: ../smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:23<00:00,  4.29it/s]

Evaluation results written to ../smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt
BM25 Baseline Evaluation
{'MAP': 0.1542, 'P@10': 0.087, 'P@100': 0.0267, 'P@1000': 0.0041, 'MRR': 0.2133, 'Recall@100': 0.5077, 'Recall@1000': 0.6845}





In [8]:
# Reranking with BERT
params = {
    'model_name': 'microsoft/codebert-base',
    'psg_len': 400,
    'psg_cnt': 5,
    # 'psg_stride': 32,
    'aggregation_strategy': 'sump',
    'batch_size': 16,
    # 'batch_size': 512,
    # 'batch_size': 1,
    'use_gpu': True,
    'rerank_depth': 250,
    'num_epochs': 3,
    # 'mlp_lr': 1e-2,
    'mlp_lr': 1e-3,
    'bert_lr': 5e-5,
    'hidden_dim': 128,
    'num_positives': 10,
    'num_negatives': 10,
    'train_depth': 1000,
    'num_workers': 8,
    'weight_decay': 0.01,
    'dropout_prob': 0.5,
    'train_commits': 1500,
}


In [9]:
# get average length of commit messages
combined_df['commit_message'].str.split().str.len().mean()

60.66661695926252

In [9]:
filtered_df = combined_df[['commit_date', 'commit_message', 'commit_id', 'file_path', 'diff']]

# Step 2: Group by commit_id
grouped_df = filtered_df.groupby(['commit_id', 'commit_date', 'commit_message'])['file_path'].apply(list).reset_index()
grouped_df.rename(columns={'file_path': 'actual_files_modified'}, inplace=True)

# Step 3: Determine midpoint and filter dataframe
midpoint_date = np.median(grouped_df['commit_date'])
recent_df = grouped_df[grouped_df['commit_date'] > midpoint_date]
print(f'Number of commits after midpoint date: {len(recent_df)}')

# Step 4: Filter out commits with less than average length commit messages
average_commit_len = recent_df['commit_message'].str.split().str.len().mean()
# filter out commits with less than average length
recent_df = recent_df[recent_df['commit_message'].str.split().str.len() > average_commit_len]
print(f'Number of commits after filtering by commit message length: {len(recent_df)}')

# Step 5: randomly sample 1500 rows from recent_df
recent_df = recent_df.sample(params['train_commits'])
print(f'Number of commits after sampling: {len(recent_df)}')


Number of commits after midpoint date: 16839
Number of commits after filtering by commit message length: 5280
Number of commits after sampling: 1500


In [10]:
# prepare data first
if not os.path.exists(os.path.join(repo_path, 'cache')):
    os.makedirs(os.path.join(repo_path, 'cache'))
triplet_cache = os.path.join(repo_path, 'cache', 'triplet_data_cache.pkl')

In [26]:
# def temp_prep(df, searcher, search_depth, num_positives, num_negatives):

#     data = []
#     print(f'Preparing data from dataframe of size: {len(df)} with search_depth: {search_depth}')
#     total_positives, total_negatives = 0, 0
#     for _, row in df.iterrows():
#     # for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):

#         cur_positives = 0
#         cur_negatives = 0
#         pos_commit_ids = set()
#         neg_commit_ids = set()
#         commit_message = row['commit_message']
#         actual_files_modified = row['actual_files_modified']

#         agg_search_results = searcher.pipeline(commit_message, row['commit_date'], search_depth, 'sump', aggregate_on='commit')

#         # for each agg_result, find out how many files it has edited are in actual_files_modified and sort by score

#         for agg_result in agg_search_results:
#             agg_result_files = set([result.file_path for result in agg_result.contributing_results])
#             intersection = agg_result_files.intersection(actual_files_modified)
#             # TODO maybe try this for training
#             agg_result.score = len(intersection) / len(agg_result_files) # how focused the commit is
#             # agg_result.score = len(intersection)

#         agg_search_results.sort(key=lambda res: res.score, reverse=True)

#         # go from top to bottom, first num_positives non-0 scores are positive samples and the next num_negatives are negative samples
#         for agg_result in agg_search_results:
#             cur_commit_msg = agg_result.contributing_results[0].commit_message
#             if cur_positives < num_positives and agg_result.score > 0:
#                 # meaning there is at least one file in the agg_result that is in actual_files_modified
#                 # pos_commits.append(agg_result)
#                 data.append((commit_message, cur_commit_msg, 1))
#                 cur_positives += 1
#                 pos_commit_ids.add(agg_result.commit_id)
#             elif cur_negatives < num_negatives:
#                 # neg_commits.append(agg_result)
#                 data.append((commit_message, cur_commit_msg, 0))
#                 cur_negatives += 1
#                 neg_commit_ids.add(agg_result.commit_id)
#             if cur_positives == num_positives and cur_negatives == num_negatives:
#                 break

#         assert len(pos_commit_ids.intersection(neg_commit_ids)) == 0, 'Positive and negative commit ids should not intersect'
#         # print(f"Total positives: {cur_positives}, Total negatives: {cur_negatives}")
#         total_positives += cur_positives
#         total_negatives += cur_negatives

#     # # Write data to cache file
#     # with open(cache_file, 'wb') as file:
#     #     pickle.dump(data, file)
#     #     print(f"Saved data to cache file: {cache_file}")


#     # print percentage of positives and negatives
#     denom = total_positives + total_negatives
#     print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
#     return data

Preparing data from dataframe of size: 2 with search_depth: 1000
Percentage of positives: 0.2, Percentage of negatives: 0.8


In [11]:
triplet_data = prepare_triplet_data_from_df(recent_df, bm25_searcher, search_depth=params['train_depth'], num_positives=params['num_positives'], num_negatives=params['num_negatives'], cache_file=triplet_cache, overwrite=args.overwrite_cache)

Loading data from cache file: 2_7/apache_spark/cache/triplet_data_cache.pkl


In [12]:
# see distribution of labels
triplet_data['label'].value_counts()

label
0    14973
1     7670
Name: count, dtype: int64

In [13]:
def sanity_check(data):
    # check that there no (query, passage) pairs with both labels 0 and 1
    # for i, row in data.iterrows():
    for i, row in tqdm(data.iterrows(), total=len(data)):
        if row['label'] == 0:
            assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 0
        else:
            assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 1

sanity_check(triplet_data)

 91%|█████████ | 20533/22643 [01:44<00:10, 196.27it/s]


AssertionError: 

In [18]:
def test():
    problem_query = triplet_data.iloc[20533]['query']
    # find all passages with this query and label 0
    passages0 = triplet_data[(triplet_data['query'] == problem_query) & (triplet_data['label'] == 0)]['passage'].values
    # find all passages with this query and label 1
    passages1 = triplet_data[(triplet_data['query'] == problem_query) & (triplet_data['label'] == 1)]['passage'].values

    # print common passages
    print(set(passages0).intersection(set(passages1)))
    # check that there is no intersection between the two sets
    assert len(set(passages0).intersection(set(passages1))) == 0

test()

{"[SPARK-25496][SQL] Deprecate from_utc_timestamp and to_utc_timestamp\n\n## What changes were proposed in this pull request?\n\nIn the PR, I propose to deprecate the `from_utc_timestamp()` and `to_utc_timestamp`, and disable them by default. The functions can be enabled back via the SQL config `spark.sql.legacy.utcTimestampFunc.enabled`. By default, any calls of the functions throw an analysis exception.\n\nOne of the reason for deprecation is functions violate semantic of `TimestampType` which is number of microseconds since epoch in UTC time zone. Shifting microseconds since epoch by time zone offset doesn't make sense because the result doesn't represent microseconds since epoch in UTC time zone any more, and cannot be considered as `TimestampType`.\n\n## How was this patch tested?\n\nThe changes were tested by `DateExpressionsSuite` and `DateFunctionsSuite`.\n\nCloses #24195 from MaxGekk/conv-utc-timestamp-deprecate.\n\nLead-authored-by: Maxim Gekk <max.gekk@gmail.com>\nCo-authore

AssertionError: 

In [None]:
# find all rows

In [19]:
def sanity_check(data):
    problems = 0
    for i, row in tqdm(data.iterrows(), total=len(data)):
        try:
            if row['label'] == 0:
                assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 0
            else:
                assert data[(data['query'] == row['query']) & (data['passage'] == row['passage'])]['label'].values[0] == 1
        except AssertionError:
            print(f"Assertion failed at index {i}: {row}")
            # break  # Optional: break after the first failure, remove if you want to see all failures
            # remove the row with label 0

            if row['label'] == 0:
                problems += 1
                data.drop(i, inplace=True)
                print(f"Dropped row at index {i}")

    print(f"Total number of problems: {problems}")
    return data
# Call the function with your data
triplet_data = sanity_check(triplet_data)

 91%|█████████ | 20557/22643 [01:49<00:11, 185.47it/s]

Assertion failed at index 20533: query      [SPARK-31318][SQL] Split Parquet/Avro configs ...
passage    [SPARK-25496][SQL] Deprecate from_utc_timestam...
label                                                      0
Name: 20533, dtype: object
Dropped row at index 20533


100%|██████████| 22643/22643 [02:00<00:00, 187.45it/s]

Total number of problems: 1





In [22]:
X, y = triplet_data[['query', 'passage']], triplet_data['label']

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [40]:
class TripletDataset(Dataset):
    def __init__(self, data, labels, tokenizer, max_len):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        query = str(self.data.iloc[index]['query'])
        passage = str(self.data.iloc[index]['passage'])
        label = self.labels.iloc[index]

        # encode_plus returns a dictionary with the following keys
        # ['input_ids', 'token_type_ids', 'attention_mask']
        encoded_pair = self.tokenizer.encode_plus([query, passage], max_length=self.max_len, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True)
        input_ids = encoded_pair['input_ids'].squeeze()
        # token_type_ids = encoded_pair['token_type_ids'].squeeze()
        attention_mask = encoded_pair['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            # 'token_type_ids': token_type_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.float)
        }

In [45]:
train_dataset = TripletDataset(X_train, y_train, bert_reranker.tokenizer, bert_reranker.max_seq_length)
val_dataset = TripletDataset(X_val, y_val, bert_reranker.tokenizer, bert_reranker.max_seq_length)

## Reranking without training

In [16]:
class BERTReranker:
    def __init__(self, parameters):
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, problem_type='regression')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        # print GPU info
        if torch.cuda.is_available() and parameters['use_gpu']:
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


        # self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt'] # how many contributing_results to use per file for reranking
        # self.psg_stride = parameters.get('psg_stride', self.psg_len)
        self.aggregation_strategy = parameters['aggregation_strategy'] # how to aggregate the scores of the psg_cnt contributing_results
        self.batch_size = parameters['batch_size'] # batch size for reranking efficiently
        self.rerank_depth = parameters['rerank_depth']
        self.max_seq_length = self.tokenizer.model_max_length # max sequence length for the model

        print(f"Initialized BERT reranker with parameters: {parameters}")


    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        """
        Rerank the BM25 aggregated search results using BERT model scores.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
        """
        # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
        # print(f'Reranking {len(aggregated_results)} results')

        self.model.eval()

        # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
        query_passage_pairs = []
        for agg_result in aggregated_results:
            query_passage_pairs.extend(
                (query, result.commit_message)
                for result in agg_result.contributing_results[: self.psg_cnt]
            )

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank, returning original results from previous stage')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # Create a dataloader for feeding the data to the model
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False) # shuffle=False very important for reconstructing the results back into the original order

        scores = self.get_scores(dataloader, self.model)

        score_index = 0
        # Now assign the scores to the aggregated results by mapping the scores to the contributing results
        for agg_result in aggregated_results:
            # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
            assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
            end_index = score_index + len(agg_result.contributing_results[: self.psg_cnt]) # only use psg_cnt contributing_results
            cur_passage_scores = scores[score_index:end_index]
            score_index = end_index


            # Aggregate the scores for the current aggregated result
            agg_score = self.aggregate_scores(cur_passage_scores)
            agg_result.score = agg_score  # Assign the aggregated score

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to GPU
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get scores from the model
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                scores.extend(outputs.logits.detach().cpu().numpy().squeeze(-1))
        return scores

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.
        """
        if len(passage_scores) == 0:
            return 0.0

        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        # else:
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def rerank_pipeline(self, query, aggregated_results):
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results)
        min_top_score = reranked_results[-1].score
        # now adjust the scores of bottom_results
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        # combine the results
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results




    def train_model(self, train_hf_dataset, val_hf_dataset, train_args, save_dir=None):
        train_hf_dataset = train_dataset.map(tokenize_hf, batched=True)

In [17]:
bert_reranker = BERTReranker(params)
rerankers = [bert_reranker]
save_model_name = params['model_name'].replace('/', '_')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_len': 400, 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 16, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 3, 'mlp_lr': 0.001, 'bert_lr': 5e-05, 'hidden_dim': 128, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'weight_decay': 0.01, 'dropout_prob': 0.5, 'train_commits': 1500}


In [69]:
def test1():
    # get a sample commit from combined_df
    random_commit = combined_df.sample(1, random_state=6456)
    actual_files_modified = combined_df[combined_df['commit_id'] == random_commit['commit_id'].values[0]]['file_path'].values
    print(random_commit['commit_message'].values[0])

    bm25_results = bm25_searcher.pipeline(random_commit['commit_message'].values[0], random_commit['commit_date'].values[0], args.k, 'sump')
    baseline = evaluator.evaluate(bm25_results, actual_files_modified)
    print(f'BM25 baseline: {baseline}')

    # rerank with BERT
    bert_results = bert_reranker.rerank_pipeline(random_commit['commit_message'].values[0], bm25_results)
    bert_eval = evaluator.evaluate(bert_results, actual_files_modified)
    print(f'BERT reranker: {bert_eval}')

test1()

Merge pull request #1362 from spicyj/cb-queue

Make MountReady more reusable, reduce allocations

BM25 baseline: {'MAP': 0.3271, 'P@10': 0.1, 'P@100': 0.04, 'P@1000': 0.004, 'MRR': 1.0, 'Recall@100': 1.0, 'Recall@1000': 1.0}
BERT reranker: {'MAP': 0.306, 'P@10': 0.1, 'P@100': 0.04, 'P@1000': 0.004, 'MRR': 1.0, 'Recall@100': 1.0, 'Recall@1000': 1.0}


In [16]:
# BM25 Baseline
# {'MAP': 0.1542, 'P@10': 0.087, 'P@100': 0.0267, 'P@1000': 0.0041, 'MRR': 0.2133, 'Recall@100': 0.5077, 'Recall@1000': 0.6845}

# BERT reranker with training
# {'MAP': 0.0842, 'P@10': 0.054, 'P@100': 0.0231, 'P@1000': 0.0041, 'MRR': 0.1404, 'Recall@100': 0.4266, 'Recall@1000': 0.6845}

# BERT reranker with training on 1 epoch
# {'MAP': 0.1471, 'P@10': 0.071, 'P@100': 0.0266, 'P@1000': 0.0041, 'MRR': 0.2243, 'Recall@100': 0.4841, 'Recall@1000': 0.6845}

# BERT reranker with training on 10 epochs
# {MAP: 0.1934, P@10: 0.104, P@100: 0.0282, P@1000: 0.0041, MRR: 0.2704, Recall@100: 0.499, Recall@1000: 0.6845}

In [66]:
bert_without_training = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], repo_path=repo_path, rerankers=rerankers)
print(f'Results without training: {bert_without_training}')



100%|██████████| 100/100 [08:58<00:00,  5.39s/it]

Evaluation results written to smalldata/ftr/BM25Searcher_results.txt
Results without training: {'MAP': 0.1471, 'P@10': 0.071, 'P@100': 0.0266, 'P@1000': 0.0041, 'MRR': 0.2243, 'Recall@100': 0.4841, 'Recall@1000': 0.6845}





## Reranking with training

In [None]:
def train_reranker(bertranker, train_dataloader, validation_dataloader, freeze_bert, save_dir):
    save_dir = os.path.join(save_dir, 'models')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Set BERT parameters to not require gradients
    # for param in bertranker.model.parameters():
    #     param.requires_grad = False if freeze_bert else True


    # if freeze_bert:
    #     optimizer = torch.optim.Adam(bertranker.mlp.parameters(), lr=bertranker.parameters['mlp_lr'], weight_decay=bertranker.parameters['weight_decay'])
    # else:
    #     optimizer = torch.optim.Adam([
    #         {'params': bertranker.model.parameters(), 'lr': bertranker.parameters['bert_lr'], 'weight_decay': bertranker.parameters['weight_decay']},
    #         {'params': bertranker.mlp.parameters(), 'lr': bertranker.parameters['mlp_lr'], 'weight_decay': bertranker.parameters['weight_decay']}
    #             ], lr=bertranker.parameters['mlp_lr'])

    optimizer = torch.optim.Adam(bertranker.model.parameters(), lr=bertranker.parameters['bert_lr'])

    # one optimizer for both BERT and MLP with same learning rate


    print(f'Optimizer: {optimizer}')

    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)
    # Set up the loss function
    criterion = nn.BCEWithLogitsLoss()  #

    # Set up training variables
    num_epochs = bertranker.parameters['num_epochs']
    # print train and val dataloader sizes
    print(f'Train dataloader size: {len(train_dataloader)}')
    print(f'Val dataloader size: {len(validation_dataloader)}')
    # Training loop
    print('Starting training loop')

    if freeze_bert:
        print('BERT is frozen, training only MLP')
    else:
        print('BERT is unfrozen, training BERT and MLP')
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    # model_name = 'bert_reranker_frozen' if freeze_bert else 'bert_reranker'
    model_name = bertranker.parameters['model_name'].replace('/', '_') + '_frozen' if freeze_bert else bertranker.parameters['model_name'].replace('/', '_')
    model_name += '_frozen' if freeze_bert else ''
    print(f'Model name: {model_name}')
    # for epoch in range(epochs):
    for epoch in tqdm(range(num_epochs)):
        # self.model.eval()  # Make sure the BERT model is in evaluation mode
        # if freeze_bert:
        #     bertranker.model.eval()  # BERT finetuning should be in eval mode
        # else:
        #     bertranker.model.train()  # BERT finetuning should be in train mode

        bertranker.model.train()  # BERT finetuning should be in train mode
        bertranker.mlp.train()  # MLP should be in training mode
        total_loss = 0

        for batch in train_dataloader:
            # breakpoint()
            b_input_ids, b_attention_mask, b_labels = batch
            b_input_ids = b_input_ids.to(bertranker.device)
            b_attention_mask = b_attention_mask.to(bertranker.device)
            b_labels = b_labels.float().to(bertranker.device)

            # Forward pass
            if freeze_bert:
                with torch.no_grad():
                    # pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
                    cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

            else:
                pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
            cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

            logits = bertranker.mlp(cls_output).squeeze(-1) # type: ignore

            # outputs = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
            # logits = bertranker.mlp(outputs.logits).squeeze(-1) # type: ignore
            # logits = outputs.logits.squeeze(-1) # type: ignore
            # Compute loss
            loss = criterion(logits, b_labels)
            total_loss += loss.item()

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Calculate average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Validation step
        bertranker.model.eval()
        bertranker.mlp.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in validation_dataloader:
                b_input_ids, b_attention_mask, b_labels = batch
                b_input_ids = b_input_ids.to(bertranker.device)
                b_attention_mask = b_attention_mask.to(bertranker.device)
                b_labels = b_labels.float().to(bertranker.device)

                pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
                cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]
                logits = bertranker.mlp(cls_output).squeeze(-1) # type: ignore

                # outputs = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                # logits = outputs.logits.squeeze(-1) # type: ignore

                # Compute loss
                loss = criterion(logits, b_labels.float())
                total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # scheduler.step(avg_val_loss)
        # Save losses
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # Print progress
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Average training loss: {avg_train_loss}")
        print(f"Validation Loss: {avg_val_loss}")
        print(f'Best validation loss: {best_val_loss}')

        # save graph of losses
        plt.plot(train_losses, label='Training loss', color='blue', linestyle='dashed', linewidth=1, marker='o', markerfacecolor='blue', markersize=3)
        plt.plot(val_losses, label='Validation loss', color='red', linestyle='dashed', linewidth=1, marker='o', markerfacecolor='red', markersize=3)
        plt.legend(frameon=False)
        plt.savefig(os.path.join(save_dir, f'{model_name}_losses.png'))
        plt.close()

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # model name with frozen or unfrozen bert
            save_path = os.path.join(save_dir, f'{model_name}_best_model.pth')
            mlp_save_path = os.path.join(save_dir, f'{model_name}_best_mlp.pth')
            torch.save(bertranker.model.state_dict(), save_path)
            torch.save(bertranker.mlp.state_dict(), mlp_save_path)

            print(f"Model saved with validation loss: {best_val_loss}")

            # evaluate on train set



        # Here you can add early stopping based on validation loss

    print("Training complete!")

In [39]:
# import from_pandas method
from datasets import Dataset as HFDataset
from transformers import TrainingArguments, Trainer
import evaluate

In [25]:
# convert triplet_data to HuggingFace Dataset
triplet_data['label'] = triplet_data['label'].astype(float)
train_df, val_df = train_test_split(triplet_data, test_size=0.2, random_state=42, stratify=triplet_data['label'])
train_hf_dataset = HFDataset.from_pandas(train_df, split='train')
val_hf_dataset = HFDataset.from_pandas(val_df, split='validation')

In [None]:
# training methods
def tokenize_hf(example):
    return bert_reranker.tokenizer(example['query'], example['passage'], truncation=True, padding='max_length', max_length=bert_reranker.max_seq_length, return_tensors='pt', add_special_tokens=True)

In [27]:
tokenized_train_dataset = train_hf_dataset.map(tokenize_hf, batched=True)
tokenized_val_dataset = val_hf_dataset.map(tokenize_hf, batched=True)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Map: 100%|██████████| 17164/17164 [00:15<00:00, 1074.31 examples/s]
Map: 100%|██████████| 4292/4292 [00:03<00:00, 1093.68 examples/s]


In [28]:
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['query', 'passage'])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(['query', 'passage'])

# rename label column to labels
tokenized_train_dataset = tokenized_train_dataset.rename_column('label', 'labels')
tokenized_val_dataset = tokenized_val_dataset.rename_column('label', 'labels')

# set format to pytorch
tokenized_train_dataset = tokenized_train_dataset.with_format('torch')
tokenized_val_dataset = tokenized_val_dataset.with_format('torch')

In [29]:
tokenized_train_dataset.features

{'labels': Value(dtype='float64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [61]:
hf_output_dir = os.path.join(repo_path, f'{save_model_name}_model_output')
train_args = TrainingArguments(
    output_dir=hf_output_dir,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=5,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    save_total_limit=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    )

In [55]:
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(100))
small_val_dataset = tokenized_val_dataset.shuffle(seed=42).select(range(100))

In [62]:
trainer = Trainer(
    model = bert_reranker.model,
    args = train_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_val_dataset,
    # compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [63]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2,0.180959
2,0.2088,0.212182
3,0.1729,0.175978
4,0.1675,0.168036
5,0.1582,0.165875


TrainOutput(global_step=2685, training_loss=0.18268427253879427, metrics={'train_runtime': 2889.3115, 'train_samples_per_second': 29.703, 'train_steps_per_second': 0.929, 'total_flos': 2.257998803257344e+16, 'train_loss': 0.18268427253879427, 'epoch': 5.0})

In [74]:
best_model_path = os.path.join(hf_output_dir, 'best_model')
trainer.save_model(best_model_path)

In [73]:
trainer

<transformers.trainer.Trainer at 0x7fd00c68dc70>

In [71]:
bert_reranker.model = AutoModelForSequenceClassification.from_pretrained('smalldata/ftr/microsoft_codebert-base_model_output/checkpoint-2685')
bert_reranker.model.to(bert_reranker.device)
rerankers = [bert_reranker]

In [72]:
bert_with_training = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=None, aggregation_strategy=params['aggregation_strategy'], repo_path=repo_path, rerankers=rerankers)



100%|██████████| 100/100 [08:59<00:00,  5.39s/it]

Evaluation results written to smalldata/ftr/BM25Searcher_results.txt





In [None]:
# bert_reranker.train_mlp(train_dataloader, val_dataloader)
train_reranker(bert_reranker, train_dataloader, val_dataloader, freeze_bert=args.freeze_bert, save_dir=repo_path)

In [None]:

reranker_output_file = f"925_bert_reranker_{save_model_name}_N{args.n}_K{args.k}_non_frozen_metrics.txt" if not args.freeze_bert else f"bert_reranker_{save_model_name}_N{args.n}_K{args.k}_frozen_metrics.txt"

# reranker_output_file = f"bert_reranker_{save_model_name}_N{args.n}_K{args.k}_without_mlp_metrics.txt"
reranker_output_path = os.path.join(eval_path, reranker_output_file)

bert_reranker_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=reranker_output_path, aggregation_strategy='sump', rerankers=rerankers, repo_path=repo_path)

print("BERT Reranker Evaluation")
print(bert_reranker_eval)

In [30]:
torch.cuda.empty_cache()