In [7]:
import sys
sys.path.append('../src')

In [38]:
import argparse
from typing import List

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
from utils import AggregatedSearchResult, get_combined_df

In [3]:
def prepare_data_from_df(df, searcher, depth, n_positive, n_negative, aggregation_strategy='sump'):
    """Build (query, passage, label) training triples from BM25 search results.

    For every row of ``df`` the row's commit message is used as the query for
    ``searcher.pipeline``. The contributing results of the returned aggregated
    results are scanned in order; the commit messages of the first
    ``n_positive`` results whose ``file_path`` is among the row's
    ``actual_files_modified`` become positives (label 1) and the first
    ``n_negative`` results whose ``file_path`` is not become negatives (label 0).

    Args:
        df: DataFrame with ``commit_message``, ``commit_date`` and
            ``actual_files_modified`` columns.
        searcher: Object exposing ``pipeline(query, commit_date, depth, strategy)``
            that returns items with a ``contributing_results`` sequence; each
            contributing result must expose ``file_path`` and ``commit_msg``.
        depth: Forwarded unchanged to ``searcher.pipeline``.
        n_positive: Maximum number of positive samples kept per row.
        n_negative: Maximum number of negative samples kept per row.
        aggregation_strategy: Strategy name forwarded to ``searcher.pipeline``.
            Defaults to ``'sump'``, the value that used to be hard-coded.

    Returns:
        A list of ``(commit_message, sample_commit_msg, label)`` tuples. For each
        row all positives are emitted before the negatives.
    """
    data = []
    print(f'Preparing data from dataframe of size: {len(df)}')

    for _, row in df.iterrows():
        commit_message = row['commit_message']
        # A set makes the per-result membership test O(1) instead of a list scan.
        modified_files = set(row['actual_files_modified'])

        search_results = searcher.pipeline(commit_message, row['commit_date'], depth, aggregation_strategy)

        # Lazily flatten the contributing results so the scan can stop as soon
        # as both quotas are filled without materialising every result.
        flat_results = (
            result
            for agg_result in search_results
            for result in agg_result.contributing_results
        )

        positive_samples = []
        negative_samples = []
        for result in flat_results:
            is_positive = result.file_path in modified_files
            if is_positive and len(positive_samples) < n_positive:
                positive_samples.append(result.commit_msg)
            elif not is_positive and len(negative_samples) < n_negative:
                negative_samples.append(result.commit_msg)

            if len(positive_samples) == n_positive and len(negative_samples) == n_negative:
                break

        data.extend((commit_message, sample_msg, 1) for sample_msg in positive_samples)
        data.extend((commit_message, sample_msg, 0) for sample_msg in negative_samples)

    return data

In [44]:
class BERTReranker:
    """Rerank aggregated BM25 results with a transformer encoder plus an MLP scoring head.

    ``parameters`` is a dict that must contain the keys ``model_name``,
    ``use_gpu``, ``psg_len``, ``psg_cnt``, ``aggregation_strategy``,
    ``batch_size``, ``rerank_depth``, ``INPUT_DIM``, ``HIDDEN_DIM`` and
    ``OUTPUT_DIM``; ``train_mlp`` additionally reads ``lr`` and ``num_epochs``.
    """

    def __init__(self, parameters):
        """Load tokenizer and encoder for ``parameters['model_name']`` and build the MLP head."""
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)
        self.model.eval()  # Set the model to evaluation mode

        print(f'Using device: {self.device}')

        if torch.cuda.is_available() and parameters['use_gpu']:
            # print GPU info
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")

        self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt']
        self.aggregation_strategy = parameters['aggregation_strategy']
        self.batch_size = parameters['batch_size']
        self.rerank_depth = parameters['rerank_depth']
        # Every pair is padded/truncated to the tokenizer's advertised maximum length.
        self.max_seq_length = self.tokenizer.model_max_length

        print(f"Initialized BERT reranker with parameters: {parameters}")

        input_dim = parameters['INPUT_DIM']
        hidden_dim = parameters['HIDDEN_DIM']
        output_dim = parameters['OUTPUT_DIM']

        self.mlp = MLP(input_dim, hidden_dim, output_dim).to(self.device)

    def _forward_scores(self, model, input_ids, attention_mask):
        """Run ``model`` and the MLP head on one batch; returns the head output with the last dim squeezed."""
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        pooled_output = model(input_ids, token_type_ids=None, attention_mask=attention_mask).pooler_output
        return self.mlp(pooled_output).squeeze(-1)

    def rerank(self, query, aggregated_results: "List[AggregatedSearchResult]"):
        """
        Rerank the BM25 aggregated search results using BERT model scores.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.

        Only the first ``rerank_depth`` results are considered and returned. For
        each of them at most ``psg_cnt`` contributing results are scored; the
        per-passage scores are combined with ``aggregate_scores``, written to
        ``result.score``, and the list is sorted by that score, descending.
        """
        aggregated_results = aggregated_results[:self.rerank_depth]
        print(f'Reranking {len(aggregated_results)} results')

        # Flatten into (query, passage) pairs, keeping at most psg_cnt passages
        # per result, and remember how many passages each result contributed so
        # the flat score list can be split back correctly afterwards.
        query_passage_pairs = []
        passages_per_result = []
        for agg_result in aggregated_results:
            kept = agg_result.contributing_results[: self.psg_cnt]
            passages_per_result.append(len(kept))
            query_passage_pairs.extend((query, result.commit_msg) for result in kept)
        print(f'Flattened query passage pairs: {len(query_passage_pairs)}')

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank')
            return aggregated_results

        # Encode each pair with explicit text / text_pair arguments.
        encoded_pairs = [
            self.tokenizer.encode_plus(
                pair_query,
                passage,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
                add_special_tokens=True,
            )
            for pair_query, passage in query_passage_pairs
        ]

        input_ids = torch.cat([encoded['input_ids'] for encoded in encoded_pairs], dim=0)
        attention_masks = torch.cat([encoded['attention_mask'] for encoded in encoded_pairs], dim=0)

        dataloader = DataLoader(TensorDataset(input_ids, attention_masks), batch_size=self.batch_size)
        scores = self.get_scores(dataloader, self.model)

        # Hand each result exactly the scores of the passages it contributed.
        # (Slicing by len(contributing_results) would drift out of alignment
        # whenever a result has more than psg_cnt passages.)
        score_index = 0
        for agg_result, n_passages in zip(aggregated_results, passages_per_result):
            end_index = score_index + n_passages
            agg_result.score = self.aggregate_scores(scores[score_index:end_index])
            score_index = end_index

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        """Score every (input_ids, attention_mask) batch of ``dataloader`` without gradients.

        Returns a flat list with one score per row, in dataloader order.
        """
        scores = []
        with torch.no_grad():
            for b_input_ids, b_attention_mask in dataloader:
                batch_scores = self._forward_scores(model, b_input_ids, b_attention_mask)
                # Detach from the graph and move to CPU before collecting.
                scores.extend(batch_scores.detach().cpu().numpy())

        return scores

    def train_mlp(self, train_dataloader, validation_dataloader):
        """Train the MLP head for ``parameters['num_epochs']`` epochs and report losses per epoch.

        Both dataloaders must yield ``(input_ids, attention_mask, labels)`` batches.
        After each epoch the average training and validation loss are printed.
        """
        # NOTE(review): only the MLP parameters are handed to the optimizer, yet
        # the encoder runs in train mode with gradients enabled below. If the
        # encoder is meant to be fine-tuned its parameters must be added here;
        # if it is meant to stay frozen, running it in eval mode under no_grad
        # would be cheaper. Left as-is pending confirmation of the intent.
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.parameters['lr'])

        # The MLP head already ends in a sigmoid, so its output is a probability.
        # BCELoss is the matching criterion; BCEWithLogitsLoss would apply a
        # second sigmoid on top of it.
        criterion = nn.BCELoss()

        num_epochs = self.parameters['num_epochs']

        print('Starting training loop')
        for epoch in tqdm(range(num_epochs)):
            self.model.train()
            self.mlp.train()
            total_loss = 0

            for b_input_ids, b_attention_mask, b_labels in train_dataloader:
                b_labels = b_labels.float().to(self.device)

                # Forward pass
                probabilities = self._forward_scores(self.model, b_input_ids, b_attention_mask)

                # Compute loss
                loss = criterion(probabilities, b_labels)
                total_loss += loss.item()

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Calculate average loss over the training data.
            avg_train_loss = total_loss / len(train_dataloader)

            # Validation step: no parameter updates, so no gradients are needed.
            self.model.eval()
            self.mlp.eval()
            total_eval_loss = 0
            with torch.no_grad():
                for b_input_ids, b_attention_mask, b_labels in validation_dataloader:
                    b_labels = b_labels.float().to(self.device)
                    probabilities = self._forward_scores(self.model, b_input_ids, b_attention_mask)
                    total_eval_loss += criterion(probabilities, b_labels).item()

            avg_val_loss = total_eval_loss / len(validation_dataloader)

            # Print progress
            print(f"Epoch {epoch+1}/{num_epochs}")
            print(f"Average training loss: {avg_train_loss}")
            print(f"Validation Loss: {avg_val_loss}")

            # Here you can add early stopping based on validation loss

        print("Training complete!")

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.

        Supported values of ``self.aggregation_strategy`` are ``'firstp'`` (first
        score), ``'maxp'`` (maximum), ``'avgp'`` (mean) and ``'sump'`` (sum).

        Raises:
            ValueError: if the configured strategy is none of the above.
        """
        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def rerank_pipeline(self, query, aggregated_results):
        """Thin wrapper that returns ``self.rerank(query, aggregated_results)``."""
        return self.rerank(query, aggregated_results)

In [5]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Size of the last dimension of the input.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in the same order as they are applied in forward().
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(fc2(relu(fc1(x)))); every output element lies in (0, 1)."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

In [45]:
# Locations of the repository data and of its tokenized commit index.
repo_path = '../smalldata/fbr/'
index_path = repo_path + 'index_commit_tokenized/'

# Metric names handed to SearchEvaluator.
metrics = [
    'MAP',
    'P@10',
    'P@100',
    'P@1000',
    'MRR',
    'Recall@100',
    'Recall@1000',
]

# K and n are passed to evaluate_sampling as k=K and n=n.
K = 1000
n = 10

In [9]:
# Load the combined dataframe for the repository under repo_path (helper from utils)
# and print its column / dtype summary.
combined_df = get_combined_df(repo_path)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69835 entries, 0 to 69834
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   owner                  69835 non-null  string  
 1   repo_name              69835 non-null  string  
 2   commit_date            69835 non-null  int64   
 3   commit_id              69835 non-null  string  
 4   commit_message         69835 non-null  string  
 5   file_path              69835 non-null  string  
 6   cur_file_content       67179 non-null  string  
 7   previous_commit_id     64247 non-null  string  
 8   previous_file_path     4140 non-null   string  
 9   previous_file_content  64247 non-null  string  
 10  diff                   61590 non-null  string  
 11  status                 69835 non-null  category
 12  is_merge_request       69835 non-null  bool    
 13  file_extension         69835 non-null  object  
dtypes: bool(1), category(1), int64(1), obj

In [10]:
# Aggregation strategy name passed to the BM25 baseline evaluation.
BM25_AGGR_STRAT = 'sump'

# BM25 searcher over the index at index_path, a metric evaluator for `metrics`,
# and the model evaluator that ties searcher, evaluator and dataframe together.
bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at ../smalldata/fbr/index_commit_tokenized/
Index Stats: {'total_terms': 8061856, 'documents': 69835, 'non_empty_documents': 69835, 'unique_terms': 14589}


In [11]:
# BM25-only baseline: evaluate_sampling is called with n=n, k=K and the BM25 aggregation strategy.
# The results are also written to BM25_metrics.txt (the captured output shows it under repo_path).
bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file='BM25_metrics.txt', aggregation_strategy=BM25_AGGR_STRAT, repo_path=repo_path)

100%|██████████| 10/10 [00:01<00:00,  6.06it/s]

Evaluation results written to ../smalldata/fbr/BM25_metrics.txt





In [12]:
# Show the baseline metrics returned by evaluate_sampling.
print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

BM25 Baseline Evaluation
{'MAP': 0.2247, 'P@10': 0.09, 'P@100': 0.029, 'P@1000': 0.0033, 'MRR': 0.2406, 'Recall@100': 0.6583, 'Recall@1000': 0.7417}


In [27]:
# Per-commit sample quotas, passed to prepare_data_from_df as n_positive / n_negative.
NUM_POSITIVE = 10
NUM_NEGATIVE = 100

In [19]:
# Configuration for the BERT reranker. BERTReranker reads every key below;
# 'lr' and 'num_epochs' are only used by its train_mlp method.
# ('psg_stride' and 'max_seq_length' are currently not part of the configuration.)
params = dict(
    model_name='microsoft/codebert-base',
    psg_len=400,
    psg_cnt=3,
    aggregation_strategy='sump',
    batch_size=32,
    use_gpu=True,
    rerank_depth=500,
    num_epochs=10,
    lr=2e-5,
    INPUT_DIM=768,
    HIDDEN_DIM=100,
    OUTPUT_DIM=1,
)

In [46]:
# Build the reranker from params (loads tokenizer and encoder, creates the MLP head)
# and keep it in a list of rerankers.
bert_reranker = BERTReranker(params)
rerankers = [bert_reranker]

Using device: cpu
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_len': 400, 'psg_cnt': 3, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 500, 'num_epochs': 10, 'lr': 2e-05, 'INPUT_DIM': 768, 'HIDDEN_DIM': 100, 'OUTPUT_DIM': 1}


In [25]:
# Step 1: keep only the columns needed to describe a commit and the files it touched.
filtered_df = combined_df[['commit_date', 'commit_message', 'commit_id', 'file_path']]

# Step 2: one row per commit, with the list of file paths collected into
# the 'actual_files_modified' column.
grouped_df = (
    filtered_df
    .groupby(['commit_id', 'commit_date', 'commit_message'])['file_path']
    .apply(list)
    .reset_index()
    .rename(columns={'file_path': 'actual_files_modified'})
)

# Step 3: keep the commits dated strictly after the median commit date.
midpoint_date = np.median(grouped_df['commit_date'])
recent_df = grouped_df[grouped_df['commit_date'] > midpoint_date]
print(f'Number of commits after midpoint date: {len(recent_df)}')

# Cap the working set at the first 1000 of those commits.
recent_df = recent_df.head(1000)

Number of commits after midpoint date: 5795


In [35]:
# Summarise the capped dataframe (row count, columns, dtypes).
recent_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 2015
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   commit_id              1000 non-null   string
 1   commit_date            1000 non-null   int64 
 2   commit_message         1000 non-null   string
 3   actual_files_modified  1000 non-null   object
dtypes: int64(1), object(1), string(2)
memory usage: 39.1+ KB


In [28]:
# Step 4: Split recent dataframe and prepare data
df_train, df_temp = train_test_split(recent_df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

print('Preparing data...')


Preparing data...
Preparing data from dataframe of size: 800
Preparing data from dataframe of size: 100
Preparing data from dataframe of size: 100


In [None]:

# Depth forwarded to the BM25 pipeline when mining samples for each split.
train_depth = 1000
# Build (query, passage, label) triples for every split with the same depth and quotas.
train_data = prepare_data_from_df(df_train, bm25_searcher, depth=train_depth, n_positive=NUM_POSITIVE, n_negative=NUM_NEGATIVE)
val_data = prepare_data_from_df(df_val, bm25_searcher, depth=train_depth, n_positive=NUM_POSITIVE, n_negative=NUM_NEGATIVE)
test_data = prepare_data_from_df(df_test, bm25_searcher, depth=train_depth, n_positive=NUM_POSITIVE, n_negative=NUM_NEGATIVE)

In [32]:
# get distribution of labels
train_labels = [label for _, _, label in train_data]
val_labels = [label for _, _, label in val_data]
test_labels = [label for _, _, label in test_data]

# print size of data
print(f'Train data size: {len(train_data)}')
print(f'Val data size: {len(val_data)}')
print(f'Test data size: {len(test_data)}')

print(f'Train data sample: {train_data[0]}')

print(f'Train label distribution: {np.unique(train_labels, return_counts=True)}')
print(f'Val label distribution: {np.unique(val_labels, return_counts=True)}')
print(f'Test label distribution: {np.unique(test_labels, return_counts=True)}')

Train data size: 83414
Val data size: 10372
Test data size: 10486
Train data sample: ("Clean up enableSyncDefaultUpdates flag a bit (#26858)\n\n## Overview\r\n\r\nDoes a few things:\r\n- Renames `enableSyncDefaultUpdates` to\r\n`forceConcurrentByDefaultForTesting`\r\n- Changes the way it's used so it's dead-code eliminated separate from\r\n`allowConcurrentByDefault`\r\n- Deletes a bunch of the gated code\r\n\r\nThe gates that are deleted are unnecessary now. We were keeping them\r\nwhen we originally thought we would come back to being concurrent by\r\ndefault. But we've shifted and now sync-by default is the desired\r\nbehavior long term, so there's no need to keep all these forked tests\r\naround.\r\n\r\nI'll follow up to delete more of the forked behavior if possible.\r\nIdeally we wouldn't need this flag even if we're still using\r\n`allowConcurrentByDefault`.", 'Revert "Re-arrange slightly to prevent refactor hazard (#16743)" (#16769)\n\nThis reverts commit ab4951fc03750a726e412b9

In [39]:
class TripletDataset(Dataset):
    """Dataset over (query, passage, label) triples that tokenizes each pair on access.

    Args:
        data: Indexable collection of ``(query, passage, label)`` tuples.
        tokenizer: Tokenizer exposing ``encode_plus(text, text_pair, ...)``.
        max_seq_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_seq_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Number of triples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the triple at ``index``.

        The leading batch dimension added by ``return_tensors='pt'`` is removed from
        both tensors; the label is returned unchanged.
        """
        query, passage, label = self.data[index]

        # Pass the pair as explicit text / text_pair arguments instead of a
        # two-element list, so it is encoded as a sequence pair no matter how
        # the tokenizer implementation interprets a list passed as `text`.
        encoded_pair = self.tokenizer.encode_plus(
            query,
            passage,
            max_length=self.max_seq_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            add_special_tokens=True,
        )

        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)

        return input_ids, attention_mask, label

In [40]:
# Wrap each split in a TripletDataset that shares the reranker's tokenizer and
# maximum sequence length.
train_dataset = TripletDataset(train_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)
val_dataset = TripletDataset(val_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)
test_dataset = TripletDataset(test_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)

In [41]:
# Batch with the reranker's batch size; only the training split is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=bert_reranker.batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=bert_reranker.batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=bert_reranker.batch_size, shuffle=False)

In [None]:
# Train the reranker's scoring head on the training split, reporting validation loss per epoch.
bert_reranker.train_mlp(train_dataloader, val_dataloader)