In [1]:
import sys
sys.path.append('src')

In [2]:
!ls

2_7	  data		 logs	    profiling  requirements.txt  src
2_8	  debug_test.py  misc	    README.md  scripts		 temp.py
big_logs  logging.conf	 notebooks  repos      smalldata


In [3]:
import os

# import pickle
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.init as init
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
from utils import (
    AggregatedSearchResult,
    TripletDataset,
    get_combined_df,
    prepare_triplet_data_from_df,
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
set_seed(42)

In [5]:
# print torch devices available
print('Available devices: ', torch.cuda.device_count())
print('Current cuda device: ', torch.cuda.current_device())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Available devices:  1
Current cuda device:  0
Quadro RTX 6000


In [6]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Adding an intermediate layer
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [6]:
class BERTReranker:
    # def __init__(self, model_name, psg_len, psg_cnt, psg_stride, agggreagtion_strategy, batch_size, use_gpu=True):
    def __init__(self, parameters):
        self.parameters = parameters
        self.model_name = parameters['model_name']
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # self.model = AutoModel.from_pretrained(self.model_name, num_labels=1)
        self.model = AutoModel.from_pretrained(self.model_name)
        # self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1)
        self.device = torch.device("cuda" if torch.cuda.is_available() and parameters['use_gpu'] else "cpu")
        self.model.to(self.device)

        print(f'Using device: {self.device}')

        if torch.cuda.is_available() and parameters['use_gpu']:
            # print GPU info
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f'GPU Device Count: {torch.cuda.device_count()}')
            print(f"GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")


        self.psg_len = parameters['psg_len']
        self.psg_cnt = parameters['psg_cnt']
        # self.psg_stride = parameters.get('psg_stride', self.psg_len)
        self.aggregation_strategy = parameters['aggregation_strategy']
        self.batch_size = parameters['batch_size']
        # self.max_title_len = parameters.get('max_title_len', 0)
        # self.use_title = self.max_title_len > 0
        self.rerank_depth = parameters['rerank_depth']
        # self.max_seq_length = parameters.get('max_seq_length', 512)
        self.max_seq_length = self.tokenizer.model_max_length

        print(f"Initialized BERT reranker with parameters: {parameters}")

        # input_dim = parameters['INPUT_DIM']  # Default BERT hidden size
        # hidden_dim = parameters['HIDDEN_DIM']   # Example hidden size
        # output_dim = parameters['OUTPUT_DIM']  # We want a single score as output

        self.mlp = MLP(self.model.config.hidden_size, parameters['hidden_dim'], 1, parameters['dropout_prob']).to(self.device)

    def rerank(self, query, aggregated_results: List[AggregatedSearchResult]):
        """
        Rerank the BM25 aggregated search results using BERT model scores.

        query: The issue query string.
        aggregated_results: A list of AggregatedSearchResult objects from BM25 search.
        """
        # aggregated_results = aggregated_results[:self.rerank_depth] # already done in the pipeline
        # print(f'Reranking {len(aggregated_results)} results')

        # Flatten the list of results into a list of (query, passage) pairs but only keep max psg_cnt passages per file
        query_passage_pairs = []
        for agg_result in aggregated_results:
            query_passage_pairs.extend(
                (query, result.commit_msg)
                for result in agg_result.contributing_results[: self.psg_cnt]
            )

        if not query_passage_pairs:
            print('WARNING: No query passage pairs to rerank')
            print(query, aggregated_results, self.psg_cnt)
            return aggregated_results

        # tokenize the query passage pairs
        encoded_pairs = [self.tokenizer.encode_plus([query, passage], max_length=self.max_seq_length, truncation=True, padding='max_length', return_tensors='pt', add_special_tokens=True) for query, passage in query_passage_pairs]

        # create tensors for the input ids, attention masks
        input_ids = torch.stack([encoded_pair['input_ids'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore
        attention_masks = torch.stack([encoded_pair['attention_mask'].squeeze() for encoded_pair in encoded_pairs], dim=0) # type: ignore

        # Create a dataloader for feeding the data to the model
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        scores = self.get_scores(dataloader, self.model)

        score_index = 0
        # Now assign the scores to the aggregated results by mapping the scores to the contributing results
        for agg_result in aggregated_results:
            # Each aggregated result gets a slice of the scores equal to the number of contributing results it has which should be min(psg_cnt, len(contributing_results))
            assert score_index < len(scores), f'score_index {score_index} is greater than or equal to scores length {len(scores)}'
            end_index = score_index + len(agg_result.contributing_results[: self.psg_cnt])
            cur_passage_scores = scores[score_index:end_index]
            score_index = end_index


            # Aggregate the scores for the current aggregated result
            agg_score = self.aggregate_scores(cur_passage_scores)
            agg_result.score = agg_score  # Assign the aggregated score

        assert score_index == len(scores), f'score_index {score_index} does not equal scores length {len(scores)}, indices probably not working correctly'

        # Sort by the new aggregated score
        aggregated_results.sort(key=lambda res: res.score, reverse=True)

        return aggregated_results

    def get_scores(self, dataloader, model):
        scores = []
        with torch.no_grad():
            for batch in dataloader:
                # Unpack the batch and move it to GPU
                b_input_ids, b_attention_mask = batch
                b_input_ids = b_input_ids.to(self.device)
                b_attention_mask = b_attention_mask.to(self.device)

                # Get the pooled output from BERT's [CLS] token
                # pooled_output = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output

                cls_output = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

                # # Pass the pooled output through the MLP to get the scores
                # logits = self.mlp(pooled_output).squeeze(-1) # type: ignore
                logits = self.mlp(cls_output).squeeze(-1) # type: ignore

                # # Collect the scores (detach them from the computation graph and move to CPU)
                scores.extend(logits.detach().cpu().numpy())


                # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                # logits = outputs.logits
                # scores.extend(logits.detach().cpu().numpy().squeeze(-1))

        return scores

    def aggregate_scores(self, passage_scores):
        """
        Aggregate passage scores based on the specified strategy.
        """
        if len(passage_scores) == 0:
            return 0.0


        if self.aggregation_strategy == 'firstp':
            return passage_scores[0]
        if self.aggregation_strategy == 'maxp':
            return max(passage_scores)
        if self.aggregation_strategy == 'avgp':
            return sum(passage_scores) / len(passage_scores)
        if self.aggregation_strategy == 'sump':
            return sum(passage_scores)
        # else:
        raise ValueError(f"Invalid score aggregation method: {self.aggregation_strategy}")

    def rerank_pipeline(self, query, aggregated_results):
        if len(aggregated_results) == 0:
            return aggregated_results
        top_results = aggregated_results[:self.rerank_depth]
        bottom_results = aggregated_results[self.rerank_depth:]
        reranked_results = self.rerank(query, top_results)
        min_top_score = reranked_results[-1].score
        # now adjust the scores of bottom_results
        for i, result in enumerate(bottom_results):
            result.score = min_top_score - i - 1
        # combine the results
        reranked_results.extend(bottom_results)
        assert(len(reranked_results) == len(aggregated_results))
        return reranked_results

In [12]:
def train_reranker(bertranker, train_dataloader, validation_dataloader, freeze_bert, save_dir):
    # Set BERT parameters to not require gradients
    save_dir = os.path.join(save_dir, 'models')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # for param in bertranker.model.parameters():
    #     param.requires_grad = False if freeze_bert else True


    # if freeze_bert:
    #     optimizer = torch.optim.Adam(bertranker.mlp.parameters(), lr=bertranker.parameters['mlp_lr'], weight_decay=bertranker.parameters['weight_decay'])
    # else:
    #     optimizer = torch.optim.Adam([
    #         {'params': bertranker.model.parameters(), 'lr': bertranker.parameters['bert_lr'], 'weight_decay': bertranker.parameters['weight_decay']},
    #         {'params': bertranker.mlp.parameters(), 'lr': bertranker.parameters['mlp_lr'], 'weight_decay': bertranker.parameters['weight_decay']}
    #             ], lr=bertranker.parameters['mlp_lr'])

    optimizer = torch.optim.Adam(bertranker.model.parameters(), lr=bertranker.parameters['bert_lr'])

    # one optimizer for both BERT and MLP with same learning rate


    print(f'Optimizer: {optimizer}')

    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)
    # Set up the loss function
    criterion = nn.BCEWithLogitsLoss()  #

    # Set up training variables
    num_epochs = bertranker.parameters['num_epochs']
    # print train and val dataloader sizes
    print(f'Train dataloader size: {len(train_dataloader)}')
    print(f'Val dataloader size: {len(validation_dataloader)}')
    # Training loop
    print('Starting training loop')

    if freeze_bert:
        print('BERT is frozen, training only MLP')
    else:
        print('BERT is unfrozen, training BERT and MLP')
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    # model_name = 'bert_reranker_frozen' if freeze_bert else 'bert_reranker'
    model_name = bertranker.parameters['model_name'].replace('/', '_') + '_frozen' if freeze_bert else bertranker.parameters['model_name'].replace('/', '_')
    model_name += '_frozen' if freeze_bert else ''
    print(f'Model name: {model_name}')
    # for epoch in range(epochs):
    for epoch in tqdm(range(num_epochs)):
        # self.model.eval()  # Make sure the BERT model is in evaluation mode
        # if freeze_bert:
        #     bertranker.model.eval()  # BERT finetuning should be in eval mode
        # else:
        #     bertranker.model.train()  # BERT finetuning should be in train mode

        bertranker.model.train()  # BERT finetuning should be in train mode
        bertranker.mlp.train()  # MLP should be in training mode
        total_loss = 0

        for batch in train_dataloader:
            # breakpoint()
            b_input_ids, b_attention_mask, b_labels = batch
            b_input_ids = b_input_ids.to(bertranker.device)
            b_attention_mask = b_attention_mask.to(bertranker.device)
            b_labels = b_labels.float().to(bertranker.device)

            # Forward pass
            if freeze_bert:
                with torch.no_grad():
                    # pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
                    cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

            else:
                pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
            cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]

            logits = bertranker.mlp(cls_output).squeeze(-1) # type: ignore

            # outputs = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
            # logits = bertranker.mlp(outputs.logits).squeeze(-1) # type: ignore
            # logits = outputs.logits.squeeze(-1) # type: ignore
            # Compute loss
            loss = criterion(logits, b_labels)
            total_loss += loss.item()

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Calculate average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Validation step
        bertranker.model.eval()
        bertranker.mlp.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in validation_dataloader:
                b_input_ids, b_attention_mask, b_labels = batch
                b_input_ids = b_input_ids.to(bertranker.device)
                b_attention_mask = b_attention_mask.to(bertranker.device)
                b_labels = b_labels.float().to(bertranker.device)

                pooled_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).pooler_output
                cls_output = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask).last_hidden_state[:, 0, :]
                logits = bertranker.mlp(cls_output).squeeze(-1) # type: ignore

                # outputs = bertranker.model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)
                # logits = outputs.logits.squeeze(-1) # type: ignore

                # Compute loss
                loss = criterion(logits, b_labels.float())
                total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # scheduler.step(avg_val_loss)
        # Save losses
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # Print progress
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Average training loss: {avg_train_loss}")
        print(f"Validation Loss: {avg_val_loss}")
        print(f'Best validation loss: {best_val_loss}')

        # save graph of losses
        plt.plot(train_losses, label='Training loss', color='blue', linestyle='dashed', linewidth=1, marker='o', markerfacecolor='blue', markersize=3)
        plt.plot(val_losses, label='Validation loss', color='red', linestyle='dashed', linewidth=1, marker='o', markerfacecolor='red', markersize=3)
        plt.legend(frameon=False)
        plt.savefig(os.path.join(save_dir, f'{model_name}_losses.png'))
        plt.close()

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # model name with frozen or unfrozen bert
            save_path = os.path.join(save_dir, f'{model_name}_best_model.pth')
            mlp_save_path = os.path.join(save_dir, f'{model_name}_best_mlp.pth')
            torch.save(bertranker.model.state_dict(), save_path)
            torch.save(bertranker.mlp.state_dict(), mlp_save_path)

            print(f"Model saved with validation loss: {best_val_loss}")

            # evaluate on train set



        # Here you can add early stopping based on validation loss

    print("Training complete!")

In [7]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    repo_path='../smalldata/ftr',
    index_path='../smalldata/ftr/index_commit_tokenized/',
    k=1000,
    n=100,
    overwrite_cache=False,
    freeze_bert=False,
)

In [8]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']
repo_path = args.repo_path
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'

In [9]:
eval_path = os.path.join(repo_path, 'eval')
if not os.path.exists(eval_path):
    os.makedirs(eval_path)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at ../smalldata/ftr/index_commit_tokenized/
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [10]:
bm25_output_path = os.path.join(eval_path, f'bm25_baseline_N{n}_K{K}_metrics.txt')
print(f'BM25 output path: {bm25_output_path}')

bm25_baseline_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=bm25_output_path, aggregation_strategy=BM25_AGGR_STRAT, repo_path=repo_path)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

BM25 output path: ../smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:23<00:00,  4.29it/s]

Evaluation results written to ../smalldata/ftr/eval/bm25_baseline_N100_K1000_metrics.txt
BM25 Baseline Evaluation
{'MAP': 0.1542, 'P@10': 0.087, 'P@100': 0.0267, 'P@1000': 0.0041, 'MRR': 0.2133, 'Recall@100': 0.5077, 'Recall@1000': 0.6845}





In [10]:
# Reranking with BERT
params = {
    'model_name': 'microsoft/codebert-base',
    'psg_len': 400,
    'psg_cnt': 5,
    # 'psg_stride': 32,
    'aggregation_strategy': 'sump',
    'batch_size': 16,
    # 'batch_size': 512,
    # 'batch_size': 1,
    'use_gpu': True,
    'rerank_depth': 250,
    'num_epochs': 3,
    # 'mlp_lr': 1e-2,
    'mlp_lr': 1e-3,
    'bert_lr': 5e-5,
    'hidden_dim': 128,
    'num_positives': 10,
    'num_negatives': 10,
    'train_depth': 1000,
    'num_workers': 8,
    'weight_decay': 0.01,
    'dropout_prob': 0.5,
    'train_commits': 1500,
}


In [11]:

bert_reranker = BERTReranker(params)
rerankers = [bert_reranker]
save_model_name = params['model_name'].replace('/', '_')

Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 476.73 MB
Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_len': 400, 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 16, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 3, 'mlp_lr': 0.001, 'bert_lr': 5e-05, 'hidden_dim': 128, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 1000, 'num_workers': 8, 'weight_decay': 0.01, 'dropout_prob': 0.5, 'train_commits': 1500}


In [34]:
# get average length of commit messages
combined_df['commit_message'].str.split().str.len().mean()

60.66661695926252

In [11]:
filtered_df = combined_df[['commit_date', 'commit_message', 'commit_id', 'file_path', 'diff']]

# Step 2: Group by commit_id
grouped_df = filtered_df.groupby(['commit_id', 'commit_date', 'commit_message'])['file_path'].apply(list).reset_index()
grouped_df.rename(columns={'file_path': 'actual_files_modified'}, inplace=True)

# Step 3: Determine midpoint and filter dataframe
midpoint_date = np.median(grouped_df['commit_date'])
recent_df = grouped_df[grouped_df['commit_date'] > midpoint_date]
print(f'Number of commits after midpoint date: {len(recent_df)}')
# sys.exit(0)

# recent_df = recent_df.head(2000)


Number of commits after midpoint date: 5804


In [12]:
average_commit_len = recent_df['commit_message'].str.split().str.len().mean()
# filter out commits with less than average length
recent_df = recent_df[recent_df['commit_message'].str.split().str.len() > average_commit_len]
print(f'Number of commits after filtering by commit message length: {len(recent_df)}')

# randomly sample 1500 rows from recent_df
recent_df = recent_df.sample(params['train_commits'])
print(f'Number of commits after sampling: {len(recent_df)}')

Number of commits after filtering by commit message length: 1543
Number of commits after sampling: 1500


In [17]:
def random_print(df):
    # randomly print one commit message
    print(df['commit_message'].sample().values[0])

In [14]:
# prepare data first
if not os.path.exists(os.path.join(repo_path, 'cache')):
    os.makedirs(os.path.join(repo_path, 'cache'))
# train_cache = os.path.join(repo_path, 'cache', 'train_data_cache.pkl')
# val_cache = os.path.join(repo_path, 'cache', 'val_data_cache.pkl')
# test_cache = os.path.join(repo_path, 'cache', 'test_data_cache.pkl')
triplet_cache = os.path.join(repo_path, 'cache', 'triplet_data_cache.pkl')


In [26]:
def temp_prep(df, searcher, search_depth, num_positives, num_negatives):

    data = []
    print(f'Preparing data from dataframe of size: {len(df)} with search_depth: {search_depth}')
    total_positives, total_negatives = 0, 0
    for _, row in df.iterrows():
    # for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):

        cur_positives = 0
        cur_negatives = 0
        pos_commit_ids = set()
        neg_commit_ids = set()
        commit_message = row['commit_message']
        actual_files_modified = row['actual_files_modified']

        agg_search_results = searcher.pipeline(commit_message, row['commit_date'], search_depth, 'sump', aggregate_on='commit')

        # for each agg_result, find out how many files it has edited are in actual_files_modified and sort by score

        for agg_result in agg_search_results:
            agg_result_files = set([result.file_path for result in agg_result.contributing_results])
            intersection = agg_result_files.intersection(actual_files_modified)
            # TODO maybe try this for training
            agg_result.score = len(intersection) / len(agg_result_files) # how focused the commit is
            # agg_result.score = len(intersection)

        agg_search_results.sort(key=lambda res: res.score, reverse=True)

        # go from top to bottom, first num_positives non-0 scores are positive samples and the next num_negatives are negative samples
        for agg_result in agg_search_results:
            cur_commit_msg = agg_result.contributing_results[0].commit_message
            if cur_positives < num_positives and agg_result.score > 0:
                # meaning there is at least one file in the agg_result that is in actual_files_modified
                # pos_commits.append(agg_result)
                data.append((commit_message, cur_commit_msg, 1))
                cur_positives += 1
                pos_commit_ids.add(agg_result.commit_id)
            elif cur_negatives < num_negatives:
                # neg_commits.append(agg_result)
                data.append((commit_message, cur_commit_msg, 0))
                cur_negatives += 1
                neg_commit_ids.add(agg_result.commit_id)
            if cur_positives == num_positives and cur_negatives == num_negatives:
                break

        assert len(pos_commit_ids.intersection(neg_commit_ids)) == 0, 'Positive and negative commit ids should not intersect'
        # print(f"Total positives: {cur_positives}, Total negatives: {cur_negatives}")
        total_positives += cur_positives
        total_negatives += cur_negatives

    # # Write data to cache file
    # with open(cache_file, 'wb') as file:
    #     pickle.dump(data, file)
    #     print(f"Saved data to cache file: {cache_file}")


    # print percentage of positives and negatives
    denom = total_positives + total_negatives
    print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
    return data









    #     # flatten the contributing results for each aggregated result
    #     search_results = [result for agg_result in agg_search_results for result in agg_result.contributing_results]
    #     search_results.sort(key=lambda res: res.score, reverse=True)
    #     # efficiently get the top num_positives and num_negatives samples
    #     positive_samples = []
    #     negative_samples = []

    #     for result in search_results:
    #         if result.file_path in actual_files_modified and len(positive_samples) < num_positives:
    #             positive_samples.append(result.commit_msg)
    #             total_positives += 1
    #         elif result.file_path not in actual_files_modified and len(negative_samples) < num_negatives:
    #             negative_samples.append(result.commit_msg)
    #             total_negatives += 1

    #         if len(positive_samples) == num_positives and len(negative_samples) == num_negatives:
    #             break


    #     for sample_msg in positive_samples:
    #         data.append((commit_message, sample_msg, 1))

    #     for sample_msg in negative_samples:
    #         data.append((commit_message, sample_msg, 0))
    # print(f"Total positives: {total_positives}, Total negatives: {total_negatives}")
    # # print percentage of positives and negatives
    # denom = total_positives + total_negatives
    # print(f"Percentage of positives: {total_positives / denom}, Percentage of negatives: {total_negatives / denom}")
    # return data


as43lqmefl=temp_prep(recent_df.head(2), bm25_searcher, num_positives=10, num_negatives=10, search_depth=1000)

Preparing data from dataframe of size: 2 with search_depth: 1000
Percentage of positives: 0.2, Percentage of negatives: 0.8


In [17]:
triplet_data = prepare_triplet_data_from_df(recent_df, bm25_searcher, search_depth=params['train_depth'], num_positives=params['num_positives'], num_negatives=params['num_negatives'], cache_file=triplet_cache, overwrite=args.overwrite_cache)

Preparing data from dataframe of size: 1500 with search_depth: 1000


100%|██████████| 1500/1500 [06:26<00:00,  3.89it/s]


Saved data to cache file: ../smalldata/ftr/cache/triplet_data_cache.pkl
Total positives: 6796, Total negatives: 14660
Percentage of positives: 0.3167412378821775, Percentage of negatives: 0.6832587621178225


In [21]:
len(triplet_data)

21456

In [89]:
# Step 4: Split recent dataframe and prepare data
# df_train, df_temp = train_test_split(recent_df, test_size=0.2, random_state=42)
# df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)


In [20]:
# create train, val, test data from triplet data using train_test_split with stratify
# stratify based on the third column of the triplet data



TypeError: list indices must be integers or slices, not tuple

In [31]:

# get distribution of labels
train_labels = [label for _, _, label in train_data]
val_labels = [label for _, _, label in val_data]
test_labels = [label for _, _, label in test_data]

# print size of data
print(f'Train data size: {len(train_data)}')
print(f'Val data size: {len(val_data)}')
print(f'Test data size: {len(test_data)}')

print(f'Train data sample: {train_data[0]}')

print(f'Train label distribution: {np.unique(train_labels, return_counts=True)}')
print(f'Val label distribution: {np.unique(val_labels, return_counts=True)}')
print(f'Test label distribution: {np.unique(test_labels, return_counts=True)}')

Train data size: 42011
Val data size: 5361
Test data size: 5354
Train label distribution: (array([0, 1]), array([32000, 10011]))
Val label distribution: (array([0, 1]), array([4000, 1361]))
Test label distribution: (array([0, 1]), array([4000, 1354]))


In [None]:


train_dataset = TripletDataset(train_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)
val_dataset = TripletDataset(val_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)
test_dataset = TripletDataset(test_data, bert_reranker.tokenizer, bert_reranker.max_seq_length)

# Step 5: train the MLP
train_dataloader = DataLoader(train_dataset, batch_size=bert_reranker.batch_size, shuffle=True, num_workers=params['num_workers'])
val_dataloader = DataLoader(val_dataset, batch_size=bert_reranker.batch_size, shuffle=False, num_workers=params['num_workers'])
test_dataloader = DataLoader(test_dataset, batch_size=bert_reranker.batch_size, shuffle=False, num_workers=params['num_workers'])

In [None]:

# bert_reranker.train_mlp(train_dataloader, val_dataloader)
train_reranker(bert_reranker, train_dataloader, val_dataloader, freeze_bert=args.freeze_bert, save_dir=repo_path)

reranker_output_file = f"925_bert_reranker_{save_model_name}_N{args.n}_K{args.k}_non_frozen_metrics.txt" if not args.freeze_bert else f"bert_reranker_{save_model_name}_N{args.n}_K{args.k}_frozen_metrics.txt"

# reranker_output_file = f"bert_reranker_{save_model_name}_N{args.n}_K{args.k}_without_mlp_metrics.txt"
reranker_output_path = os.path.join(eval_path, reranker_output_file)

bert_reranker_eval = model_evaluator.evaluate_sampling(n=n, k=K, output_file_path=reranker_output_path, aggregation_strategy='sump', rerankers=rerankers, repo_path=repo_path)

print("BERT Reranker Evaluation")
print(bert_reranker_eval)

In [30]:
torch.cuda.empty_cache()