In [3]:
import sys
sys.path.append('src')

In [4]:
from bm25_v2 import BM25Searcher
import pandas as pd
import numpy as np
from utils import get_combined_df, Args
from eval import SearchEvaluator, ModelEvaluator
import json

In [5]:
data_path="data/2_7/facebook_react"
repo_name = data_path.split("/")[-1]
combined_df = get_combined_df(data_path)
index_path=f"{data_path}/index_commit_tokenized"
# REPO_DIR = '/home/ssg2/ssg2/ds/repos/facebook_react'
k=1000 # initial ranker depth
n=100 # number of samples to evaluate on

In [6]:
# load fid_to_path and path_to_fid json files to dicts
with open(f"facebook_react_FID_to_paths.json") as f:
    fid_to_path = json.load(f)

# make all fids ints
fid_to_path = {int(k): v for k, v in fid_to_path.items()}

with open(f"facebook_react_path_to_FID.json") as f:
    path_to_fid = json.load(f)

In [7]:
metrics = ['MAP', 'P@1', 'P@10', 'P@20', 'P@30', 'MRR', 'R@1', 'R@10', 'R@100', 'R@1000']
bm25_searcher = BM25Searcher(index_path, fid_to_path, path_to_fid)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df, fid_to_path, path_to_fid)
gold_df = pd.read_parquet(f"gold/{repo_name}/v2_{repo_name}_gpt4_gold.parquet")
#! rename commit_message to original_message
gold_df = gold_df.rename(columns={'commit_message': 'original_message'})
#!rename transformed_message to commit_message
gold_df = gold_df.rename(columns={f'transformed_message_gpt4': 'commit_message'})

Loaded index at data/2_7/facebook_react/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [17]:
test_id, test_date, test_orig, test_files, test_query = gold_df.iloc[6]
test_files_ids = [path_to_fid[f] for f in test_files]
test_query, test_files, test_files_ids

("Input properties 'value' and 'defaultValue' accepting and assigning functions and symbols leads to improper handling and inconsistencies in numeric equality checks.",
 array(['packages/react-dom/src/__tests__/ReactDOMInput-test.js',
        'packages/react-dom/src/client/ReactDOMFiberInput.js',
        'packages/react-dom/src/events/ChangeEventPlugin.js',
        'packages/react-dom/src/shared/DOMProperty.js'], dtype=object),
 [2869, 2717, 2752, 2780])

In [9]:
bm25_results = bm25_searcher.search(test_query, test_date, k)

In [10]:
aggregated_results = bm25_searcher.aggregate_file_scores(bm25_results, 'sump')

In [12]:
# not sure why this file doesn't exist
# path_to_fid['packages/react/src/ReactServerSharedInternals.js']

In [22]:
test_query

"Input properties 'value' and 'defaultValue' accepting and assigning functions and symbols leads to improper handling and inconsistencies in numeric equality checks."

In [21]:
aggregated_results[0].contributing_results[0].commit_message

"Use defaultValue instead of setAttribute('value') (#11534)\n\n* Use defaultValue instead of setAttribute('value')\n\nThis commit replaces the method of synchronizing an input's value\nattribute from using setAttribute to assigning defaultValue. This has\nseveral benefits:\n\n- Fixes issue where IE10+ and Edge password icon disappears (#7328)\n- Fixes issue where toggling input types hides display value on dates\n  in Safari (unreported)\n- Removes mutationMethod behaviors from DOMPropertyOperations\n\n* initialValue in Input wrapperState is always a string\n\n* The value property is assigned before the value attribute. Fix related tests.\n\n* Remove initial value tests in ReactDOMInput\n\nI added these tests after removing the `value` mutation\nmethod. However they do not add any additional value over existing\ntests.\n\n* Improve clarity of value checks in ReactDOMInput.postMountWrapper\n\n* Remove value and defaultValue from InputWithWrapperState type\n\nThey are already included in

In [16]:
evaluator.evaluate(aggregated_results, test_files_ids)

{'MAP': 0.54,
 'P@1': 1.0,
 'P@10': 0.2,
 'P@20': 0.1,
 'P@30': 0.0667,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.5,
 'R@100': 1.0,
 'R@1000': 1.0}

In [110]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from utils import set_seed

In [157]:
set_seed(42)

In [283]:
model_name = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to('cuda')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
text1 = test_query
text2 = aggregated_results[0].contributing_results[0].commit_message

In [36]:
inputs = tokenizer(text1, text2, return_tensors="pt", padding=True, truncation='only_second', max_length=tokenizer.model_max_length, add_special_tokens=True)

In [286]:
model.eval()
inputs.to('cuda')
# Predict with model
with torch.no_grad():
    outputs = model(**inputs)

# Assuming the model has been fine-tuned for regression task
# Convert logits to similarity score (sigmoid function to get value between 0 and 1)

# get the similarity score
similarity_score = torch.softmax(outputs.logits.squeeze(), dim=0)[1].item()
model.train()
similarity_score

0.5136816501617432

In [222]:
# create a pandas dataframe of (txt1, tx2, 1) repeating for 100 rows
# train_df = pd.DataFrame({'txt1': [text1]*100, 'txt2': [text2]*100, 'label': [1]*100})
train_df = pd.read_pickle('data/2_7/facebook_react/cache/triplet_data_cache.pkl')

In [229]:
import os
repo_paths = [
    "data/2_7/apache_spark",
    "data/2_7/apache_kafka",
    "data/2_7/facebook_react",
    "data/2_8/angular_angular",
    "data/2_8/django_django",
    "data/2_8/pytorch_pytorch",
    "data/2_7/julialang_julia",
    "data/2_7/ruby_ruby",
    "data/2_9/huggingface_transformers",
    "data/2_9/redis_redis",
]

In [230]:
combined_triplet_data = pd.DataFrame()
for data_path in tqdm(repo_paths, total=len(repo_paths)):
    triplet_cache = os.path.join(data_path, 'cache', 'gpt_triplet_data_cache.pkl')
    if os.path.exists(triplet_cache):
        repo_triplet_data = pd.read_pickle(triplet_cache)
        combined_triplet_data = pd.concat([combined_triplet_data, repo_triplet_data], ignore_index=True)
    else:
        print(f"Warning: Triplet cache not found for {data_path}, skipping this repository.")

In [129]:
from datasets import Dataset

In [234]:
train_df = combined_triplet_data

In [237]:
dataset = Dataset.from_pandas(train_df)

In [238]:
def tokenize_function(example):
    return tokenizer(example['query'], example['passage'], return_tensors="pt", padding=True, truncation='only_second', max_length=tokenizer.model_max_length, add_special_tokens=True)

In [239]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.map(lambda examples: {'labels': examples['label']}, batched=True)

Map:   0%|          | 0/74385 [00:00<?, ? examples/s]

Map:   0%|          | 0/74385 [00:00<?, ? examples/s]

In [243]:
from transformers import Trainer, TrainingArguments

In [264]:
training_args = TrainingArguments(
    output_dir='./tmp',          # output directory for model checkpoints
    evaluation_strategy="no",     # evaluation is done at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=1,              # total number of training epochs
    weight_decay=0.01,               # strength of weight decay,
    report_to='none',
    fp16=True,

)

In [251]:
tokenized_datasets

Dataset({
    features: ['query', 'passage', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 74385
})

In [325]:
trainer = Trainer(
    model=bert_reranker.model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [326]:
trainer.train()

Step,Training Loss
500,0.5889
1000,0.5539
1500,0.5366
2000,0.5141
2500,0.5095
3000,0.5109
3500,0.492
4000,0.4962
4500,0.4727
5000,0.4822


TrainOutput(global_step=9299, training_loss=0.4829821280477277, metrics={'train_runtime': 1465.3526, 'train_samples_per_second': 50.763, 'train_steps_per_second': 6.346, 'total_flos': 1.95715158529536e+16, 'train_loss': 0.4829821280477277, 'epoch': 1.0})

In [290]:
class Args:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Args(
    index_path='data/2_7/facebook_react/index_commit_tokenized',
    repo_path='data/2_7/facebook_react', k=10000, n=100,
    model_path='microsoft/codebert-base', overwrite_cache=False,
    batch_size=32, num_epochs=3, learning_rate=2e-05,
    run_name='debug',
    notes='debug (ignore)',
    num_positives=10, num_negatives=10, train_depth=10000, num_workers=8,
    train_commits=1000, psg_cnt=25, use_gpu=True,
    rerank_depth=250, do_train=True, do_eval=True, eval_gold=True, openai_model='gpt4',
    overwrite_eval=False, sanity_check=True, debug=False,
    psg_len=350, psg_stride=250, ignore_gold_in_training=False,
    eval_folder='debug', use_gpt_train=True,
    aggregation_strategy='sump',

)

In [320]:
bert_params = {
        'model_name': args.model_path,
        'psg_cnt': 5,
        'aggregation_strategy': 'sump',
        'batch_size': args.batch_size,
        'use_gpu': args.use_gpu,
        'rerank_depth': 250,
        'num_epochs': args.num_epochs,
        'lr': args.learning_rate,
        'num_positives': args.num_positives,
        'num_negatives': args.num_negatives,
        'train_depth': args.train_depth,
        'num_workers': args.num_workers,
        'train_commits': args.train_commits,
        'bm25_aggr_strategy': 'maxp',
        'output_length': 1000
    }
bert_params

{'model_name': 'microsoft/codebert-base',
 'psg_cnt': 5,
 'aggregation_strategy': 'sump',
 'batch_size': 32,
 'use_gpu': True,
 'rerank_depth': 250,
 'num_epochs': 3,
 'lr': 2e-05,
 'num_positives': 10,
 'num_negatives': 10,
 'train_depth': 10000,
 'num_workers': 8,
 'train_commits': 1000,
 'bm25_aggr_strategy': 'maxp',
 'output_length': 1000}

In [305]:
from models import BERTReranker

In [321]:
bert_reranker = BERTReranker(bert_params, 'classification')
bert_reranker.model.to(bert_reranker.device)

Using device: cuda
Using GPU: Quadro RTX 6000
GPU Device Count: 1
GPU Memory Usage: 5383.44 MB
Using classification model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized BERT reranker with parameters: {'model_name': 'microsoft/codebert-base', 'psg_cnt': 5, 'aggregation_strategy': 'sump', 'batch_size': 32, 'use_gpu': True, 'rerank_depth': 250, 'num_epochs': 3, 'lr': 2e-05, 'num_positives': 10, 'num_negatives': 10, 'train_depth': 10000, 'num_workers': 8, 'train_commits': 1000, 'bm25_aggr_strategy': 'maxp', 'output_length': 1000}


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [327]:
tmp = bert_reranker.rerank_pipeline(test_query, aggregated_results)

In [328]:
evaluator.evaluate(tmp, test_files_ids)

{'MAP': 0.6771,
 'P@1': 1.0,
 'P@10': 0.3,
 'P@20': 0.2,
 'P@30': 0.1333,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.75,
 'R@100': 1.0,
 'R@1000': 1.0}

In [None]:
# bm25
{'MAP': 0.54,
 'P@1': 1.0,
 'P@10': 0.2,
 'P@20': 0.1,
 'P@30': 0.0667,
 'MRR': 1.0,
 'R@1': 0.25,
 'R@10': 0.5,
 'R@100': 1.0,
 'R@1000': 1.0}


# bert without training
{'MAP': 0.2716,
 'P@1': 0.0,
 'P@10': 0.2,
 'P@20': 0.1,
 'P@30': 0.0667,
 'MRR': 0.5,
 'R@1': 0.0,
 'R@10': 0.5,
 'R@100': 0.75,
 'R@1000': 1.0}