In [1]:
import sys
sys.path.append('../src')

import pandas as pd
import os

from utils import get_combined_df
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator

In [3]:
class Args:
    """Simple attribute container for run configuration.

    Every keyword argument given to the constructor is stored as an
    instance attribute with the same name.
    """

    def __init__(self, **kwargs):
        # Write the keyword arguments straight into the instance namespace.
        vars(self).update(kwargs)

# Run configuration; one keyword per line so individual settings are easy to find and diff.
args = Args(
    index_path='../data/2_7/facebook_react/index_commit_tokenized',
    repo_path='../data/2_7/facebook_react',
    k=1000,
    n=100,
    model_path='microsoft/codebert-base',
    overwrite_cache=False,
    batch_size=32,
    num_epochs=10,
    learning_rate=5e-05,
    run_name='repr_0.1663',
    notes='reproducing current best 0.1663 MAP result for CodeReranker',
    num_positives=10,
    num_negatives=10,
    train_depth=1000,
    num_workers=8,
    train_commits=1000,
    psg_cnt=25,
    aggregation_strategy='sump',
    use_gpu=True,
    rerank_depth=100,
    do_train=True,
    do_eval=True,
    eval_gold=True,
    openai_model='gpt4',
    overwrite_eval=False,
    sanity_check=True,
    debug=False,
    best_model_path=None,
    bert_best_model='data/combined_commit_train/best_model',
    psg_len=350,
    psg_stride=250,
    ignore_gold_in_training=False,
    eval_folder='repr_0.1663',
    use_gpt_train=True,
)

# Metric names handed to SearchEvaluator.
metrics = ['MAP', 'P@1', 'P@10', 'P@20', 'P@30', 'MRR', 'R@1', 'R@10', 'R@100', 'R@1000']
repo_path = args.repo_path
# Last path component of the repo directory; normpath makes this robust to a trailing slash.
repo_name = os.path.basename(os.path.normpath(repo_path))
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'
eval_path = os.path.join(repo_path, 'eval')
# exist_ok=True replaces the separate exists() check: no check-then-create race,
# and a no-op when the directory is already present.
os.makedirs(eval_path, exist_ok=True)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

# Gold test set for the configured repo. The path is derived from repo_name and
# args.openai_model so it follows the configuration instead of repeating the literals.
test_path = os.path.join('..', 'gold', repo_name, f'v2_{repo_name}_{args.openai_model}_gold.parquet')
gold_df = pd.read_parquet(test_path)

Loaded index at ../data/2_7/facebook_react/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [4]:
# Directory holding the cached artifacts of the '4X_random_split' run for this repo.
cache_path = os.path.join(args.repo_path, 'cache', '4X_random_split')
code_df_file = os.path.join(cache_path, 'code_df.parquet')
code_df = pd.read_parquet(code_df_file)

In [15]:
def prep_line(line):
    """Return `line` with leading and trailing whitespace removed."""
    return line.strip()

def parse_diff_remove_minus(diff):
    """Split a diff into lines, keeping only additions and unmarked lines.

    A line is dropped when it is empty, starts with '-', is a hunk header
    (starts with '@@' and contains '@@' more than once), or is at most two
    characters long after whitespace stripping. A leading '+' is removed
    from the lines that are kept.

    Args:
        diff: diff text with '\\n' line separators.

    Returns:
        List of the kept lines, in their original order.
    """
    kept = []
    for raw in diff.split('\n'):
        if len(raw) == 0 or raw.startswith('-'):
            continue
        if raw.startswith('@@') and raw.count('@@') > 1:
            continue
        # The length check is applied to the stripped line, marker included.
        if len(prep_line(raw)) <= 2:
            continue
        kept.append(raw[1:] if raw.startswith('+') else raw)
    return kept

def full_parse_diffs(diff):
    """Split a diff into lines, keeping both insertions and deletions.

    Empty lines and hunk headers (lines starting with '@@' that contain
    '@@' more than once) are dropped. A leading '+' or '-' is removed from
    the remaining lines; all other lines are kept unchanged.

    Args:
        diff: diff text with '\\n' line separators.

    Returns:
        List of the kept lines, in their original order.
    """
    parsed = []
    for raw in diff.split('\n'):
        if not raw:
            continue
        if raw.startswith('@@') and raw.count('@@') > 1:
            continue
        parsed.append(raw[1:] if raw.startswith(('+', '-')) else raw)
    return parsed

def full_tokenize(s, tokenizer):
    """Tokenize `s` in full and return its token ids as a flat list.

    The tokenizer is called without truncation, without a length limit and
    without special tokens, so the result covers the whole input.

    Args:
        s: text to tokenize.
        tokenizer: object exposing `encode_plus` (called with
            `return_tensors='pt'`).

    Returns:
        list[int]: the token ids. Always a list, including for inputs that
        produce exactly one token.
    """
    encoded = tokenizer.encode_plus(
        s,
        max_length=None,
        truncation=False,
        return_tensors='pt',
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    # Drop only the leading batch dimension. A bare squeeze() would also collapse
    # a one-token sequence to a 0-d tensor, making tolist() return an int.
    return encoded['input_ids'].squeeze(0).tolist()

In [9]:
# Load the tokenizer named in the run configuration rather than repeating the
# checkpoint name; args.model_path is 'microsoft/codebert-base'.
tokenizer = AutoTokenizer.from_pretrained(args.model_path)

In [10]:
# Take the SR_diff value of the first row as a sample input.
diff = code_df.iloc[0]['SR_diff']

In [16]:
# Tokenize the sample diff in full, then decode the ids back to text for inspection.
diff_token_ids = full_tokenize(diff, tokenizer)
tokenizer.decode(diff_token_ids)



In [None]:
def find_diff_stats(diff):
    """Placeholder for diff statistics; not implemented yet.

    The original cell contained only the signature, which is a syntax
    error. The stub keeps the notebook parseable and fails loudly if called.

    Args:
        diff: diff text (unused until the function is implemented).

    Raises:
        NotImplementedError: always.
    """
    # TODO: decide which statistics to compute and implement them.
    raise NotImplementedError('find_diff_stats is not implemented yet')

In [8]:
# Number of distinct values in the train_commit_id column of code_df.
code_df['train_commit_id'].nunique()

1890

In [9]:
# Count of rows per label value in code_df.
code_df['label'].value_counts()

label
0    18114
1     5862
Name: count, dtype: int64

In [14]:
# Load the cached diff/code triplets from the same cache directory as code_df.
triplets_path = os.path.join(cache_path, 'diff_code_triplets.parquet')
triplets_df = pd.read_parquet(triplets_path)

In [15]:
# Show the columns, dtypes, non-null counts and memory usage of the triplets frame.
triplets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421828 entries, 0 to 421827
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   query      421828 non-null  object
 1   file_path  421828 non-null  object
 2   passage    421828 non-null  object
 3   label      421828 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 12.9+ MB


In [16]:
# Count of rows per label value in the triplets frame.
triplets_df['label'].value_counts()

label
0    328390
1     93438
Name: count, dtype: int64

In [3]:
# Data directories of the repositories iterated over below.
# The last path component of each entry is the repository name.
repo_paths = [
    "../data/2_7/apache_spark",
    "../data/2_7/apache_kafka",
    "../data/2_8/angular_angular",
    "../data/2_8/django_django",
    "../data/2_8/pytorch_pytorch",
    "../data/2_7/julialang_julia",
    "../data/2_7/ruby_ruby",
    "../data/2_9/huggingface_transformers",
    "../data/2_9/redis_redis",
    "../data/2_7/facebook_react",
]

In [43]:
# Read the GPT-4 training gold file of every repository and merge them into one frame.
dfs = []
for repo in repo_paths:
    # repo_paths holds full data paths ('../data/2_7/apache_spark'), while the gold
    # files are keyed by the bare repository name. basename also leaves an entry
    # that is already a bare name unchanged.
    gold_repo_name = os.path.basename(os.path.normpath(repo))
    gold_df_path = os.path.join('..', 'gold', gold_repo_name, f'v2_{gold_repo_name}_gpt4_train.parquet')
    # NOTE: this rebinds the notebook-level gold_df on every iteration.
    gold_df = pd.read_parquet(gold_df_path)
    dfs.append(gold_df)

print(len(dfs))
big_gold_df = pd.concat(dfs, ignore_index=True)


10


In [44]:
# Show the columns, dtypes, non-null counts and memory usage of the merged gold frame.
big_gold_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   commit_id                 5000 non-null   string
 1   commit_date               5000 non-null   int64 
 2   commit_message            5000 non-null   string
 3   actual_files_modified     5000 non-null   object
 4   transformed_message_gpt4  5000 non-null   object
dtypes: int64(1), object(2), string(2)
memory usage: 195.4+ KB


In [45]:
# Write the merged training gold frame to the notebook's working directory.
merged_train_path = 'merged_train.parquet'
big_gold_df.to_parquet(merged_train_path)

In [11]:
from utils import get_code_df

In [2]:
# code_df_list = []
# print(repo_paths)
# for repo_path in repo_paths:
#     repo_name = repo_path.split('/')[-1]
#     print(f'processing {repo_name}')
#     index_path = os.path.join(repo_path, 'index_commit_tokenized')
#     K = args.k
#     n = args.n
#     combined_df = get_combined_df(repo_path)
#     BM25_AGGR_STRAT = 'sump'
#     eval_path = os.path.join(repo_path, 'eval')
#     if not os.path.exists(eval_path):
#         os.makedirs(eval_path)
    
#     bm25_searcher = BM25Searcher(index_path)
#     evaluator = SearchEvaluator(metrics)
#     model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)
    
#     gold_df_path = os.path.join('..', 'gold', repo_name, f'v2_{repo_name}_gpt4_train.parquet')

#     recent_df = pd.read_parquet(gold_df_path)
#     recent_df = recent_df.rename(columns={'commit_message': 'original_message', f'transformed_message_{args.openai_model}': 'commit_message'})
#     cache_path = f'{repo_name}_code_df.parquet'
#     code_df = get_code_df(recent_df, bm25_searcher, args.train_depth, args.num_positives, args.num_negatives, combined_df, cache_path, False)
    