In [97]:
import sys
sys.path.append('../src')

import pandas as pd
import os

from utils import get_combined_df, prepare_code_triplets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator

In [3]:
class Args:
    """Minimal attribute container: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Store the keyword arguments directly in the instance namespace.
        vars(self).update(kwargs)

# Experiment configuration, exposed as attributes on `args`.
args = Args(
    index_path='../data/2_7/facebook_react/index_commit_tokenized',
    repo_path='../data/2_7/facebook_react',
    k=1000,
    n=100,
    model_path='microsoft/codebert-base',
    overwrite_cache=False,
    batch_size=32,
    num_epochs=10,
    learning_rate=5e-05,
    run_name='repr_0.1663',
    notes='reproducing current best 0.1663 MAP result for CodeReranker',
    num_positives=10,
    num_negatives=10,
    train_depth=1000,
    num_workers=8,
    train_commits=1000,
    psg_cnt=25,
    aggregation_strategy='sump',
    use_gpu=True,
    rerank_depth=100,
    do_train=True,
    do_eval=True,
    eval_gold=True,
    openai_model='gpt4',
    overwrite_eval=False,
    sanity_check=True,
    debug=False,
    best_model_path=None,
    bert_best_model='data/combined_commit_train/best_model',
    psg_len=350,
    psg_stride=250,
    ignore_gold_in_training=False,
    eval_folder='repr_0.1663',
    use_gpt_train=True,
)

metrics = ['MAP', 'P@1', 'P@10', 'P@20', 'P@30', 'MRR', 'R@1', 'R@10', 'R@100', 'R@1000']
repo_path = args.repo_path
# Last path component of the repo directory; normpath makes this robust to a trailing slash.
repo_name = os.path.basename(os.path.normpath(repo_path))
index_path = args.index_path
K = args.k
n = args.n
combined_df = get_combined_df(repo_path)
BM25_AGGR_STRAT = 'sump'
eval_path = os.path.join(repo_path, 'eval')
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(eval_path, exist_ok=True)

bm25_searcher = BM25Searcher(index_path)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

# Gold file location is derived from repo_name so it follows args.repo_path
# instead of a second hard-coded copy of the repository name.
test_path = os.path.join('..', 'gold', repo_name, f'v2_{repo_name}_gpt4_gold.parquet')
gold_df = pd.read_parquet(test_path)

Loaded index at ../data/2_7/facebook_react/index_commit_tokenized
Index Stats: {'total_terms': 7587973, 'documents': 73765, 'non_empty_documents': 73765, 'unique_terms': 14602}


In [92]:
# Directory from which the cached triplets parquet is read later in the notebook.
cache_path = os.path.join(args.repo_path, 'cache', 'X_diff_split')
# NOTE(review): hard-coded path; for the current config it equals
# <args.repo_path>/cache/<args.run_name>/code_df.parquet -- confirm before reusing with another repo/run.
code_df = pd.read_parquet('../data/2_7/facebook_react/cache/repr_0.1663/code_df.parquet')

In [54]:
def prep_line(line):
    """Return `line` with leading and trailing whitespace removed."""
    return line.strip()

def parse_diff_remove_minus(diff):
    """Return the lines of `diff` that are not deletions, with the '+' marker stripped.

    Dropped lines: those starting with '-', empty lines, hunk headers
    (start with '@@' and contain '@@' at least twice), and lines whose
    whitespace-stripped length is 2 or less.
    """
    kept = []
    for raw in diff.split('\n'):
        if raw.startswith('-') or len(raw) == 0:
            continue
        if raw.startswith('@@') and raw.count('@@') > 1:
            continue
        # Ignore lines that carry (almost) no content once stripped.
        if len(prep_line(raw)) <= 2:
            continue
        kept.append(raw[1:] if raw.startswith('+') else raw)
    return kept

def full_parse_diffs(diff):
    """Return the lines of `diff` with both insertions and deletions kept.

    The leading '+' / '-' marker is removed from changed lines; empty lines
    and hunk headers (start with '@@' and contain '@@' at least twice) are dropped.
    """
    parsed = []
    for raw in diff.split('\n'):
        if not raw:
            continue
        if raw.startswith('@@') and raw.count('@@') > 1:
            continue
        parsed.append(raw[1:] if raw.startswith(('+', '-')) else raw)
    return parsed

def full_parse_diffs_split(diff):
    """Split `diff` into groups of lines, one group per hunk.

    A hunk header (a line starting with '@@' that contains '@@' at least twice)
    closes the current group and is itself discarded. Insertions and deletions
    are both kept, minus their leading '+' / '-' marker; empty lines are skipped.
    Returns a list of non-empty lists of lines.
    """
    hunks = []
    current = []
    for raw in diff.split('\n'):
        if not raw:
            continue
        if raw.startswith('@@') and raw.count('@@') > 1:
            # Header: flush whatever was collected and start a new group.
            if current:
                hunks.append(current)
            current = []
            continue
        current.append(raw[1:] if raw.startswith(('+', '-')) else raw)
    if current:
        hunks.append(current)
    return hunks

def full_tokenize(s, tokenizer):
    """Tokenize `s` without truncation or special tokens and return the ids as a list.

    Args:
        s: text to tokenize.
        tokenizer: object exposing `encode_plus(...)` whose result has an
            'input_ids' tensor (assumed shape (1, seq_len) for a single string
            with return_tensors='pt' -- TODO confirm for other tokenizers).

    Returns:
        list of token ids; always a list, including for a one-token input.
    """
    encoded = tokenizer.encode_plus(
        s,
        max_length=None,
        truncation=False,
        return_tensors='pt',
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    # squeeze(0) removes only the leading batch dimension. A bare squeeze()
    # would also collapse a single-token sequence to a 0-d tensor, so tolist()
    # would return an int and len() on the result would fail.
    return encoded['input_ids'].squeeze(0).tolist()

In [9]:
# NOTE(review): same checkpoint name as args.model_path; consider passing args.model_path so the two cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')

In [79]:
def find_diff_tokens(diff):
    """Return how many tokens `diff` yields under the notebook-level `tokenizer`."""
    return len(full_tokenize(diff, tokenizer))

In [78]:
# code_df = pd.read_parquet('../data/merged_code_df/multi_code_df.parquet')
# code_df.train_commit_id.nunique()

In [73]:
# Average number of tokens per diff. The raw `SR_diff` text is tokenized as-is,
# without any filtering, so the figure is not restricted to insertions.
total_rows = 0
total_diff_tokens = 0
for diff in tqdm(code_df['SR_diff'], total=code_df.shape[0]):
    # Skip rows with no stored diff. The earlier `diff or not pd.isna(diff)`
    # test let NaN through (NaN is truthy) and would pass it to the tokenizer.
    if diff is None or pd.isna(diff):
        continue
    total_diff_tokens += find_diff_tokens(diff)
    total_rows += 1

total_diff_tokens, total_diff_tokens / total_rows

100%|██████████| 61173/61173 [02:48<00:00, 362.16it/s]


(95828835, 1598.1594176311664)

In [77]:
# Average number of tokens per diff split (one split per `@@ ... @@` hunk).
total_diff_splits = 0
total_diff_split_tokens = 0
for diff in tqdm(code_df['SR_diff'], total=code_df.shape[0]):
    # Skip rows with no stored diff. The earlier `diff or not pd.isna(diff)`
    # test let NaN through (NaN is truthy) and would fail on `.split`.
    if diff is None or pd.isna(diff):
        continue
    diff_split_list = full_parse_diffs_split(diff)
    total_diff_splits += len(diff_split_list)
    for diff_split in diff_split_list:
        total_diff_split_tokens += find_diff_tokens('\n'.join(diff_split))

total_diff_split_tokens, total_diff_split_tokens/total_diff_splits

100%|██████████| 61173/61173 [02:46<00:00, 367.34it/s]


(85946036, 326.5960472265606)

In [76]:
# Average number of splits per diff, i.e. the number of `@@ ... @@` hunks,
# which is the number of distinct places where a file is edited.
# NOTE: uses `total_diff_splits` and `total_rows` left behind by the two counting cells; run those first.
total_diff_splits / total_rows

4.388729528701511

In [118]:
def prepare_code_triplets(code_df, args, mode, cache_file, overwrite=False):
    """Build the passage-level triplets for `code_df`, or load them from cache.

    Args:
        code_df: frame handed to the mode-specific builder.
        args: configuration object forwarded to the builder.
        mode: 'sliding_window', 'parse_functions', 'diff_content' or 'diff_subsplit'.
        cache_file: parquet path used to load/save the result; a falsy value disables caching.
        overwrite: when True, an existing cache file is ignored and rebuilt.

    Returns:
        The cached frame if one is loaded; otherwise a new DataFrame with
        columns query, file_path, passage, label.

    Raises:
        ValueError: if `mode` is falsy, or is not a supported mode when building.
    """
    if not mode:
        raise ValueError(f"Mode: {mode} must be specified for preparing code triplets")

    print(f"Preparing code triplets with mode {mode} for {len(code_df)} rows.")

    # Serve from cache unless the caller asked for a rebuild.
    cache_hit = bool(cache_file) and os.path.exists(cache_file) and not overwrite
    if cache_hit:
        print(f"Loading data from cache file: {cache_file}")
        return pd.read_parquet(cache_file)

    # NOTE(review): only prepare_split_diff_triplets is defined in this notebook;
    # the other builders must be in scope before their mode is used.
    if mode == 'sliding_window':
        rows = prepare_sliding_window_triplets(code_df, args)
    elif mode == 'parse_functions':
        rows = prepare_function_triplets(code_df, args)
    elif mode == 'diff_content':
        rows = prepare_diff_content_triplets(code_df, args)
    elif mode == 'diff_subsplit':
        rows = prepare_split_diff_triplets(code_df, args)
    else:
        raise ValueError(f"Unsupported mode: {mode}")

    result = pd.DataFrame(rows, columns=['query', 'file_path', 'passage', 'label'])
    if cache_file:
        print(f"Saving data to cache file: {cache_file}")
        result.to_parquet(cache_file)

    print(result.head(5))

    return result

def prepare_split_diff_triplets(code_df, args):
    """Create one (query, file_path, passage, label) tuple per diff hunk.

    For every row of `code_df`, `SR_diff` is split with `full_parse_diffs_split`
    (insertions and deletions both kept) and each hunk, joined with newlines,
    becomes the passage of a tuple carrying the row's `train_query`,
    `SR_file_path` and `label`. `args` is accepted but not used here.

    Returns:
        list of (train_query, SR_file_path, passage, label) tuples.
    """
    print('Preparing triplets split by diff content (further subplit at @@)')

    rows = []
    for _, record in tqdm(code_df.iterrows(), total=len(code_df)):
        diff_text = record['SR_diff']
        # Rows without a stored diff are skipped (per the original author:
        # probably added files or diffs that could not be stored). A few
        # positives therefore go missing; the author checked this is expected.
        if diff_text is None or pd.isna(diff_text):
            continue
        rows.extend(
            (record['train_query'], record['SR_file_path'], '\n'.join(hunk), record['label'])
            for hunk in full_parse_diffs_split(diff_text)
        )

    return rows

In [123]:
# Load previously generated triplets from the cache directory.
triplets_df = pd.read_parquet(os.path.join(cache_path, 'diff_code_triplets.parquet'))
# Alternative: rebuild the triplets in-notebook instead of reading the cached file.
# triplets_df = prepare_code_triplets(code_df, args, mode='diff_subsplit', cache_file=None)

In [121]:
# Column names, dtypes and non-null counts of the loaded triplets.
triplets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31734 entries, 0 to 31733
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      31734 non-null  object
 1   file_path  31734 non-null  object
 2   passage    31734 non-null  object
 3   label      31734 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 991.8+ KB


In [124]:
# Number of triplets per label value.
triplets_df.label.value_counts()

label
0    26940
1    12169
Name: count, dtype: int64

In [82]:
# Show which cache directory is currently in use.
cache_path

'../data/2_7/facebook_react/cache/4X_random_split'

In [86]:
# NOTE(review): repeats the earlier triplets_df.info() cell; the recorded row counts differ,
# so the two cells were run against different triplets_df contents. Consider keeping only one.
triplets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39109 entries, 0 to 39108
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query      39109 non-null  object
 1   file_path  39109 non-null  object
 2   passage    39109 non-null  object
 3   label      39109 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [94]:
# Raw diff text of the first code_df row.
code_df.iloc[0].SR_diff



In [96]:
# Passage text of the triplet at position 3.
triplets_df.iloc[3].passage



In [16]:
# NOTE(review): repeats the earlier label value_counts cell; the recorded counts differ,
# so it reflects a different triplets_df. Consider keeping only one.
triplets_df.label.value_counts()

label
0    328390
1     93438
Name: count, dtype: int64

In [3]:
# Data directories of the repositories to process.
# Presumably the last path component is the repository name used for the gold files -- verify.
repo_paths = [
    "../data/2_7/apache_spark",
    "../data/2_7/apache_kafka",
    "../data/2_8/angular_angular",
    "../data/2_8/django_django",
    "../data/2_8/pytorch_pytorch",
    "../data/2_7/julialang_julia",
    "../data/2_7/ruby_ruby",
    "../data/2_9/huggingface_transformers",
    "../data/2_9/redis_redis",
    "../data/2_7/facebook_react",
]

In [43]:
# Read the train gold file of every repository and merge them into one frame.
dfs = []
for repo in repo_paths:
    # `repo_paths` holds data-directory paths (e.g. "../data/2_7/apache_spark"),
    # but gold files live under ../gold/<name>/v2_<name>_gpt4_train.parquet,
    # so only the last path component may be used to build the location.
    gold_repo_name = os.path.basename(os.path.normpath(repo))
    gold_df_path = os.path.join('..', 'gold', gold_repo_name, f'v2_{gold_repo_name}_gpt4_train.parquet')
    # Separate name so the test-set `gold_df` loaded in the setup cell is not overwritten.
    train_gold_df = pd.read_parquet(gold_df_path)
    dfs.append(train_gold_df)

print(len(dfs))
big_gold_df = pd.concat(dfs, ignore_index=True)


10


In [44]:
# Column names, dtypes and non-null counts of the merged train gold frame.
big_gold_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   commit_id                 5000 non-null   string
 1   commit_date               5000 non-null   int64 
 2   commit_message            5000 non-null   string
 3   actual_files_modified     5000 non-null   object
 4   transformed_message_gpt4  5000 non-null   object
dtypes: int64(1), object(2), string(2)
memory usage: 195.4+ KB


In [45]:
# Persist the merged frame; the path is relative, so the file lands in the current working directory.
big_gold_df.to_parquet('merged_train.parquet')

In [11]:
from utils import get_code_df

In [2]:
# code_df_list = []
# print(repo_paths)
# for repo_path in repo_paths:
#     repo_name = repo_path.split('/')[-1]
#     print(f'processing {repo_name}')
#     index_path = os.path.join(repo_path, 'index_commit_tokenized')
#     K = args.k
#     n = args.n
#     combined_df = get_combined_df(repo_path)
#     BM25_AGGR_STRAT = 'sump'
#     eval_path = os.path.join(repo_path, 'eval')
#     if not os.path.exists(eval_path):
#         os.makedirs(eval_path)
    
#     bm25_searcher = BM25Searcher(index_path)
#     evaluator = SearchEvaluator(metrics)
#     model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)
    
#     gold_df_path = os.path.join('..', 'gold', repo_name, f'v2_{repo_name}_gpt4_train.parquet')

#     recent_df = pd.read_parquet(gold_df_path)
#     recent_df = recent_df.rename(columns={'commit_message': 'original_message', f'transformed_message_{args.openai_model}': 'commit_message'})
#     cache_path = f'{repo_name}_code_df.parquet'
#     code_df = get_code_df(recent_df, bm25_searcher, args.train_depth, args.num_positives, args.num_negatives, combined_df, cache_path, False)
    