In [4]:
import pandas as pd
import glob
import os
import json
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader
import tiktoken
import datetime
import time
import numpy as np

In [2]:
# encoding = 'cl100k_base'
encoding = 'p50k_base'
enc = tiktoken.get_encoding(encoding)
assert enc.decode(enc.encode("hello world")) == "hello world"

In [3]:
def tokenize(text):
    return ' '.join(map(str,enc.encode(text, disallowed_special=())))

In [5]:
!ls -GFlash data/karpathy_llama2.c/

total 2.4M
   0 drwxr-xr-x 10 siddharth  320 Oct  5 22:02 ./
   0 drwxr-xr-x 12 siddharth  384 Oct  5 22:04 ../
8.0K -rw-r--r--  1 siddharth 6.1K Oct  5 22:02 .DS_Store
   0 drwxr-xr-x  3 siddharth   96 Oct  5 22:02 jsonl/
868K -rw-r--r--  1 siddharth 868K Oct  2 22:02 karpathy_llama2.c_commit_data_0.parquet
616K -rw-r--r--  1 siddharth 615K Oct  2 22:02 karpathy_llama2.c_commit_data_1.parquet
372K -rw-r--r--  1 siddharth 372K Oct  2 22:03 karpathy_llama2.c_commit_data_2.parquet
284K -rw-r--r--  1 siddharth 283K Oct  2 22:03 karpathy_llama2.c_commit_data_3.parquet
240K -rw-r--r--  1 siddharth 238K Oct  2 22:03 karpathy_llama2.c_commit_data_4.parquet
   0 drwxr-xr-x 21 siddharth  672 Oct  3 00:52 searcher/


In [119]:
# Load the parquet file
# tempdf = pd.read_parquet('data/karpathy_llama2.c/karpathy_llama2.c_commit_data_0.parquet')
tempdf = pd.read_parquet('data/apache_kafka/apache_kafka_commit_data_0.parquet')

In [120]:
tempdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5948 entries, 0 to 5947
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   owner                  5948 non-null   string             
 1   repo_name              5948 non-null   string             
 2   commit_date            5948 non-null   datetime64[ns, UTC]
 3   commit_id              5948 non-null   string             
 4   commit_message         5948 non-null   string             
 5   file_path              5948 non-null   string             
 6   previous_commit_id     5948 non-null   string             
 7   previous_file_content  5159 non-null   string             
 8   cur_file_content       5860 non-null   string             
 9   diff                   5072 non-null   string             
 10  status                 5948 non-null   category           
 11  is_merge_request       5948 non-null   bool             

In [5]:
df = pd.read_parquet('data/facebook_react/facebook_react_commit_data_0.parquet')

In [203]:
# get commit 7022e8d6a3222c97d287dfa0f2361acc8a30683a
# df[df['commit_id'] == '7022e8d6a3222c97d287dfa0f2361acc8a30683a']

In [6]:
# (df.head(1)['commit_date'].astype('int64')/1e6).astype('int64')
df.head(1)['commit_date']

0   2023-09-29 18:24:38-04:00
Name: commit_date, dtype: datetime64[us, pytz.FixedOffset(-240)]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73551 entries, 0 to 73550
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype                                 
---  ------                 --------------  -----                                 
 0   owner                  73551 non-null  string                                
 1   repo_name              73551 non-null  string                                
 2   commit_date            73551 non-null  datetime64[us, pytz.FixedOffset(-240)]
 3   commit_id              73551 non-null  string                                
 4   commit_message         73551 non-null  string                                
 5   file_path              73551 non-null  string                                
 6   previous_commit_id     73325 non-null  string                                
 7   previous_file_content  60606 non-null  string                                
 8   cur_file_content       67356 non-null  string           

In [8]:
 # print just the memory usage in human readable format (MB) to 2 decimal places
print(f'{df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB')

2699.89 MB


In [9]:
print('Number of unique commits stored (others excluded for not being code commits):', df.commit_id.nunique())

Number of unique commits stored (others excluded for not being code commits): 11595


In [112]:
# BASE_DIR = 'data/karpathy_llama2.c/'
REPO_LIST = ['karpathy_llama2.c', 'facebook_react', 'apache_kafka', 'ggerganov_llama.cpp', 'nodejs_node']

In [10]:
# REPO_LIST = ['karpathy_llama2.c']
REPONAME = ['facebook_react']

In [61]:
# def convert_data_to_jsonl(data_dir, output_file):
#     all_files = glob.glob(os.path.join(data_dir, '*.parquet'))
#     all_dataframes = [pd.read_parquet(file) for file in all_files]
#     combined_df = pd.concat(all_dataframes, ignore_index=True)
#     # replace NaN with empty string
#     combined_df.fillna('', inplace=True)

#     with open(output_file, 'w') as f:
#         for index, row in combined_df.iterrows():
#             doc = {
#                 'id': row['commit_id'],
#                 'contents': row['commit_message'] + '\n' + row['cur_file_content'],
#                 # Optionally include source code
#                 # 'source_code': row['cur_file_content']
#             }
#             f.write(json.dumps(doc) + '\n')

In [8]:
def count_commits(repo_dir):
    all_files = glob.glob(os.path.join(repo_dir, '*.parquet'))
    all_dataframes = [pd.read_parquet(file) for file in all_files]
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # number of unique commit_id columns
    return combined_df.commit_id.nunique()

In [119]:
total_commits = 0
for repo in REPO_LIST:
    total_commits += count_commits('data/' + repo + '/')

In [120]:
print('Total number of commits:', total_commits)

Total number of commits: 11595


In [9]:
def convert_repo_to_jsonl(repo_dir, output_file, use_tokenizer=False):
    all_files = glob.glob(os.path.join(repo_dir, '*.parquet'))
    all_dataframes = [pd.read_parquet(file) for file in all_files]
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    # replace NaN with empty string in non-category columns
    # combined_df.fillna('', inplace=True)

    combined_df['commit_message'] = combined_df['commit_message'].fillna('')
    combined_df['cur_file_content'] = combined_df['cur_file_content'].fillna('')
    # convert commit_date to int64 (unix timestamp in milliseconds)
    combined_df['commit_date'] = (combined_df['commit_date'].astype('int64') / 1e6).astype('int64')
    # df['commit_date'] = df['commit_date'].astype(str)
    # print(type(df['commit_date'][0]))
    # print combined_df memory usage
    # print(combined_df.info(memory_usage='deep'))
    print(f'Combined Memory Usage: {combined_df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB for {len(combined_df)} rows')
    print(output_file)
    with open(output_file, 'x') as f:
        for index, row in combined_df.iterrows():
            doc = {
                'id': row['commit_id'],
                'contents': row['commit_message'] if not use_tokenizer else tokenize(row['commit_message']),
                # 'source_code': row['cur_file_content'],  # Optionally include source code
                # 'contents': tokenize(row['commit_message']) + '\n' + tokenize(row['cur_file_content']),
                # 'contents': tokenize(row['commit_message']) + '\n' + tokenize(row['cur_file_content']) if use_tokenizer else row['commit_message'] + '\n' + row['cur_file_content'],
                'repo_name': row['repo_name'],
                'file_path': row['file_path'],
                'commit_date': row['commit_date'],
            }
            f.write(json.dumps(doc) + '\n')

In [10]:
# empty data/jsonl if it has data
# !rm -rf data/jsonl_tiktoken

In [15]:
# jsonl_dir_name = 'jsonl_6'
# for repo_name in REPO_LIST:
#     repo_dir = os.path.join('data', repo_name)
#     # create data/jsonl directory if it doesn't exist
#     os.makedirs(os.path.join('data', jsonl_dir_name), exist_ok=True)

#     # store in data/jsonl
#     output_jsonl_file = os.path.join('data', jsonl_dir_name, f'{repo_name}.jsonl')
#     convert_repo_to_jsonl(repo_dir, output_jsonl_file)

In [13]:
# REPO_LIST = ['facebook_react']

In [128]:
# store in data/repo_dir/jsonl
jsonl_dir_name = 'jsonl'
for repo_name in REPO_LIST:
    print(repo_name)
    repo_dir = os.path.join('data', repo_name)
    # create data/jsonl directory if it doesn't exist
    os.makedirs(os.path.join(repo_dir, jsonl_dir_name), exist_ok=True)
    output_name = f'{repo_name}_commit_only_tk.jsonl'
    # store in data/jsonl
    output_jsonl_file = os.path.join(repo_dir, jsonl_dir_name, output_name)
    # if file exists, delete it
    if os.path.exists(output_jsonl_file):
        os.remove(output_jsonl_file)
    convert_repo_to_jsonl(repo_dir, output_jsonl_file, use_tokenizer=True)
    # if not os.path.exists(output_jsonl_file):
    #     convert_repo_to_jsonl(repo_dir, output_jsonl_file, use_tokenizer=True)
    # else:
    #     print('File already exists:', output_jsonl_file)

karpathy_llama2.c
Combined Memory Usage: 18.29 MB for 402 rows
data/karpathy_llama2.c/jsonl/karpathy_llama2.c_commit_only_tk.jsonl
facebook_react
Combined Memory Usage: 2699.89 MB for 73551 rows
data/facebook_react/jsonl/facebook_react_commit_only_tk.jsonl
apache_kafka
Combined Memory Usage: 3645.70 MB for 75870 rows
data/apache_kafka/jsonl/apache_kafka_commit_only_tk.jsonl
ggerganov_llama.cpp
Combined Memory Usage: 604.98 MB for 2111 rows
data/ggerganov_llama.cpp/jsonl/ggerganov_llama.cpp_commit_only_tk.jsonl
nodejs_node
Combined Memory Usage: 11010.96 MB for 208188 rows
data/nodejs_node/jsonl/nodejs_node_commit_only_tk.jsonl


In [62]:
# Usage
# jsonl_file_path = f'{BASE_DIR}/jsonl/llama2.jsonl'
# convert_data_to_jsonl(BASE_DIR, jsonl_file_path)

In [121]:
# # get list of jsonl files which are present in data/repo_name/jsonl/repo_name.jsonl
# jsonl_files = glob.glob('data/*/*/*.jsonl')
# print(jsonl_files)

For normal untokenized
- Parquet -> JSONL 22s
- Index build 1m26s
- 6 repos
    Parquet -> JSONL 1m11s
    Same mem usage as before, just lower time since no need for tokenization
    Index Build 3m51s
    Index Size 5Gb

For tokenized
- Parquet -> JSONL 8m3s
- Index Build 2m12s
- 6 repos:
    Parquert -> JSONL 24m
        - Combined Memory Usage: 18.29 MB for 402 rows data/isonl_6/karpathy_llama2.c.jsonl
        - Combined Memory Usage: 0.94 MB for 108 rows data/json1_6/siddharth-gandhi_refpred.jsonl \\
        - Combined Memory Usage: 2699.89 MB for 73551 rows data/jsonl_6/facebook_react.jsonl \\
        - Combined Memory Usage: 3645.70 MB for 75870 rows data/jsonl_6/apache_kafka. jsonl \\
        - Combined Memory Usage: 605.11 MB for 2111 rows data/jsonl_6/ggerganov_llama.cpp.jsonl \\
        - Combined Memory Usage: 11010.96 MB for 208188 rows data/jsonl_6/nodejs_node.json
        - 36731 total commits 
        - Total ~360K rows
        - Interesting heuristic, on avg 10 files edited per commit?
    Index build 6m42s
    Index Size 10GB

Building the index


In [25]:
# REPO_LIST = ['facebook_react']

In [187]:
%%bash

# Specify the repository list here
REPO_LIST=("karpathy_llama2.c" "facebook_react" "apache_kafka" "ggerganov_llama.cpp" "nodejs_node")

# Loop over each repo in the REPO_LIST array
for repo in "${REPO_LIST[@]}"
do
    # Directory paths
    repo_dir="data/$repo"
    index_dir="$repo_dir/index_tk"
    jsonl_dir_name="$repo_dir/jsonl"

    # Check if the index directory already exists
    # if [ -d "$index_dir" ]; then
    #     echo "Index directory $index_dir already exists. Not doing $repo."
    #     continue  # Skip to the next iteration of the loop
    # fi

    # remove all fiiles in the index directory
    rm -rf "$index_dir"

    # Create the directory if it doesn't exist
    mkdir -p "$index_dir"

    # Build the index from data/jsonl
    python -m pyserini.index.lucene -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
     -threads 4 -input "$jsonl_dir_name" -index "$index_dir" -storePositions -storeDocvectors -storeRaw -impact -pretokenized

    # Log the repo being processed
    echo "Processing $repo"
done

2023-10-10 02:45:48,832 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-10-10 02:45:48,833 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-10-10 02:45:48,833 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: data/karpathy_llama2.c/jsonl
2023-10-10 02:45:48,833 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-10-10 02:45:48,833 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-10-10 02:45:48,834 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 4
2023-10-10 02:45:48,834 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-10-10 02:45:48,834 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-10-10 02:45:48,834 INFO  [main] index.IndexCollection (IndexCollection.java:3

In [185]:
%%bash
# Directory to store the index
# index_dir="./bm25_index_6/"
# jsonl_dir_name="jsonl_6"
repo_dir="data/karpathy_llama2.c"
index_dir="$repo_dir/index_tk"
# jsonl_dir_name="jsonl_tiktoken_6"
jsonl_dir_name="$repo_dir/jsonl"

# Create the directory if it doesn't exist
mkdir -p "$index_dir"

# Remove any existing indexes
rm -rf "$index_dir/*"

echo jsonl_dir_name: "$jsonl_dir_name"
ls -l "$jsonl_dir_name"

# build the index from data/jsonl
python -m pyserini.index.lucene -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
 -threads 4 -input "$jsonl_dir_name" -index "$index_dir" -storePositions -storeDocvectors -storeRaw -impact -pretokenized

jsonl_dir_name: data/karpathy_llama2.c/jsonl
total 100
-rw-r--r-- 1 siddharth staff 101236 Oct 10 02:18 karpathy_llama2.c_commit_only_tk.jsonl
2023-10-10 02:44:47,203 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: data/karpathy_llama2.c/jsonl
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 4
2023-10-10 02:44:47,204 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-10-10 02:44:47,205 INFO  [main] index.

In [14]:
# repo_dir=f"data/{REPO_LIST[0]}"
# repo_dir=f"data/karpathy_llama2.c"

In [178]:
query = 'Refactors Resources to have a more compact and memory efficient struture.'

In [39]:
def convert_date_to_timestamp(date_str):
    date_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d')

    # Convert the datetime object to a UNIX timestamp
    # Method 1: Using timestamp() method
    unix_timestamp_1 = int(date_obj.timestamp())
    return unix_timestamp_1

In [17]:
# https://github.com/facebook/react/commit/7022e8d6a3222c97d287dfa0f2361acc8a30683a
query = "Duplicate Panels Created in Firefox on Multiple chrome.panels.create Calls. I've encountered an issue in Firefox where multiple calls to chrome.panels.create result in the creation of duplicate panels. This seems to happen every time chrome.panels.create is called, even if a panel already exists. This leads to a cluttered interface with many duplicate panels. Ideally, chrome.panels.create should only create a new panel if there isn't one already existing. I believe a check should be implemented to ensure that chrome.panels.create is only called if no panels have been created yet to prevent this duplication issue."
query = "Duplicate Panels Created in Firefox on Multiple chrome.panels.create Calls."
query_date = "2023-08-31"

In [18]:
convert_date_to_timestamp(query_date)

1693454400

In [37]:
def reverse_tokenize(text):
    text = json.loads(text)
    # print(list(text['contents'].split(' ')))
    text['contents'] = enc.decode([int(i) for i in text['contents'].split(' ')])
    # return string
    return json.dumps(text, indent=2)

In [147]:
# lst = [f'{repo_dir}/index/', f'{repo_dir}/index_tk/', f'{repo_dir}/index_nf/', f'{repo_dir}/index_tk_nf/']
# for i in lst:
#     index_reader = IndexReader(i)
#     search = LuceneSearcher(i)
#     print(i)
#     print(index_reader.stats())
#     search_res = search.search(query, k=10) if 'tk' not in i else search.search(tokenize(query), k=10)
#     if 'tk' in i:
#         print(reverse_tokenize(search_res[0].raw))
#     else:
#         print(search_res[0].raw)
#     print(f'Score: {search_res[0].score}')
#     print()

In [45]:
# https://github.com/facebook/react/commit/7022e8d6a3222c97d287dfa0f2361acc8a30683a
query = "Duplicate Panels Created in Firefox on Multiple chrome.panels.create Calls."
query_date = "2023-08-31"


modified_query = "I've encountered an issue in Firefox where multiple calls to chrome.panels.create result in the creation of duplicate panels. This seems to happen every time chrome.panels.create is called, even if a panel already exists. This leads to a cluttered interface with many duplicate panels. Ideally, chrome.panels.create should only create a new panel if there isn't one already existing. I believe a check should be implemented to ensure that chrome.panels.create is only called if no panels have been created yet to prevent this duplication issue."

actual_modified_files = ['packages/react-devtools-extensions/src/main/index.js']

In [42]:
df = pd.read_parquet('data/facebook_react/facebook_react_commit_data_0.parquet')

In [188]:
# https://github.com/facebook/react/commit/d9e00f795b77676fb14f2a3c6f421f48f73bec2a
query = "Stop flowing and then abort if a stream is cancelled"
query_date = "2023-09-22"
query_commit_id = 'd9e00f795b77676fb14f2a3c6f421f48f73bec2a'
actual_modified_files = df[df['commit_id'] == query_commit_id]['file_path'].tolist()

In [59]:
# filter df to only include commits with commit_id d9e00f795b77676fb14f2a3c6f421f48f73bec2a & get the file_path column as a list to get actual_modified_files
# df[df['commit_id'] == 'd9e00f795b77676fb14f2a3c6f421f48f73bec2a']
# actual_modified_files = df[df['commit_id'] == query_commit_id]['file_path'].tolist()

In [72]:
actual_modified_files

['packages/react-dom/src/__tests__/ReactDOMFizzServerBrowser-test.js',
 'packages/react-dom/src/server/ReactDOMFizzServerBrowser.js',
 'packages/react-dom/src/server/ReactDOMFizzServerBun.js',
 'packages/react-dom/src/server/ReactDOMFizzServerEdge.js',
 'packages/react-dom/src/server/ReactDOMFizzServerNode.js',
 'packages/react-dom/src/server/ReactDOMFizzStaticBrowser.js',
 'packages/react-dom/src/server/ReactDOMFizzStaticEdge.js',
 'packages/react-server-dom-esm/src/ReactFlightDOMServerNode.js',
 'packages/react-server-dom-webpack/src/ReactFlightDOMServerBrowser.js',
 'packages/react-server-dom-webpack/src/ReactFlightDOMServerEdge.js',
 'packages/react-server-dom-webpack/src/ReactFlightDOMServerNode.js',
 'packages/react-server-dom-webpack/src/__tests__/ReactFlightDOMBrowser-test.js',
 'packages/react-server/src/ReactFizzServer.js',
 'packages/react-server/src/ReactFlightServer.js']

tokenized with or without flag is the same, so let's just use with flag to avoid recomputing tokens

In [148]:
repo_dir = f"data/facebook_react/"

In [145]:
print(idx_path)

data/nodejs_node/index_tk/


In [191]:
# idx_path = f'{repo_dir}/index_tk/'
idx_path = f'data/facebook_react/index_tk'
bm25searcher = LuceneSearcher(idx_path)
hits = bm25searcher.search(tokenize(modified_query), k=1000)
# print(hits[0])
for i in range(len(hits)):
    # print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')
    # print with repo name and file name
    obj = json.loads(hits[i].raw)
    # print(obj)
    commit_date = int(obj["commit_date"])
    if commit_date > convert_date_to_timestamp(query_date):
        continue
    if obj["file_path"] in actual_modified_files:
        print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f} {obj["repo_name"]}/{obj["file_path"]} {commit_date}')
    # print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f} {obj["repo_name"]}/{obj["file_path"]} {commit_date}')

38 848e802d203e531daf2b9b0edb281a1eb6c5415d 108.84327 react/packages/react-server/src/ReactFizzServer.js 1643990253
39 848e802d203e531daf2b9b0edb281a1eb6c5415d 108.84327 react/packages/react-server/src/ReactFlightServer.js 1643990253
137 ef8bdbecb6dbb9743b895c2e867e5a5264dd6651 87.51058 react/packages/react-server-dom-webpack/src/ReactFlightDOMServerBrowser.js 1678466175
138 ef8bdbecb6dbb9743b895c2e867e5a5264dd6651 87.51058 react/packages/react-server-dom-webpack/src/ReactFlightDOMServerEdge.js 1678466175
139 ef8bdbecb6dbb9743b895c2e867e5a5264dd6651 87.51058 react/packages/react-server-dom-webpack/src/ReactFlightDOMServerNode.js 1678466175
141 ef8bdbecb6dbb9743b895c2e867e5a5264dd6651 87.51058 react/packages/react-server-dom-webpack/src/__tests__/ReactFlightDOMBrowser-test.js 1678466175
146 ef8bdbecb6dbb9743b895c2e867e5a5264dd6651 87.51057 react/packages/react-server/src/ReactFlightServer.js 1678466175
213 c88fb49d37fd01024e0a254a37b7810d107bdd1d 85.80286 react/packages/react-server/src

In [192]:
# we have to write a function to evaluate this behaviour. For now just focus on the perfomance of normal query (without modification)
# the way we do this is by randomly sampling 1000 queries from df and then running the query on the index and then checking if the file is present in the actual_modified_files list. We want to store all hits and return IR metrics like MAP, MRR, P@10, P@100, P@1K, P@10K, NDCG@10, NDCG@100, NDCG@1K, NDCG@10K

# write 2 functions, one for searching and one for evaluating

def search(query, idx_path, query_date, k=1000):
    bm25searcher = LuceneSearcher(idx_path)
    hits = bm25searcher.search(tokenize(query), k)
    # filter hits based on date
    filtered_hits = []
    for i in range(len(hits)):
        obj = json.loads(hits[i].raw)
        commit_date = int(obj["commit_date"])
        if commit_date > convert_date_to_timestamp(query_date):
            continue
        filtered_hits.append(hits[i])
    return filtered_hits

In [193]:

from sklearn.metrics import average_precision_score, ndcg_score

def evaluate(query, idx_path, query_date, actual_modified_files, k=1000):
    hits = search(query, idx_path, query_date, k)

    # Convert the hits to a list of filenames
    retrieved_files = [json.loads(hit.raw)['file_path'] for hit in hits]

    # Generate binary relevance judgments based on the actual_modified_files
    relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]

    if sum(relevant) == 0:
        return {
            'MAP': 0,
            'P@10': 0,
            'P@100': 0,
            'P@1K': 0,
            # 'P@10K': 0,
            'MRR': 0,
            'Recall@1K': 0
            # 'NDCG@10': 0,
            # 'NDCG@100': 0,
            # 'NDCG@1K': 0,
            # 'NDCG@10K': 0
        }
    # Calculate the metrics
    MAP = average_precision_score(relevant, [1]*len(relevant))
    unique_relevant_files = {
        file for idx, file in enumerate(retrieved_files) if relevant[idx] == 1
    }
    recall = len(unique_relevant_files) / len(actual_modified_files)
    # recall = sum(relevant) / len(actual_modified_files)
    # also calculate MRR
    MRR = mean_reciprocal_rank(relevant)
    precision_values = [precision_at_k(relevant, k_val) for k_val in [10, 100, 1000]]

    #todo NDCG calculations - no multi-label support as of now
    # true_relevance = [[rel] for rel in relevant]
    # scores = [[1] for _ in relevant]  # assuming all the retrieved files are equally relevant
    # NDCG_values = [ndcg_score(true_relevance, scores, k=k_val) for k_val in [10, 100, 1000, 10000]]

    metrics = {
        'MAP': MAP,
        'P@10': precision_values[0],
        'P@100': precision_values[1],
        'P@1K': precision_values[2],
        # 'P@10K': precision_values[3],
        'MRR': MRR,
        'Recall@1K': recall
        # 'NDCG@10': NDCG_values[0],
        # 'NDCG@100': NDCG_values[1],
        # 'NDCG@1K': NDCG_values[2],
        # 'NDCG@10K': NDCG_values[3]
    }
    # round all the values to 4 decimal places
    metrics = {k: round(v, 4) for k, v in metrics.items()}
    return metrics

def precision_at_k(relevant, k):
    return sum(relevant[:k]) / k

def mean_reciprocal_rank(relevant):
    for idx, value in enumerate(relevant):
        if value == 1:
            return 1 / (idx + 1)
    return 0

In [140]:
idx_path

'data/facebook_react//index_tk_nf/'

In [194]:
evaluate(query, idx_path, query_date, actual_modified_files, k=100)

{'MAP': 0.0723,
 'P@10': 0.0,
 'P@100': 0.06,
 'P@1K': 0.006,
 'MRR': 0.0455,
 'Recall@1K': 0.3571}

In [195]:
# Assuming df is your data frame
sampled_commits = df.drop_duplicates(subset='commit_id').sample(100, replace=False, random_state=42)

results = []

for index, row in sampled_commits.iterrows():
    query = row['commit_message']
    query_date = row['commit_date'].strftime('%Y-%m-%d')
    query_commit_id = row['commit_id']
    actual_modified_files = df[df['commit_id'] == query_commit_id]['file_path'].tolist()

    result = evaluate(query, idx_path, query_date, actual_modified_files)
    results.append(result)

# Compute average scores
avg_scores = {}
metrics = ['MAP', 'MRR', 'P@10', 'P@100', 'P@1K', 'Recall@1K']
for metric in metrics:
    avg_scores[metric] = np.mean([result[metric] for result in results])

# round all the values to 4 decimal places
avg_scores = {k: round(v, 4) for k, v in avg_scores.items()}
print(avg_scores)

{'MAP': 0.0219, 'MRR': 0.121, 'P@10': 0.052, 'P@100': 0.0188, 'P@1K': 0.0068, 'Recall@1K': 0.5148}


In [211]:
# generalize sampling across all repos by making a function which does it for each repo_name in REPO_LIST

def evaluate_sampling(repo_dir, idx_path, n=100):
    metrics = ['MAP', 'MRR', 'P@10', 'P@100', 'P@1K', 'Recall@1K']
    all_files = glob.glob(os.path.join(repo_dir, '*.parquet'))
    all_dataframes = [pd.read_parquet(file) for file in all_files]
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    print(f'Index path: {idx_path}')
    total_commits = combined_df.commit_id.nunique()
    print(f'Total commits: {total_commits}')
    if total_commits < 100:
        print(f'Not enough commits to sample for {repo_dir}, skipping...')
        return {metric: 0 for metric in metrics}
    # n = total_commits // 10 if total_commits > 10 else 1
    print(f'Processing {repo_dir} with {n} samples')

    sampled_commits = combined_df.drop_duplicates(subset='commit_id').sample(n, replace=False, random_state=42)
    print(f'Number of commits sampled: {len(sampled_commits)}')
    results = []
    for index, row in sampled_commits.iterrows():
        query = row['commit_message']
        query_date = row['commit_date'].strftime('%Y-%m-%d')
        query_commit_id = row['commit_id']
        actual_modified_files = combined_df[combined_df['commit_id'] == query_commit_id]['file_path'].tolist()

        result = evaluate(query, idx_path, query_date, actual_modified_files)
        results.append(result)
    avg_scores = {
        metric: np.mean([result[metric] for result in results])
        for metric in metrics
    }
    # round all the values to 4 decimal places
    avg_scores = {k: round(v, 4) for k, v in avg_scores.items()}
    return avg_scores

In [214]:
evaluate_sampling('data/apache_kafka/', 'data/apache_kafka/index_tk', n=100)

Index path: data/apache_kafka/index_tk
Total commits: 10438
Processing data/apache_kafka/ with 1000 samples
Number of commits sampled: 1000


{'MAP': 0.0,
 'MRR': 0.0,
 'P@10': 0.0,
 'P@100': 0.0,
 'P@1K': 0.0,
 'Recall@1K': 0.0}

In [96]:
repo_dir

'data/facebook_react/'

In [110]:
from tqdm import tqdm

In [197]:
REPO_LIST = ['karpathy_llama2.c',
 'facebook_react',
 'apache_kafka',
 'ggerganov_llama.cpp',
 'nodejs_node']

In [198]:
metrics = ['MAP', 'MRR', 'P@10', 'P@100', 'P@1K', 'Recall@1K']
res = []
for repo_name in tqdm(REPO_LIST):
    repo_dir = f'data/{repo_name}/'
    idx_path = f'{repo_dir}/index_tk/'
    print(f'Processing {repo_dir}')
    avg_scores = evaluate_sampling(repo_dir, idx_path)
    res.append(avg_scores)
    print(avg_scores)

# avg scores for all repos
avg_scores = {}
for metric in metrics:
    avg_scores[metric] = np.mean([result[metric] for result in res])
print(f'Average scores for all repos: {avg_scores}')
# evaluate_sampling(repo_dir)

  0%|          | 0/5 [00:00<?, ?it/s]

Processing data/karpathy_llama2.c/
Index path: data/karpathy_llama2.c//index_tk/
Processing data/karpathy_llama2.c/ with 100 samples


 20%|██        | 1/5 [00:01<00:04,  1.13s/it]

{'MAP': 0.0, 'MRR': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1K': 0.0, 'Recall@1K': 0.0}
Processing data/facebook_react/
Index path: data/facebook_react//index_tk/
Processing data/facebook_react/ with 100 samples


 40%|████      | 2/5 [00:11<00:19,  6.58s/it]

{'MAP': 0.0219, 'MRR': 0.121, 'P@10': 0.052, 'P@100': 0.0188, 'P@1K': 0.0068, 'Recall@1K': 0.5148}
Processing data/apache_kafka/
Index path: data/apache_kafka//index_tk/
Processing data/apache_kafka/ with 100 samples


 60%|██████    | 3/5 [00:20<00:14,  7.49s/it]

{'MAP': 0.0, 'MRR': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1K': 0.0, 'Recall@1K': 0.0}
Processing data/ggerganov_llama.cpp/
Index path: data/ggerganov_llama.cpp//index_tk/
Processing data/ggerganov_llama.cpp/ with 100 samples


 80%|████████  | 4/5 [00:24<00:06,  6.12s/it]

{'MAP': 0.0, 'MRR': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1K': 0.0, 'Recall@1K': 0.0}
Processing data/nodejs_node/
Index path: data/nodejs_node//index_tk/
Processing data/nodejs_node/ with 100 samples


100%|██████████| 5/5 [00:51<00:00, 10.21s/it]

{'MAP': 0.0, 'MRR': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1K': 0.0, 'Recall@1K': 0.0}
Average scores for all repos: {'MAP': 0.00438, 'MRR': 0.0242, 'P@10': 0.0104, 'P@100': 0.0037600000000000003, 'P@1K': 0.0013599999999999999, 'Recall@1K': 0.10296000000000001}





In [57]:
# llama2.c
# query = 'nInference for Llama-2 Transformer model in pure C'

# refpred
# query = 'if is_arxiv:\n return f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{paper_id}/references?fields=title,
# abstract,url,venue,publicationVenue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess'

# react
# query = "export {default} from './npm/Circle';"

# kafka
# public class MockKafkaLog4jAppender extends KafkaLog4jAppender {
#     private MockProducer<byte[], byte[]> mockProducer =
#             new MockProducer<>(false, new MockSerializer(), new MockSerializer());

#     private Properties producerProperties;

#     @Override
#     protected Producer<byte[], byte[]> getKafkaProducer(Properties props) {
#         producerProperties = props;
#         return mockProducer;
#     }

#     void setKafkaProducer(MockProducer<byte[], byte[]> producer) {
#         this.mockProducer = producer;
#     }
# """

# Kakfa
# query = """
# /**
#  * Local file based quorum state store. It takes the JSON format of {@link QuorumStateData}
#  * with an extra data version number as part of the data for easy deserialization.
#  *
#  * Example format:
#  * <pre>
#  * {"clusterId":"",
#  *   "leaderId":1,
#  *   "leaderEpoch":2,
#  *   "votedId":-1,
#  *   "appliedOffset":0,
#  *   "currentVoters":[],
#  *   "data_version":0}
#  * </pre>
#  * */

# """

# kakfa
query = """Convert coordinator retriable errors to a known producer…
… response error (#14378)

KIP-890 Part 1 tries to address hanging transactions on old clients. Thus, the produce version can not be bumped and no new errors can be added. Before we used the java client's notion of retriable and abortable errors -- retriable errors are defined as such by extending the retriable error class, fatal errors are defined explicitly, and abortable errors are the remaining. However, many other clients treat non specified errors as fatal and that means many retriable errors kill the application."""

# kakfa
# query = """Fix flaky TopicAdminTest::retryEndOffsetsShouldRetryWhenTopicNotFound test case"""

# nodejs
# query = """bool ShouldAbortOnUncaughtException(Isolate* isolate) {
#   DebugSealHandleScope scope(isolate);
#   Environment* env = Environment::GetCurrent(isolate);
#   return env != nullptr &&
#          (env->is_main_thread() || !env->is_stopping()) &&
#          env->abort_on_uncaught_exception() &&
#          env->should_abort_on_uncaught_toggle()[0] &&
#          !env->inside_should_not_abort_on_uncaught_scope();
# }"""

In [58]:
bm25searcher = LuceneSearcher('bm25_index_6/')
hits = bm25searcher.search(query, k=10)
# print(hits[0])
for i in range(len(hits)):
    # print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')
    # print with repo name and file name
    obj = json.loads(hits[i].raw)
    print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f} {obj["repo_name"]}/{obj["file_path"]}')

 1 5aecd2825644728f68a26558c957f5dfd4643423 99.51060 kafka/core/src/main/scala/kafka/server/ReplicaManager.scala
 2 29a1a16668d76a1cc04ec9e39ea13026f2dce1de 82.57980 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 3 5aad085a8e7514c14a17121d316a2e2b2add8bcc 81.72260 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 4 5aecd2825644728f68a26558c957f5dfd4643423 81.36090 kafka/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala
 5 ef09a2e3fc11a738f6681fd57fb84ad109593fd3 80.57710 kafka/core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala
 6 f5d5f654db359af077088685e29fbe5ea69616cf 79.69870 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 7 2b6365c78b6e659f8df0651a24013d028f39edd9 79.64400 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 8 ff77b3ad041c1a4c80119f960e1f

In [13]:
index_reader = IndexReader('idx_karpathy/')
index_reader.stats()

{'total_terms': 696778,
 'documents': 402,
 'non_empty_documents': 402,
 'unique_terms': 6840}

In [14]:
from pyserini.index import IndexReader

In [18]:
index_reader = IndexReader('idx_karpathy_double_token/')

In [19]:
index_reader.dump_documents_BM25('tmp/idx_karpathy_double.jsonl')

100%|██████████| 402/402 [00:02<00:00, 190.05it/s]


In [12]:
index_reader = IndexReader('idx_karpathy_double_token/')
index_reader.stats()

{'total_terms': 578447,
 'documents': 402,
 'non_empty_documents': 402,
 'unique_terms': 3034}

In [59]:
tiktoken_searcher = LuceneSearcher('bm25_index_tiktoken_6/')
# get tokenized query with enc.encode
tokeninzed_query = tokenize(query)
hits = tiktoken_searcher.search(tokeninzed_query, k=10)
# print(hits[0])
for i in range(len(hits)):
    # print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')
    # print with repo name and file name
    obj = json.loads(hits[i].raw)
    print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f} {obj["repo_name"]}/{obj["file_path"]}')

 1 5aecd2825644728f68a26558c957f5dfd4643423 141.63670 kafka/core/src/main/scala/kafka/server/ReplicaManager.scala
 2 5aecd2825644728f68a26558c957f5dfd4643423 112.99820 kafka/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala
 3 5aad085a8e7514c14a17121d316a2e2b2add8bcc 111.59350 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 4 ff77b3ad041c1a4c80119f960e1f87c07b9e93dd 111.57550 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 5 29a1a16668d76a1cc04ec9e39ea13026f2dce1de 110.54000 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 6 ea0bb001262320bc9233221955a2be31c85993b9 109.68660 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 7 f5d5f654db359af077088685e29fbe5ea69616cf 109.62250 kafka/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
 8 b937ec7567

In [46]:
tiktoken_index_reader = IndexReader('bm25_index_tiktoken_6/')
tiktoken_index_reader.stats()

{'total_terms': 2698903862,
 'documents': 360230,
 'non_empty_documents': 360230,
 'unique_terms': -1}

In [47]:
# print the document source code inside the first hit raw
content = json.loads(hits[0].raw)['contents']

# print the document source code inside the first hit raw by decoding the tokenized string with enc.decode (convert to array of int and then decode)
# print(enc.decode(json.loads(hits[0].raw)['contents']))

# convert content to array of int
content_arr = [int(i) for i in content.split()]

In [48]:
print(enc.decode(content_arr))

worker: fix --abort-on-uncaught-exception handling

The `set_abort_on_uncaught_exception(false)` line was supposed to
prevent aborting when running Workers in
`--abort-on-uncaught-exception` mode, but it was incorrectly set
and not checked properly in the should-abort callback.

PR-URL: https://github.com/nodejs/node/pull/34724
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: Richard Lau <riclau@uk.ibm.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Mary Marchini <oss@mmarchini.me>

#include "node.h"
#include "node_context_data.h"
#include "node_errors.h"
#include "node_internals.h"
#include "node_native_module_env.h"
#include "node_platform.h"
#include "node_v8_platform-inl.h"
#include "uv.h"

#if HAVE_INSPECTOR
#include "inspector/worker_inspector.h"  // ParentInspectorHandle
#endif

namespace node {
using errors::TryCatchScope;
using v8::Array;
using v8::Context;
using v8::EscapableHandleScope;
using v8::Function;
using v8::FunctionCallbackInfo;
using v8::H