In [19]:
from utils import set_seed, get_combined_df
from bm25_v2 import BM25Searcher
from eval import ModelEvaluator, SearchEvaluator
import os
from sklearn.metrics import average_precision_score, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
import sys
sys.path.append('../src')

In [20]:
INDEX_PATH = '../2_7/apache_kafka/index_commit_tokenized'
REPO_PATH = '../2_7/apache_kafka'
K = 1000 # initial ranking depth
N = 100 # number of samples
BM25_AGGR_STRAT = 'sump'

In [21]:
set_seed(42)

In [22]:
metrics = ['MAP', 'P@10', 'P@100', 'P@1000', 'MRR', 'Recall@100', 'Recall@1000']

In [23]:
combined_df = get_combined_df(REPO_PATH)

In [24]:
eval_path = '../misc'
bm25_output_path = os.path.join(eval_path, f'bm25_baseline_N{N}_K{K}_metrics.txt')

In [25]:
bm25_searcher = BM25Searcher(INDEX_PATH)
evaluator = SearchEvaluator(metrics)
model_evaluator = ModelEvaluator(bm25_searcher, evaluator, combined_df)

Loaded index at ../2_7/apache_kafka/index_commit_tokenized
Index Stats: {'total_terms': 10796945, 'documents': 75655, 'non_empty_documents': 75655, 'unique_terms': 15591}


In [24]:
bm25_baseline_eval = model_evaluator.evaluate_sampling(n=N, k=K, output_file_path=bm25_output_path, aggregation_strategy=BM25_AGGR_STRAT, repo_path=REPO_PATH)

print("BM25 Baseline Evaluation")
print(bm25_baseline_eval)

100%|██████████| 100/100 [00:26<00:00,  3.75it/s]

Evaluation results written to ../misc/bm25_baseline_N100_K1000_metrics.txt
BM25 Baseline Evaluation
{'MAP': 0.2137, 'P@10': 0.114, 'P@100': 0.034, 'P@1000': 0.0053, 'MRR': 0.305, 'Recall@100': 0.5517, 'Recall@1000': 0.7426}





In [41]:
def sample_query(df, seed=42):
    """
    Sample a query from the dataframe
    """
    sampled_commit = df.drop_duplicates(subset='commit_id').sample(1, random_state=seed).iloc[0]
    return {
        'commit_message': sampled_commit['commit_message'],
        'commit_id': sampled_commit['commit_id'],
        'commit_date': sampled_commit['commit_date'],
        'actual_files_modified': df[df['commit_id'] == sampled_commit['commit_id']]['file_path'].tolist(),
        'diff_text': df[df['commit_id'] == sampled_commit['commit_id']]['diff'].tolist()
    }

In [45]:
query = sample_query(combined_df)
print(f'Commid Message: {query["commit_message"]}')
print(f'Acutal Files Modified: {query["actual_files_modified"]}')
print(f'Diff Text: {query["diff_text"]}')

Commid Message: produce/fetch remote time metric not set correctly when num.acks = 1; patched by Jun Rao; reviewed by Neha Narkhede; KAFKA-584

git-svn-id: https://svn.apache.org/repos/asf/incubator/kafka/branches/0.8@1402250 13f79535-47bb-0310-9956-ffa450edef68

Acutal Files Modified: ['core/src/main/scala/kafka/network/RequestChannel.scala']
Diff Text: ['@@ -40,9 +40,9 @@ object RequestChannel extends Logging {\n   }\n \n   case class Request(processor: Int, requestKey: Any, buffer: ByteBuffer, startTimeMs: Long) {\n-    var dequeueTimeMs = -1L\n-    var apiLocalCompleteTimeMs = -1L\n-    var responseCompleteTimeMs = -1L\n+    @volatile var dequeueTimeMs = -1L\n+    @volatile var apiLocalCompleteTimeMs = -1L\n+    @volatile var responseCompleteTimeMs = -1L\n     val requestId = buffer.getShort()\n     val requestObj: RequestOrResponse = RequestKeys.deserializerForKey(requestId)(buffer)\n     buffer.rewind()\n@@ -50,6 +50,10 @@ object RequestChannel extends Logging {\n \n     def upda

In [46]:
query = sample_query(combined_df, seed=24)
print(f'Commid Message: {query["commit_message"]}')
print(f'Acutal Files Modified: {query["actual_files_modified"]}')
print(f'Diff Text: {query["diff_text"]}')

Commid Message: KAFKA-13946; Add missing parameter to kraft test kit `ControllerNode.setMetadataDirectory()` (#12225)

Added parameter `metadataDirectory` to `setMetadataDirectory()` so that `this.metadataDirectory` would not be set to itself.

Reviewers: Kvicii <42023367+Kvicii@users.noreply.github.com>, dengziming <dengziming1993@gmail.com>, Jason Gustafson <jason@confluent.io>
Acutal Files Modified: ['core/src/test/java/kafka/testkit/ControllerNode.java']
Diff Text: ['@@ -27,7 +27,7 @@ public class ControllerNode implements TestKitNode {\n             return this;\n         }\n \n-        public Builder setMetadataDirectory() {\n+        public Builder setMetadataDirectory(String metadataDirectory) {\n             this.metadataDirectory = metadataDirectory;\n             return this;\n         }']


In [47]:
# find number of unique commits
print(f'Number of unique commits: {len(combined_df.drop_duplicates(subset="commit_id"))}')

Number of unique commits: 10445


Commid Message: produce/fetch remote time metric not set correctly when num.acks = 1; patched by Jun Rao; reviewed by Neha Narkhede; KAFKA-584

git-svn-id: https://svn.apache.org/repos/asf/incubator/kafka/branches/0.8@1402250 13f79535-47bb-0310-9956-ffa450edef68

Acutal Files Modified: ['core/src/main/scala/kafka/network/RequestChannel.scala']
Diff Text: ['@@ -40,9 +40,9 @@ object RequestChannel extends Logging {\n   }\n \n   case class Request(processor: Int, requestKey: Any, buffer: ByteBuffer, startTimeMs: Long) {\n-    var dequeueTimeMs = -1L\n-    var apiLocalCompleteTimeMs = -1L\n-    var responseCompleteTimeMs = -1L\n+    @volatile var dequeueTimeMs = -1L\n+    @volatile var apiLocalCompleteTimeMs = -1L\n+    @volatile var responseCompleteTimeMs = -1L\n     val requestId = buffer.getShort()\n     val requestObj: RequestOrResponse = RequestKeys.deserializerForKey(requestId)(buffer)\n     buffer.rewind()\n@@ -50,6 +50,10 @@ object RequestChannel extends Logging {\n \n     def upda

In [35]:
print('Commit Message:', query['commit_message'].values[0])
print(f'Files Changed: {query["file_path"].values[0]}')
print(f'Diff: {query["diff"].values[0]}')

Commit Message: KAFKA-5505: Incremental cooperative rebalancing in Connect (KIP-415) (#6363)

Added the incremental cooperative rebalancing in Connect to avoid global rebalances on all connectors and tasks with each new/changed/removed connector. This new protocol is backward compatible and will work with heterogeneous clusters that exist during a rolling upgrade, but once the clusters consist of new workers only some affected connectors and tasks will be rebalanced: connectors and tasks on existing nodes still in the cluster and not added/changed/removed will continue running while the affected connectors and tasks are rebalanced.

This commit attempted to minimize the changes to the existing V0 protocol logic, though that was not entirely possible.

This commit adds extensive unit and integration tests for both the old V0 protocol and the new v1 protocol. Soak testing has been performed multiple times to verify behavior while connectors and added, changed, and removed and while worke

In [2]:
def precision_at_k(relevant, k):
    return sum(relevant[:k]) / k


def mean_reciprocal_rank(relevant):
    for idx, value in enumerate(relevant):
        if value == 1:
            return 1 / (idx + 1)
    return 0

def calculate_average_precision(relevant):
    pred_rel = [1] * len(relevant)
    relevant_documents_count = 0
    cumulative_precision = 0.0

    # We iterate through the predicted relevance scores
    for i in range(len(pred_rel)):
        # Check if the prediction at this rank is correct (i.e., if it is a relevant document)
        if pred_rel[i] == 1 and relevant[i] == 1:
            relevant_documents_count += 1
            precision_at_i = relevant_documents_count / (i + 1)
            cumulative_precision += precision_at_i

    # The average precision is the cumulative precision divided by the number of relevant documents
    average_precision = cumulative_precision / sum(relevant) if sum(relevant) > 0 else 0
    return average_precision

# @staticmethod
# def calculate_recall(relevant, total_modified_files, k):
#   # Does not work for commit based approach as it can have multiple mentions of the same file across commits leading to a higher than 1 recall
#     print(total_modified_files)
#     print(relevant)
#     return sum(relevant[:k]) / total_modified_files


def calculate_recall(retrieved_files, actual_modified_files, relevant, k):
    # this complicated mess is required as compared to the above much simpler code to support both commit-based and file-based approaches
    # in file-based approach, this is equivalent to the above code
    # in code-based approach, duplicates could be present in retrieved_files, which is why we need to filter them out (the above code would not work in this case)

    return len({file for idx, file in enumerate(retrieved_files[:k])
                    if relevant[idx] == 1
                }) / len(actual_modified_files) if len(actual_modified_files) > 0 else 0

In [3]:
def demo():
    # a demo to show how P, R, MAP, MRR are calculated

    # this is for a single query, meaning a eval_commit_msg and it's corresponding actual_modified_files. this is taken from the data itself
    print('Query 1')
    print('All file paths are distinct (because of BM25 modification from last time)')
    k = 3
    actual_modified_files = ['a', 'b', 'c', 'd', 'e', 'f']
    retrieved_files = ['q', 'b', 'd', 'x', 'a', 'z', 'c']
    relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files] # relevant=[0, 1, 1, 0, 1, 0, 1]
    print(f'{actual_modified_files=}')
    print(f'{retrieved_files=}')
    print(f'{relevant=}')
    # P@k
    # 2/3
    p_at_k = precision_at_k(relevant, k)
    print(f'P@{k} = {p_at_k}')

    # MRR
    # 1/2
    mrr = mean_reciprocal_rank(relevant)
    print(f'MRR = {mrr}')

    # MAP
    # (1/2 + 2/3 + 3/5 + 4/7) / 4
    _map = calculate_average_precision(relevant)
    print(f'MAP = {_map}')

    # Recall@k
    # 2/6
    recall_at_k = calculate_recall(retrieved_files, actual_modified_files, relevant, k)
    print(f'Recall@{k} = {recall_at_k}')

    evaluation = {
        'P@k': p_at_k,
        'MRR': mrr,
        'MAP': _map,
        'Recall@k': recall_at_k
    }
    return evaluation


r1 = demo()


Query 1
All file paths are distinct (because of BM25 modification from last time)
actual_modified_files=['a', 'b', 'c', 'd', 'e', 'f']
retrieved_files=['q', 'b', 'd', 'x', 'a', 'z', 'c']
relevant=[0, 1, 1, 0, 1, 0, 1]
P@3 = 0.6666666666666666
MRR = 0.5
MAP = 0.5845238095238094
Recall@3 = 0.3333333333333333


In [4]:

def demo2():
    print('Query 2')
    print('All file paths are distinct (because of BM25 modification from last time)')
    k = 3
    actual_modified_files = ['p', 'q', 'r', 'a', 'b']
    retrieved_files = ['r', 'b', 'p', 'u', 'a', 'q']

    print(f'{actual_modified_files=}')
    print(f'{retrieved_files=}')
    relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]
    print(f'{relevant=}')
    # P@k
    p_at_k = precision_at_k(relevant, k)
    print(f'P@{k} = {p_at_k}')

    # MRR
    mrr = mean_reciprocal_rank(relevant)
    print(f'MRR = {mrr}')

    # MAP
    _map = calculate_average_precision(relevant)
    print(f'MAP = {_map}')

    # Recall@k
    recall_at_k = calculate_recall(retrieved_files, actual_modified_files, relevant, k)
    print(f'Recall@{k} = {recall_at_k}')

    evaluation = {
        'P@k': p_at_k,
        'MRR': mrr,
        'MAP': _map,
        'Recall@k': recall_at_k
    }
    return evaluation

r2 = demo2()

Query 2
All file paths are distinct (because of BM25 modification from last time)
actual_modified_files=['p', 'q', 'r', 'a', 'b']
retrieved_files=['r', 'b', 'p', 'u', 'a', 'q']
relevant=[1, 1, 1, 0, 1, 1]
P@3 = 1.0
MRR = 1.0
MAP = 0.9266666666666665
Recall@3 = 0.6


In [12]:
# Macro Average
print('Query 1')
print(r1)
print('Query 2')
print(r2)

print('Macro Average')
c1=  {k: (r1[k] + r2[k]) / 2 for k in r1.keys()}
print(c1)

Query 1
{'P@k': 0.6666666666666666, 'MRR': 0.5, 'MAP': 0.5845238095238094, 'Recall@k': 0.3333333333333333}
Query 2
{'P@k': 1.0, 'MRR': 1.0, 'MAP': 0.9266666666666665, 'Recall@k': 0.6}
Macro Average
{'P@k': 0.8333333333333333, 'MRR': 0.75, 'MAP': 0.755595238095238, 'Recall@k': 0.4666666666666667}


In [15]:
def file_based_demo():

    def file_based_evaluation(retrievals, k):
        # retrievals is a list of tuples: (file, [list of positions in retrieved results across different queries])
        # Example: [('a', [5, -1]), ('b', [2, 1]), ...] where -1 indicates the file was not retrieved in that query

        file_evaluations = {}

        for file, positions in retrievals:
            precisions = []
            recalls = []
            rr_list = []  # List for Reciprocal Rank calculations

            for pos in positions:
                if pos != -1:  # File is retrieved
                    # Calculate precision and recall at k
                    precision = 1 / (pos + 1) if pos < k else 0
                    recall = 1 if pos < k else 0

                    precisions.append(precision)
                    recalls.append(recall)

                    # Calculate Reciprocal Rank
                    rr_list.append(1 / (pos + 1))

            # Calculate average precision, recall, and mean reciprocal rank for the file
            avg_precision = sum(precisions) / len(precisions) if precisions else 0
            avg_recall = sum(recalls) / len(recalls) if recalls else 0
            mrr = sum(rr_list) / len(rr_list) if rr_list else 0

            file_evaluations[file] = {
                'Average Precision': avg_precision,
                'Average Recall': avg_recall,
                'MRR': mrr
            }

        return file_evaluations


    # get file_positions
    queries = [
        {
            'actual_modified_files': ['a', 'b', 'c', 'd', 'e', 'f'],
            'retrieved_files': ['q', 'b', 'd', 'x', 'a', 'z', 'c']
        },
        {
            'actual_modified_files': ['p', 'q', 'r', 'a', 'b'],
            'retrieved_files': ['r', 'b', 'p', 'u', 'a', 'q']
        }
    ]
    file_positions = {}
    for query in queries:
        actual_modified_files = query['actual_modified_files']
        retrieved_files = query['retrieved_files']
        relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]
        for idx, file in enumerate(retrieved_files):
            if file not in file_positions:
                file_positions[file] = []
            if relevant[idx] == 1:
                file_positions[file].append(idx)
            else:
                file_positions[file].append(-1)

    print(f'{file_positions=}')
    k = 3

    file_based_results = file_based_evaluation(file_positions.items(), k)

    # Print the evaluation results for each file
    for file, metrics in file_based_results.items():
        print(f"File: {file}, Metrics: {metrics}")

    # Print the average evaluation results for all files
    avg_precision = sum([metrics['Average Precision'] for metrics in file_based_results.values()]) / len(file_based_results)
    avg_recall = sum([metrics['Average Recall'] for metrics in file_based_results.values()]) / len(file_based_results)
    avg_mrr = sum([metrics['MRR'] for metrics in file_based_results.values()]) / len(file_based_results)
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average MRR: {avg_mrr}")

    return {
        'Average Precision': avg_precision,
        'Average Recall': avg_recall,
        'MRR': avg_mrr
    }

c2 = file_based_demo()

file_positions={'q': [-1, 5], 'b': [1, 1], 'd': [2], 'x': [-1], 'a': [4, 4], 'z': [-1], 'c': [6], 'r': [0], 'p': [2], 'u': [-1]}
File: q, Metrics: {'Average Precision': 0.0, 'Average Recall': 0.0, 'MRR': 0.16666666666666666}
File: b, Metrics: {'Average Precision': 0.5, 'Average Recall': 1.0, 'MRR': 0.5}
File: d, Metrics: {'Average Precision': 0.3333333333333333, 'Average Recall': 1.0, 'MRR': 0.3333333333333333}
File: x, Metrics: {'Average Precision': 0, 'Average Recall': 0, 'MRR': 0}
File: a, Metrics: {'Average Precision': 0.0, 'Average Recall': 0.0, 'MRR': 0.2}
File: z, Metrics: {'Average Precision': 0, 'Average Recall': 0, 'MRR': 0}
File: c, Metrics: {'Average Precision': 0.0, 'Average Recall': 0.0, 'MRR': 0.14285714285714285}
File: r, Metrics: {'Average Precision': 1.0, 'Average Recall': 1.0, 'MRR': 1.0}
File: p, Metrics: {'Average Precision': 0.3333333333333333, 'Average Recall': 1.0, 'MRR': 0.3333333333333333}
File: u, Metrics: {'Average Precision': 0, 'Average Recall': 0, 'MRR': 

In [14]:
print('Macro Average (Commit/Query based)')
print(c1)

print()
print('Macro Average (File based)')
print(c2)

Macro Average (Commit/Query based)
{'P@k': 0.8333333333333333, 'MRR': 0.75, 'MAP': 0.755595238095238, 'Recall@k': 0.4666666666666667}

Macro Average (File based)
{'Average Precision': 0.21666666666666665, 'Average Recall': 0.4, 'MRR': 0.2676190476190476}


In [49]:
def demo3():
    # our MAP vs sklearn's AP
    actual_modified_files = ['a', 'b', 'c', 'd', 'e', 'f']
    retrieved_files = ['q', 'b', 'd', 'x', 'a', 'z', 'c']
    relevant = [1 if file in actual_modified_files else 0 for file in retrieved_files]
    print(f'{actual_modified_files=}')
    print(f'{retrieved_files=}')
    print(f'{relevant=}')

    # MAP
    # (1/2 + 2/3 + 3/5 + 4/7) / 4
    _map = calculate_average_precision(relevant)
    print(f'MAP = {_map}')


    print('Sklearn AP')
    print('Actually designed for binary classification, like')
    y_true = [1, 0, 0, 1]
    y_pred = [0.1, 0.5, 0.35, 0.8]

    # print confusion matrix
    # P = tp / (tp + fp)
    print(f'{y_true=}')
    print(f'{y_pred=}')
    ap = average_precision_score(y_true, y_pred)
    print(f'Sklean AP = {ap}')

    def replicate_sklearn_ap():
        p_list = []
        for i, threshold in enumerate(y_pred):
            if y_true[i] == 0:
                continue
            y_pred_binary = [1 if score >= threshold else 0 for score in y_pred]
            # print(f'{threshold=}')
            # print(f'{y_pred_binary=}')
            # print(f'{confusion_matrix(y_true, y_pred_binary)=}')
            tp = confusion_matrix(y_true, y_pred_binary)[1][1]
            fp = confusion_matrix(y_true, y_pred_binary)[0][1]
            p = tp / (tp + fp)
            print(f'{p=}')
            p_list.append(p)
        print(f'{p_list=}')
        print(f'Replicated AP = {sum(p_list) / len(p_list)=}')

    replicate_sklearn_ap()


    # Old MAP with sklearn:
    # scores = [len(retrieved_files) - i for i in range(len(retrieved_files))]
    # scores = [i for i in range(len(retrieved_files))]
    ap = average_precision_score(relevant, [1 for i in range(len(retrieved_files))])
    print(f'OLD MAP (with sklearn) = {ap}')

demo3()

actual_modified_files=['a', 'b', 'c', 'd', 'e', 'f']
retrieved_files=['q', 'b', 'd', 'x', 'a', 'z', 'c']
relevant=[0, 1, 1, 0, 1, 0, 1]
MAP = 0.5845238095238094
Sklearn AP
Actually designed for binary classification, like
y_true=[1, 0, 0, 1]
y_pred=[0.1, 0.5, 0.35, 0.8]
Sklean AP = 0.75
p=0.5
p=1.0
p_list=[0.5, 1.0]
Replicated AP = sum(p_list) / len(p_list)=0.75
OLD MAP (with sklearn) = 0.5714285714285714


In [None]:
# Example usage
    # Prepare data: Map each file to its positions in retrieved results for each query
    # file_positions = {
    #     'a': [4, -1],  # File 'a' is at position 4 in first query, not retrieved in second query
    #     'b': [1, 1],
    #     'c': [6, -1],
    #     'd': [2, -1],
    #     'e': [-1, -1],
    #     'f': [-1, -1],
    #     'p': [-1, 2],
    #     'q': [0, 5],
    #     'r': [-1, 0],
    #     'u': [-1, 3],
    #     'x': [3, -1],
    #     'z': [5, -1]
    # }