In [1]:
import os, sys, json, time, re, string, codecs, random, numpy as np, pandas as pd
#import evaluate_at_K
from tqdm.notebook import tqdm_notebook
import pickle as pkl
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [2]:
class BM25(object):
    def __init__(self, b=0.7, k1=1.6, n_gram:int = 3):
        self.n_gram = n_gram
        self.vectorizer = TfidfVectorizer(max_df=.65, min_df=1,
                                  use_idf=True, 
                                  ngram_range=(n_gram, n_gram))
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        start_time = time.perf_counter()
        print(f"Fitting tf_idf vectorizer")

        y = self.vectorizer.fit_transform(X)  # Combine fit and transform
        self.avdl = y.sum(1).mean()

        print(f"Finished tf_idf vectorizer, time : {time.perf_counter() - start_time:0.3f} sec")

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        if not q.strip():  # Handle empty queries
            return np.zeros(len(X))  

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        
        if q.nnz == 0:  # If query has no tokens in vocabulary
            return np.zeros(len(X))  

        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [3]:
# Load Data
candidate_query = pd.read_csv('data/train_candidates.csv')  # Added `.csv`
corpus = list(candidate_query["text"])  # List of candidate case texts
citation_names = list(candidate_query["id"].astype(str))  # List of case IDs (string format)

train_query = pd.read_csv('data/train_queries.csv')  # Added `.csv`
query_corpus = list(train_query["text"])  # List of query case texts
query_names = list(train_query["id"].astype(str))  # List of query case IDs (string format)

In [11]:
# Convert `relevant_candidates` into a dictionary
true_labels = {}
for index, row in train_query.iterrows():
    query_id = str(row["id"])  
    relevant_cases = row["relevant_candidates"]

    if isinstance(relevant_cases, str):  
        relevant_cases = ast.literal_eval(relevant_cases)  
    
    if isinstance(relevant_cases, float):  
        relevant_cases = []
    
    true_labels[query_id] = list(map(str, relevant_cases))  

In [4]:
# Train BM25 Model
bm25 = BM25(n_gram=3)
bm25.fit(corpus)

Fitting tf_idf vectorizer
Finished tf_idf vectorizer, time : 74.136 sec


In [5]:
from tqdm import tqdm

In [6]:
# Compute BM25 scores
bm_25_results_dict = {}
for i in tqdm(range(len (query_corpus))):
    qu = query_corpus[i]
    qu_n = query_names[i]
    doc_scores = bm25.transform(qu, corpus)
    bm_25_results_dict[qu_n] = {citation_names[j]: doc_scores[j] for j in range(len(doc_scores))}

100%|██████████| 827/827 [7:44:17<00:00, 33.69s/it]  


In [7]:
# Convert results to DataFrame
bm_25_results_list = []
for query_id, scores in bm_25_results_dict.items():
    for doc_id, score in scores.items():
        bm_25_results_list.append([query_id, doc_id, score])
bm_25_results_df = pd.DataFrame(bm_25_results_list, columns=['query_case_id', 'candidate_case_id', 'bm25_score'])


In [8]:
bm_25_results_df.to_csv("bm25_similarity_trigram.csv", index=False)

In [9]:
bm_25_results_df.head()

Unnamed: 0,query_case_id,candidate_case_id,bm25_score
0,599,262,7.663426
1,599,599,565.505087
2,599,1566,13.726064
3,599,2408,6.876886
4,599,2442,8.041049


In [5]:
bm_25_results_df = pd.read_csv("results/bm25_similarity_trigram.csv")

### Get the true labels.json file

In [9]:
true_labels_json_path = "true_labels.json"

true_labels_dict = {
    "Query Set": [{"id": qid, "relevant candidates": rel_cases} for qid, rel_cases in true_labels.items()],
    "Candidate Set": [{"id": str(doc_id)} for doc_id in bm_25_results_df["candidate_case_id"].unique()]
}

with open(true_labels_json_path, "w") as f:
    json.dump(true_labels_dict, f, indent=4)

print(f"saved ground truth labels to {true_labels_json_path}")

saved ground truth labels to true_labels.json


### Get top 10 cases for each

In [8]:
import pandas as pd

# Get the top 10 candidate_case_id for each query_case_id based on bm25_score
top_k = bm_25_results_df.groupby("query_case_id").apply(lambda x: x.nlargest(10, "bm25_score")).reset_index(drop=True)

top_k
# Save the results
#top_k.to_csv("top_10_cases.csv", index=False)

Unnamed: 0,query_case_id,candidate_case_id,bm25_score
0,599,599,565.505087
1,599,1207163,79.834238
2,599,264503,79.293065
3,599,1292810,62.289251
4,599,1296654,53.997327
...,...,...,...
8265,198095330,1569551,34.974201
8266,198095330,1921344,33.565374
8267,198095330,1619781,32.490590
8268,198095330,651925,31.206012


In [9]:
import pandas as pd 
top_k.head(10)

Unnamed: 0,query_case_id,candidate_case_id,bm25_score
0,599,599,565.505087
1,599,1207163,79.834238
2,599,264503,79.293065
3,599,1292810,62.289251
4,599,1296654,53.997327
5,599,1742283,51.809976
6,599,175790,50.351227
7,599,468835,50.349985
8,599,811367,48.402506
9,599,1694314,47.644333
