In [None]:
import pandas as pd
import bm25s
import Stemmer


def preprocess(row: pd.Series, key: str) -> str:
    """Build the text that is handed to the BM25 tokenizer for one row.

    ``row["document_id"]`` is split on ``/`` into exactly three parts; the
    first two are used as ``company`` and ``year`` and the third is ignored.
    The text stored under ``key`` is whitespace-normalised (every run of
    whitespace collapses to a single space, leading/trailing whitespace is
    dropped) and appended after a ``"<company> in <year>, "`` prefix.

    Args:
        row: Mapping-like row providing ``document_id`` and a string under ``key``.
        key: Name of the entry holding the text to normalise.

    Returns:
        The prefixed, whitespace-normalised text.

    Raises:
        ValueError: If ``document_id`` does not consist of exactly three
            ``/``-separated parts (tuple unpacking fails).
    """
    company, year, _ = row["document_id"].split("/")
    # Join outside the f-string: reusing double quotes inside an f-string
    # replacement field is a SyntaxError on Python < 3.12.
    text = " ".join(row[key].split())
    return f"{company} in {year}, {text}"


# Directory holding the processed CSV files (documents.csv plus one <split>.csv per split).
# NOTE(review): absolute, machine-specific path -- adjust when running on another machine.
DATA_DIR = "/home/nub/Bachelor/bachelor-thesis/data/processed"
# Number of documents retrieved per query; the accuracy printed below is top-TOP_K accuracy.
TOP_K = 1

documents_df = pd.read_csv(f"{DATA_DIR}/documents.csv")
documents_df["processed"] = documents_df.apply(lambda x: preprocess(x, "document"), axis=1)
corpus = documents_df["processed"].to_list()

stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Document ids in the same row order as `corpus`; handed to `retrieve` so that
# `results` holds document ids, which the membership check below relies on.
docids = documents_df["document_id"].to_list()
splits = ["train", "eval", "test"]

# Retrieved document ids per split, filled in by the loop below.
query_splits = {split: [] for split in splits}

for split in splits:
    query_df = pd.read_csv(f"{DATA_DIR}/{split}.csv")
    query_df["processed"] = query_df.apply(lambda x: preprocess(x, "question"), axis=1)
    queries = query_df["processed"].to_list()

    # Queries go through the same stopword removal and stemming as the corpus.
    query_tokens = bm25s.tokenize(queries, stopwords="en", stemmer=stemmer)
    results, scores = retriever.retrieve(query_tokens, corpus=docids, k=TOP_K, n_threads=-1)

    # A query counts as correct when its gold document id is among the retrieved ids.
    correct = 0
    for hits, docid in zip(results, query_df["document_id"]):
        correct += docid in hits

    print(split, correct / len(query_df))
    query_splits[split] = results

Split strings:   0%|          | 0/2789 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/2789 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/2789 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/2789 [00:00<?, ?it/s]

Split strings:   0%|          | 0/6251 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/6251 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/6251 [00:00<?, ?it/s]

train 0.2626779715245561


Split strings:   0%|          | 0/883 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/883 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/883 [00:00<?, ?it/s]

eval 0.2627406568516421


Split strings:   0%|          | 0/1147 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1147 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1147 [00:00<?, ?it/s]

test 0.2728857890148213


In [None]:
# train 0.3199488081906895
# eval 0.3023782559456399
# test 0.3086312118570183

# train 0.6445368741001439
# eval 0.6104190260475651
# test 0.6390584132519617