In [8]:
import pandas as pd
import bm25s
import Stemmer


def preprocess(row: pd.Series, key: str):
    company, year, *_ = row["document_id"].split("-")
    processed = f"{company}-{year}, {" ".join(row[key].split()[:])}"
    return processed


AMOUNT_QUERIES = 3
USE_AUG = True

documents_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv")
documents_df["processed"] = documents_df.apply(lambda x: preprocess(x, "document"), axis=1)

corpus = []

if USE_AUG:
    documents_aug_df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/documents_aug.csv")

    for i, document in enumerate(documents_df["processed"]):
        for j in range(AMOUNT_QUERIES):
            document += " " + documents_aug_df["pseudo_query"][i + j]
        corpus.append(document)
else:
    corpus = documents_df["processed"].to_list()

stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

docids = documents_df["document_id"].to_list()
splits = ["train", "eval", "test"]

query_splits = {split: [] for split in splits}

for split in splits:
    query_df = pd.read_csv(f"/home/nub/Bachelor/bachelor-thesis/data/processed/{split}.csv")
    query_df["processed"] = query_df.apply(lambda x: preprocess(x, "question"), axis=1)
    queries = query_df["processed"].to_list()
    
    query_tokens = bm25s.tokenize(queries, stopwords="en", stemmer=stemmer)
    results, scores = retriever.retrieve(query_tokens, corpus=docids, k=1, n_threads=-1)
    
    correct = 0
    for hits, docid in zip(results, query_df["document_id"]):
        correct += docid in hits

    print(split, correct / len(query_df))
    query_splits[split] = results

Split strings:   0%|          | 0/2789 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/2789 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/2789 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/2789 [00:00<?, ?it/s]

Split strings:   0%|          | 0/6251 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/6251 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/6251 [00:00<?, ?it/s]

train 0.6406974884018557


Split strings:   0%|          | 0/883 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/883 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/883 [00:00<?, ?it/s]

eval 0.6149490373725934


Split strings:   0%|          | 0/1147 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1147 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1147 [00:00<?, ?it/s]

test 0.6277244986922407


In [None]:
# train 0.3199488081906895
# eval 0.3023782559456399
# test 0.3086312118570183

# train 0.6445368741001439
# eval 0.6104190260475651
# test 0.6390584132519617

In [None]:
import torch

ckpt = torch.load("/home/nub/Bachelor/bachelor-thesis/models/finqa_full_base_10/checkpoint-17/optimizer.pt", map_location="cpu", weights_only=False)
print(ckpt.keys())
for k,v in ckpt.items():
    print(k, v)

dict_keys(['state', 'param_groups'])
state {0: {'step': 17, 'state1': tensor([[224,  24,  56,  ...,  75, 198, 180],
        [203,  60, 205,  ..., 209,  38,  64],
        [ 54,  54,  61,  ...,  89, 167, 202],
        ...,
        [ 55, 219,  79,  ...,  54, 192, 208],
        [ 53, 236, 153,  ...,  68, 203, 208],
        [ 55,  92,  57,  ...,  52, 211, 195]], dtype=torch.uint8), 'qmap1': tensor([-9.9297e-01, -9.7891e-01, -9.6484e-01, -9.5078e-01, -9.3672e-01,
        -9.2266e-01, -9.0859e-01, -8.9453e-01, -8.8047e-01, -8.6641e-01,
        -8.5234e-01, -8.3828e-01, -8.2422e-01, -8.1016e-01, -7.9609e-01,
        -7.8203e-01, -7.6797e-01, -7.5391e-01, -7.3984e-01, -7.2578e-01,
        -7.1172e-01, -6.9766e-01, -6.8359e-01, -6.6953e-01, -6.5547e-01,
        -6.4141e-01, -6.2734e-01, -6.1328e-01, -5.9922e-01, -5.8516e-01,
        -5.7109e-01, -5.5703e-01, -5.4297e-01, -5.2891e-01, -5.1484e-01,
        -5.0078e-01, -4.8672e-01, -4.7266e-01, -4.5859e-01, -4.4453e-01,
        -4.3047e-01, -4.164

KeyError: 'optimizer_state_dict'

In [13]:
import numpy as np

print(np.exp(2))
print(np.exp(2 - 1))
print(np.log(3.2) + 1)
print(np.log(3.3) + 1)
print(np.log2(64))

7.38905609893065
2.718281828459045
2.163150809805681
2.1939224684724348
6.0


In [42]:
print(np.exp(2))
print(np.log(7.38905609893065) + 1)
print(np.log(7.38905609893065 * np.e))

7.38905609893065
3.0
3.0
