# Assignment 3

## Setup & Dataset Retrieval

In [4]:
import os.path
import json
import logging
import pickle
from typing import Tuple, Dict

import numpy as np
import matplotlib.pyplot as plt
import rank_bm25 as bm25
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

import src.utilities.tokenization as tkn
import src.utilities.scores as scr
import src.utilities.evaluation as eva

In [5]:
# Dataset download follows the beir quick-start example:
# https://github.com/beir-cellar/beir#beers-quick-example

# Show beir library logs in stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

DATASET_NAME = "trec-covid"
DATASET_URL = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{DATASET_NAME}.zip"
DATA_DIR = "../../data"

# Reuse the local copy of the dataset when it is already on disk;
# otherwise download and unzip it into DATA_DIR.
data_path = os.path.join(DATA_DIR, DATASET_NAME)
if not os.path.exists(data_path):
    data_path = util.download_and_unzip(DATASET_URL, DATA_DIR)

In [6]:
# Read the documents, the queries and the ground truth of the "test" split
# from the downloaded dataset.
# The ground truth is bound to `_` and deliberately ignored: computing our own
# ground truth is part of the assignment.
loader = GenericDataLoader(data_folder=data_path)
data: tkn.Dataset = loader.load(split="test")
documents, queries, _ = data

2023-04-25 03:32:32 - Loading Corpus...


  0%|          | 0/171332 [00:00<?, ?it/s]

2023-04-25 03:32:35 - Loaded 171332 TEST Documents.
2023-04-25 03:32:35 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

In [7]:
# Sanity check: number of documents loaded from the dataset
len(documents)

171332

## Tokenization

In [8]:
# Document & query cleaning and tokenization.
# The tokenized texts are cached as pickle files inside TOKENS_DIR.
TOKENS_DIR = os.path.join(data_path, "tokens")
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(TOKENS_DIR, exist_ok=True)

QUERY_TOKENS_PATH = os.path.join(TOKENS_DIR, "query_tokens.pkl")
DOCS_TOKENS_PATH = os.path.join(TOKENS_DIR, "doc_tokens.pkl")

In [9]:
# Load the tokenized documents from the pickle cache when it exists;
# otherwise tokenize the whole corpus and store the result in the cache.
if os.path.exists(DOCS_TOKENS_PATH):
    with open(DOCS_TOKENS_PATH, "rb") as cache_file:
        tokenized_docs: tkn.TokenizedDocuments = pickle.load(cache_file)
else:
    tokenized_docs: tkn.TokenizedDocuments = tkn.get_tokenized_documents(documents)

    with open(DOCS_TOKENS_PATH, "wb") as cache_file:
        pickle.dump(tokenized_docs, cache_file)

# Preview the first ten tokenized documents
tokenized_docs[0:10]

Tokenizing documents: 100%|██████████| 171332/171332 [13:06<00:00, 217.93it/s]


[TokenizedText(text_id='ug7v899j', tokens=['clinical', 'features', 'culture', 'proven', '##co', '##pl', '##as', '##ma', 'pneumonia', '##e', 'infections', 'king', 'abdul', '##azi', '##z', 'university', 'hospital', 'jed', '##dah', 'saudi', 'arabia', 'objective', 'retrospective', 'chart', 'review', 'describes', 'ep', '##ide', '##mi', '##ology', 'clinical', 'features', '40', 'patients', 'culture', 'proven', '##co', '##pl', '##as', '##ma', 'pneumonia', '##e', 'infections', 'king', 'abdul', '##azi', '##z', 'university', 'hospital', 'jed', '##dah', 'saudi', 'arabia', 'methods', 'patients', 'positive', 'pneumonia', '##e', 'cultures', 'respiratory', 'specimens', 'january', '1997', 'december', '1998', 'identified', 'micro', '##biology', 'records', 'charts', 'patients', 'reviewed', 'results', '40', 'patients', 'identified', '33', '82', '5', 'required', 'admission', 'infections', '92', '5', 'community', 'acquired', 'infection', 'affected', 'age', 'groups', 'common', 'infants', '32', '5', 'pre', 's

In [10]:
# Load the tokenized queries from the pickle cache when it exists;
# otherwise tokenize the queries and store the result in the cache.
if os.path.exists(QUERY_TOKENS_PATH):
    with open(QUERY_TOKENS_PATH, "rb") as cache_file:
        tokenized_queries: tkn.TokenizedQueries = pickle.load(cache_file)
else:
    tokenized_queries: tkn.TokenizedQueries = tkn.get_tokenized_queries(queries)

    with open(QUERY_TOKENS_PATH, "wb") as cache_file:
        pickle.dump(tokenized_queries, cache_file)

# Preview the first ten tokenized queries
tokenized_queries[0:10]

Tokenizing queries: 100%|██████████| 50/50 [00:00<00:00, 1509.12it/s]


[TokenizedText(text_id='1', tokens=['origin', 'co', '##vid', '19']),
 TokenizedText(text_id='2', tokens=['corona', '##virus', 'respond', 'changes', 'weather']),
 TokenizedText(text_id='3', tokens=['sar', '##s', 'co', '##v', '##2', 'infected', 'people', 'develop', 'immunity', 'cross', 'protection', 'possible']),
 TokenizedText(text_id='4', tokens=['causes', 'death', 'co', '##vid', '19']),
 TokenizedText(text_id='5', tokens=['drugs', 'active', 'sar', '##s', 'co', '##v', 'sar', '##s', 'co', '##v', '2', 'animal', 'studies']),
 TokenizedText(text_id='6', tokens=['types', 'rapid', 'testing', 'co', '##vid', '19', 'developed']),
 TokenizedText(text_id='7', tokens=['ser', '##ological', 'tests', 'detect', 'antibodies', 'corona', '##virus']),
 TokenizedText(text_id='8', tokens=['lack', 'testing', 'availability', 'led', '##re', '##port', '##ing', 'true', 'incidence', 'co', '##vid', '19']),
 TokenizedText(text_id='9', tokens=['co', '##vid', '19', 'affected', 'canada']),
 TokenizedText(text_id='10',

These two maps translate positional indices (the position of a text in the tokenized lists, and therefore in the raw numpy arrays) into the text ids used by the original dataset.
This makes it possible to retrieve the original text later on.

These index-to-ID mappings can be useful for looking up documents or queries by their index in a list or array, rather than by their text ID. For example, if you have a list of relevance scores for each query-document pair, you could use these mappings to look up the text IDs of the corresponding queries and documents based on their index in the relevance score list.

In [11]:
# Positional index -> text id, built from the order of the tokenized lists
doc_idx_to_id = dict(enumerate(doc.text_id for doc in tokenized_docs))
query_idx_to_id = dict(enumerate(query.text_id for query in tokenized_queries))

## Embeddings & Document Scores

In [12]:
# BM25 scorer built over the token lists of all the tokenized documents
BM25_SCORER = bm25.BM25Okapi(corpus=[d.tokens for d in tokenized_docs])

# Directory where the per-query score files are stored
SCORES_DIR_PATH = os.path.join(data_path, "scores")

# query_id -> ScoresPair(dense, sparse)
# Documented with a comment instead of a bare string literal: as the last
# expression of the cell, the string was evaluated and echoed as cell output.
scores_by_query_id: Dict[str, eva.ScoresPair] = {}

In [17]:
# Dense and sparse scores are cached on disk, one pair of files per query.
# If the scores directory does not exist yet, everything is computed and saved;
# otherwise the previously saved scores are loaded back.
if not os.path.exists(SCORES_DIR_PATH):
    os.mkdir(SCORES_DIR_PATH)

    # NOTE(review): assumes `queries.values()` iterates in the same order as
    # `tokenized_queries`, so that q_idx addresses the same query in both — confirm.
    for q_idx, q_text in enumerate(queries.values()):
        q_id = query_idx_to_id[q_idx]

        dense = scr.get_dense_embeddings_scores(query=q_text, docs=documents)
        sparse = scr.get_sparse_embeddings_scores(query=tokenized_queries[q_idx], scorer=BM25_SCORER)

        # Key by query id (not by positional index) so that this branch fills
        # the dictionary with the same keys as the cache-loading branch below,
        # matching the Dict[str, ...] annotation of `scores_by_query_id`.
        scores_by_query_id[q_id] = eva.ScoresPair(dense, sparse)

        np.savetxt(os.path.join(SCORES_DIR_PATH, f"dense_scores_{q_id}.np"), dense, delimiter=",")
        np.savetxt(os.path.join(SCORES_DIR_PATH, f"sparse_scores_{q_id}.np"), sparse, delimiter=",")
else:
    for q_id in queries.keys():
        sparse_path = os.path.join(SCORES_DIR_PATH, f"sparse_scores_{q_id}.np")
        dense_path = os.path.join(SCORES_DIR_PATH, f"dense_scores_{q_id}.np")

        # I might have decided to not examine all the queries to save time:
        # stop at the first query whose scores were not saved.
        if not os.path.exists(sparse_path) or not os.path.exists(dense_path):
            break

        sparse = np.genfromtxt(sparse_path, delimiter=",")
        dense = np.genfromtxt(dense_path, delimiter=",")
        scores_by_query_id[q_id] = eva.ScoresPair(dense, sparse)

## Evaluation

In [18]:
# Values of k handed to eva.get_dataset_results as `k_values`
K_VALUES = [5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000]
# NOTE(review): STEP is not referenced in the visible part of the notebook — confirm where it is used
STEP = 1

In [None]:
# Evaluation results are cached as a pickle file: they are computed and saved
# when the file is missing, and loaded from disk otherwise.
RESULTS_DIR = os.path.join(data_path, "results")
RESULTS_FILE_PATH = os.path.join(RESULTS_DIR, "results.pkl")
if not os.path.exists(RESULTS_FILE_PATH):
    # The directory may already exist without the results file (e.g. after an
    # interrupted run); exist_ok=True avoids a FileExistsError in that case.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    results_by_k: eva.ResultsByK = eva.get_dataset_results(
        scores_by_query_id=scores_by_query_id,
        k_values=K_VALUES,
        idx_to_doc_id=doc_idx_to_id
    )

    # `open` was missing here: `with (path, mode) as f` tries to use a tuple
    # as a context manager and fails at runtime.
    with open(RESULTS_FILE_PATH, "wb") as f:
        pickle.dump(results_by_k, f)
else:
    # The pickle file is a cache written by this notebook; do not point
    # RESULTS_FILE_PATH at untrusted files, since unpickling can execute code.
    with open(RESULTS_FILE_PATH, "rb") as f:
        results_by_k: eva.ResultsByK = pickle.load(f)

In [None]:
# Show the keys of the results mapping
# (presumably the evaluated k values, going by the ResultsByK name — verify)
list(results_by_k.keys())

## Template (TODO later remove)