# Scifact Retriever

By: Alvaro, Lyzander, James

## Install packages

In [1]:
!pip install datasets
!pip install -U sentence-transformers -q
!pip install contractions
!pip install python-terrier -q
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
!pip install faiss-gpu==1.7.2

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

## Importing Necessities

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer, CrossEncoder
import string
import random
import contractions
import pyterrier as pt
from pyterrier.measures import *
from sentence_transformers import util
from sentence_transformers.util import cos_sim
from pyterrier_t5 import MonoT5ReRanker
import pickle

if not pt.java.started():
    pt.java.init()

nltk.download('stopwords')

terrier-assemblies 5.10 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f966d374950>

## Load Dataset

Dataset is received from Hugging Face (**BeIR - Scifact**)

Scifact stands for **Science Fact**

In [4]:
from huggingface_hub import notebook_login

notebook_login() # Run these if needed

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# https://huggingface.co/datasets/BeIR/scifact
corpus = load_dataset("mteb/trec-covid", "corpus")
queries = load_dataset("mteb/trec-covid", "queries")

# https://huggingface.co/datasets/BeIR/scifact-qrels
# train_qrels = load_dataset("BeIR/scifact-qrels", "train")
# test_qrels = load_dataset("BeIR/scifact-qrels", "test")
qrels = load_dataset("mteb/trec-covid", "default")

qrels

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

corpus.jsonl:   0%|          | 0.00/200M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/171332 [00:00<?, ? examples/s]

queries.jsonl:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/50 [00:00<?, ? examples/s]

qrels/test.jsonl:   0%|          | 0.00/3.83M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/66336 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 66336
    })
})

In [6]:
corpus_df = pd.DataFrame(corpus["corpus"])
queries_df = pd.DataFrame(queries['queries'])

qrels_df = pd.DataFrame(qrels["test"])

## Inspecting Data

In [7]:
corpus_df.head()

Unnamed: 0,_id,title,text
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...


In [8]:
queries_df

Unnamed: 0,_id,text
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in...
2,3,will SARS-CoV2 infected people develop immunit...
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV o...
5,6,what types of rapid testing for Covid-19 have ...
6,7,are there serological tests that detect antibo...
7,8,how has lack of testing availability led to un...
8,9,how has COVID-19 affected Canada
9,10,has social distancing had an impact on slowing...


In [9]:
qrels_df

Unnamed: 0,query-id,corpus-id,score
0,1,005b2j4b,2.0
1,1,00fmeepz,1.0
2,1,g7dhmyyo,2.0
3,1,0194oljo,1.0
4,1,021q9884,1.0
...,...,...,...
66331,50,zvop8bxh,2.0
66332,50,zwf26o63,1.0
66333,50,zwsvlnwe,0.0
66334,50,zxr01yln,1.0


## Utilities

In [10]:
def remove_nonalphanum(text):
  pattern = re.compile('[\W_]+')
  return pattern.sub(' ', text)

def sanitize_special_chars(query):
  sanitized_query = re.sub(r'[^\w\s]', '', query)
  return sanitized_query

class TextPreprocessor:
    def __init__(self, remove_stopwords=True, stem=False):
        self.remove_stopwords = remove_stopwords
        self.stem = stem
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

        self.keep_words = {'what', 'how', 'why', 'which', 'who', 'when', 'where'}
        self.stop_words = self.stop_words - self.keep_words

    def clean_text(self, text):
        text = str(text)

        # Contractions
        text = contractions.fix(text)

        text = re.sub(r'[^a-zA-Z\s]', ' ', text)

        text = ' '.join(text.split())

        return text.lower()

    def process_text(self, text):
        text = self.clean_text(text)

        tokens = word_tokenize(text, preserve_line=True)

        if self.remove_stopwords:
            tokens = [token for token in tokens if token not in self.stop_words]

        if self.stem:
            tokens = [self.stemmer.stem(token) for token in tokens]

        return ' '.join(tokens)

class PreProcessor:
  def __init__(self, df, text_preprocessor):
    self.df = df
    self.text_preprocessor = text_preprocessor

  def get_df(self):
    return self.df

  def combine_title_text(self, row, title_weight=1.5):
      title = row['title'] if isinstance(row['title'], str) else ''
      text = row['text'] if isinstance(row['text'], str) else ''

      weighted_title = ' '.join([title] * int(title_weight))

      return f"{weighted_title} {text}".strip()

  def preprocess_documents(self):
    self.df['text'] = self.df.apply(self.combine_title_text, axis=1)
    self.df["text_raw"] = self.df["text"]

    self.df['text'] = self.df['text'].apply(lambda txt: txt.lower() if isinstance(txt, str) else txt)

  def change_column_name(self, old_label, new_label):
    self.df.rename(columns={old_label: new_label}, inplace = True)

  def dynamic_apply_column(self, column_name, apply_method):
    if column_name in self.df.columns:
        self.df[column_name] = self.df[column_name].apply(apply_method)
    else:
        raise ValueError(f"Column '{column_name}' does not exist in the dataframe.")

  def add_new_attribute(self, column_name, old_column_name):
    self.df[column_name] = self.df[old_column_name]

## Augment Data

### Preprocessor

In [11]:
text_preprocessor = TextPreprocessor(False, False)

# For corpus
corpus_preprocessor = PreProcessor(corpus_df, text_preprocessor)

# For qrels
qrels_preprocessor = PreProcessor(qrels_df, text_preprocessor)

# For queries
queries_preprocessor = PreProcessor(queries_df, text_preprocessor)

### Corpus

In [12]:
# Augment data for corpus
corpus_preprocessor.preprocess_documents()
corpus_preprocessor.change_column_name("_id", "docno")

processed_corpus = corpus_preprocessor.get_df()

In [13]:
# Inspect, some title don't have any content in it.

processed_corpus[processed_corpus["docno"] == "7e8r61e7"]

Unnamed: 0,docno,title,text,text_raw
171327,7e8r61e7,Can Pediatric COVID-19 Testing Sensitivity Be ...,can pediatric covid-19 testing sensitivity be ...,Can Pediatric COVID-19 Testing Sensitivity Be ...


### Qrels

In [14]:
qrels_preprocessor.change_column_name("query-id", "qid")
qrels_preprocessor.change_column_name("corpus-id", "docno")
qrels_preprocessor.change_column_name("score", "label")

qrels_preprocessor.dynamic_apply_column("label", lambda x: 1 if x >= 1 else 0)

processed_qrels = qrels_preprocessor.get_df()

In [15]:
# Inspect

processed_qrels.head()

Unnamed: 0,qid,docno,label
0,1,005b2j4b,1
1,1,00fmeepz,1
2,1,g7dhmyyo,1
3,1,0194oljo,1
4,1,021q9884,1


### Queries

In [16]:
queries_preprocessor.change_column_name("_id", "qid")
queries_preprocessor.change_column_name("text", "query")

queries_preprocessor.add_new_attribute("query_raw", "query")
queries_preprocessor.dynamic_apply_column("query", lambda txt: txt.lower() if isinstance(txt, str) else txt)

# Apparently, pyterrier doesn't support special characters in query, hence we need to escape it
# for example: "will sars-cov2 infected people develop immunity? is cross protection possible?"
# will be: "will sars-cov2 infected people develop immunity\? is cross protection possible\?"

queries_preprocessor.dynamic_apply_column("query", sanitize_special_chars)
queries_preprocessor.dynamic_apply_column("query_raw", sanitize_special_chars)

processed_queries = queries_preprocessor.get_df()

In [17]:
# Inspect

processed_queries.head()

Unnamed: 0,qid,query,query_raw
0,1,what is the origin of covid19,what is the origin of COVID19
1,2,how does the coronavirus respond to changes in...,how does the coronavirus respond to changes in...
2,3,will sarscov2 infected people develop immunity...,will SARSCoV2 infected people develop immunity...
3,4,what causes death from covid19,what causes death from Covid19
4,5,what drugs have been active against sarscov or...,what drugs have been active against SARSCoV or...


## Indexing

In [18]:
!rm -rf ./index

import os

index_path = os.path.abspath("./index/pyterrier")
iter_indexer = pt.IterDictIndexer(index_path, meta={"docno": 26, "text": 2048, "text_raw": 2048})

index_ref = iter_indexer.index(processed_corpus.to_dict(orient="records"))

index = pt.IndexFactory.of(index_ref)

20:22:53.709 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer -- Indexed 17 empty documents


In [19]:
CUT_OFF = 30

## BM25

In [20]:
bm25 = pt.terrier.Retriever(index, wmodel="BM25")

# For now let's use the same apporach as TPK4

## BM25 >> BiEncoder

In [21]:
bimodel = SentenceTransformer('msmarco-distilbert-base-tas-b')
BATCH_SIZE = 128

def _biencoder_apply(dataframe):
    query_embs = bimodel.encode(dataframe['query_raw'].values)
    doc_embs = bimodel.encode(dataframe['text_raw'].values)
    scores = cos_sim(query_embs, doc_embs)
    return scores[0]

bi_encT = pt.apply.doc_score(_biencoder_apply, batch_size=BATCH_SIZE)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
bm25_biencoder = (
    bm25 % CUT_OFF
    >> pt.text.get_text(index, "text_raw")
    >> bi_encT
)

## BM25 >> Cross Encoder

In [23]:
crossmodel = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)

def _crossencoder_apply(dataframe):
    return crossmodel.predict(
        list(zip(dataframe['query_raw'].values, dataframe['text_raw'].values))
    )

cross_encT = pt.apply.doc_score(_crossencoder_apply, batch_size=BATCH_SIZE)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [24]:
bm25_crossencoder = (
    bm25 % CUT_OFF
    >> pt.text.get_text(index, "text_raw")
    >> cross_encT
)

## BiEncoder >> CrossEncoder

In [25]:
biencoder_crossencoder = bm25 >> pt.text.get_text(index, "text_raw") >> bi_encT % CUT_OFF >> pt.text.get_text(index, ["text_raw"]) >> cross_encT

## BM25 >> MonoT5

In [26]:
mono_t5 = MonoT5ReRanker("castorini/monot5-base-msmarco", text_field = "text_raw")

tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [27]:
bm25_monot5 = (bm25 % CUT_OFF) >> pt.text.get_text(index, "text_raw") >> mono_t5

## BiEncoder >> MonoT5

In [28]:
biencoder_monot5 = bm25 >> pt.text.get_text(index, "text_raw") >> bi_encT % CUT_OFF >> pt.text.get_text(index, ["text_raw"]) >> mono_t5

## CrossEncoder >> MonoT5

In [29]:
crossencoder_monot5 = bm25 >> pt.text.get_text(index, "text_raw") >> cross_encT % CUT_OFF >> pt.text.get_text(index, ["text_raw"]) >> mono_t5

## BiEncoder >> CrossEncoder >> MonoT5

In [30]:
biencoder_crossencoder_monot5 = bm25 >> pt.text.get_text(index, "text_raw") >> bi_encT % CUT_OFF >> pt.text.get_text(index, ["text_raw"]) >> cross_encT >> pt.text.get_text(index, ["text_raw"]) >> mono_t5

## Experiment

In [31]:
eval_metrics=[P@10, "map", nDCG@10]

pt.Experiment(
    [bm25, bm25_biencoder, bm25_crossencoder, biencoder_crossencoder, bm25_monot5, biencoder_monot5, crossencoder_monot5, biencoder_crossencoder_monot5],
    processed_queries,
    processed_qrels,
    eval_metrics=eval_metrics,
    names=["BM25", "BM25 >> Bi-encoder", "BM25 >> Cross-encoder", "Bi-encoder >> Cross-encoder", "BM25 >> MonoT5", "Bi-encoder >> MonoT5", "Cross-encoder >> MonoT5", "Bi-encoder >> Cross-encoder >> MonoT5"],
    baseline=0
)

monoT5:   0%|          | 0/375 [00:00<?, ?batches/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Token indices sequence length is longer than the specified maximum sequence length for this model (791 > 512). Running this sequence through the model will result in indexing errors


monoT5:   0%|          | 0/375 [00:00<?, ?batches/s]

monoT5:   0%|          | 0/375 [00:00<?, ?batches/s]

monoT5:   0%|          | 0/375 [00:00<?, ?batches/s]

Unnamed: 0,name,map,P@10,nDCG@10,map +,map -,map p-value,P@10 +,P@10 -,P@10 p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,BM25,0.134367,0.434,0.435145,,,,,,,,,
1,BM25 >> Bi-encoder,0.021139,0.49,0.480231,1.0,49.0,2.92637e-09,21.0,12.0,0.06542033,22.0,22.0,0.1956452
2,BM25 >> Cross-encoder,0.024452,0.59,0.616905,1.0,49.0,4.159683e-09,34.0,5.0,1.650589e-07,39.0,5.0,1.093098e-07
3,Bi-encoder >> Cross-encoder,0.025877,0.668,0.693668,5.0,45.0,1.972121e-08,36.0,8.0,1.680865e-06,38.0,11.0,4.539252e-07
4,BM25 >> MonoT5,0.024869,0.62,0.647212,2.0,48.0,4.85027e-09,35.0,3.0,2.193784e-07,39.0,6.0,2.344388e-07
5,Bi-encoder >> MonoT5,0.026146,0.664,0.705318,5.0,45.0,2.257208e-08,38.0,8.0,2.108664e-06,40.0,9.0,3.241663e-07
6,Cross-encoder >> MonoT5,0.044196,0.812,0.829669,14.0,36.0,1.602329e-06,43.0,5.0,9.466084e-10,45.0,4.0,3.159294e-10
7,Bi-encoder >> Cross-encoder >> MonoT5,0.026146,0.664,0.705318,5.0,45.0,2.257208e-08,38.0,8.0,2.108664e-06,40.0,9.0,3.241663e-07


## Save the model

In [34]:
pipeline_config = {
    'cut_off': CUT_OFF,
    'index_path': index_path,
    'components': {
        'bm25': {
            'type': 'BM25',
            'metadata': ['text']
        },
        'cross_encoder': {
            'model_name': 'cross-encoder/ms-marco-MiniLM-L-6-v2', # TLDR: just change these, with the model that we save later (for now use pre-trained model)
            'max_length': 512
        },
        'mono_t5': {
            'model_name': 'castorini/monot5-base-msmarco',
            'text_field': "text_raw"
        }
    }
}

In [35]:
with open('crossencoder_monot5_config.pkl', 'wb') as f:
    pickle.dump(pipeline_config, f)

## Download the index file

In [40]:
!zip -r /content/index.zip /content/index

  adding: content/index/ (stored 0%)
  adding: content/index/pyterrier/ (stored 0%)
  adding: content/index/pyterrier/data.lexicon.fsomapfile (deflated 82%)
  adding: content/index/pyterrier/data.meta.idx (deflated 55%)
  adding: content/index/pyterrier/data.meta-0.fsomapfile (deflated 88%)
  adding: content/index/pyterrier/data.document.fsarrayfile (deflated 64%)
  adding: content/index/pyterrier/data.properties (deflated 79%)
  adding: content/index/pyterrier/data.meta.zdata (deflated 1%)
  adding: content/index/pyterrier/data.lexicon.fsomaphash (deflated 55%)
  adding: content/index/pyterrier/data.lexicon.fsomapid (deflated 30%)
  adding: content/index/pyterrier/data.inverted.bf (deflated 14%)
  adding: content/index/pyterrier/data.direct.bf (deflated 13%)


In [41]:
from google.colab import files
files.download("/content/index.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
queries = [{"qid": "1", "query": "when does covid happen?"}]

results = crossencoder_monot5.transform(queries_df)

results.head()