**Progetto - Information Retrieval**

Vincenzo Pallini - 907303

Effettueremo il pre-processing del dataset "Antique" (una raccolta di domande e risposte di Yahoo! Answers); successivamente svilupperemo un search engine che restituirà i documenti più rilevanti rispetto alla query inserita.
Infine, effettueremo il re-ranking dei documenti più rilevanti con KNRM e BERT.



In [None]:
pip install python-terrier

Collecting python-terrier
  Downloading python-terrier-0.10.0.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (from pyt

Download della versione TEST del dataset Antique

In [None]:
import os
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet in this session.
if not pt.started():
    pt.init()

# Handle to the test split of the ANTIQUE collection ('irds:' dataset id).
antique_test = pt.datasets.get_dataset('irds:antique/test')

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



PREPROCESSING

In [None]:
import subprocess
import re
import string
import nltk
from nltk.tokenize import WordPunctTokenizer
from itertools import islice
from tqdm import tqdm
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources requested by this notebook.
nltk.download('punkt')
nltk.download('stopwords')

# Guarded re-initialisation: only runs when PyTerrier is not already started.
if not pt.started():
    pt.init()

# English stop-word list from NLTK; read by remove_stopwords() below.
stop_words = nltk.corpus.stopwords.words('english')

def remove_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into one space character."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def toLowerCase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Delete every run of digits from ``text`` (nothing is inserted in its place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_symbols(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Removed characters are deleted outright, so the characters on either side
    of a removed symbol end up adjacent.
    """
    not_alnum_or_space = re.compile(r'[^a-zA-Z0-9\s]')
    return not_alnum_or_space.sub('', text)

def remove_punctuation(text, punctuation=None):
    """Replace each punctuation character in ``text`` with a single space.

    Args:
        text: the string to clean.
        punctuation: characters to treat as punctuation; defaults to
            ``string.punctuation`` when ``None``.

    Returns:
        A string of the same length as ``text`` with the chosen characters
        turned into spaces.
    """
    chars = string.punctuation if punctuation is None else punctuation
    to_space = str.maketrans(chars, ' ' * len(chars))
    return text.translate(to_space)

# Compiled once at definition time: remove_emojis() is called for every
# document, so the pattern should not be reassembled on each call.
# Reference: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# NOTE: the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are much wider than the
# emoji blocks (they also cover e.g. CJK text). They are kept exactly as in the
# referenced gist so the set of removed characters does not change.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with every character matched by ``_EMOJI_PATTERN`` deleted.

    Matched runs are removed without inserting a replacement, so surrounding
    characters become adjacent.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def remove_repeated(text):
    """Shorten repeated characters and run TextBlob spelling correction.

    Every run of two or more identical characters is reduced to exactly two
    occurrences, the result is passed through ``TextBlob(...).correct()``, and
    the corrected text is returned prefixed with a tab character.
    """
    squeezed = re.compile(r'(.)\1+').sub(r'\1\1', text)
    corrected = TextBlob(squeezed).correct()
    return '\t' + str(corrected)

stemmer = PorterStemmer()

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``WordPunctTokenizer``."""
    return WordPunctTokenizer().tokenize(text)

def remove_stopwords(text, stopwords=None):
    """Return the tokens of ``text`` whose lower-cased form is not a stop word.

    Args:
        text: iterable of string tokens (despite the name, not a raw string).
        stopwords: iterable of lower-case stop words; when ``None`` the
            module-level ``stop_words`` list is used.

    Returns:
        A new list with the surviving tokens in their original order and
        original casing.
    """
    if stopwords is None:
        stopwords = stop_words
    # Hash-based membership instead of scanning the stop-word list per token.
    blocked = frozenset(stopwords)
    return [token for token in text if token.lower() not in blocked]

# Stem a tokenised document with the module-level NLTK Porter stemmer.
def stem(text):
    """Return the stem of every token in ``text`` as a list, in the same order.

    The tokens are not joined back into a string.
    """
    return list(map(stemmer.stem, text))

#def remove_urls(text):
#  return re.sub(r'https?://\S+|www\.\S+', '', text)


def preprocess_text(text):
    """Clean one raw document and return its list of non-stop-word tokens.

    Steps, in order: whitespace collapsing, lower-casing, digit removal,
    punctuation -> space, emoji removal, removal of any remaining
    non-alphanumeric character, tokenisation, stop-word filtering.
    remove_repeated() (spelling correction) and stem() stay disabled, as in
    the original pipeline.
    """
    text = remove_whitespaces(text)
    text = toLowerCase(text)
    text = remove_numbers(text)
    # Punctuation has to become spaces BEFORE remove_symbols() runs:
    # remove_symbols() deletes every non-alphanumeric character, so running it
    # first glues neighbouring words together ("what's" -> "whats",
    # "rock/pop" -> "rockpop") and leaves nothing for the next two steps to do.
    text = remove_punctuation(text)
    text = remove_emojis(text)
    text = remove_symbols(text)
    return remove_stopwords(tokenize(text))


# Pre-processed copy of the corpus: same documents, with 'text' replaced by
# the token list produced by preprocess_text().
new_corpus = []
for doc in antique_test.get_corpus_iter():
    doc['text'] = preprocess_text(doc['text'])
    new_corpus.append(doc)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


antique/test documents:   0%|          | 0/403666 [00:00<?, ?it/s]

[INFO] Please confirm you agree to the authors' data usage agreement found at <https://ciir.cs.umass.edu/downloads/Antique/readme.txt>
[INFO] If you have a local copy of https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/684f7015aff377062a758e478476aac8
[INFO] [starting] https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt

https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.0%| 0.00/93.6M [00:00<?, ?B/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.0%| 41.0k/93.6M [00:00<05:53, 265kB/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.2%| 156k/93.6M [00:00<03:07, 499kB/s] [A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 0.6%| 565k/93.6M [00:00<01:08, 1.37MB/s][A
https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt: 1.4%| 1.34M/93.6M [00:00<00:37, 2.44MB/s][A
https:

INDEXING

In [None]:
pt_index_path = "./terrier_antique"
properties_file = pt_index_path + "/data.properties"

if os.path.exists(properties_file):
    # An index was already written to this directory: just reference it.
    index_ref = pt.IndexRef.of(properties_file)
else:
    # No index on disk yet: build it from the pre-processed corpus, indexing
    # the 'text' field and keeping 'text' and 'docno' as document metadata.
    indexer = pt.index.IterDictIndexer(pt_index_path)
    index_ref = indexer.index(
        new_corpus,
        fields=('text',),
        meta=('text', 'docno'),
    )

# Open the index so that statistics and the lexicon can be inspected.
index2 = pt.IndexFactory.of(index_ref)

  index_ref = indexer.index(new_corpus,


15:15:23.588 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3501 empty documents


COLLECTION STATISTICS

In [None]:
print(index2.getCollectionStatistics().toString())

Number of documents: 403666
Number of terms: 207867
Number of postings: 5789456
Number of fields: 1
Number of tokens: 6957297
Field names: [text]
Positions:   false



In [None]:
words = ["free", "ask", "why", "what"]

# Look the lexicon and the collection token count up once, then report the
# relative frequency of each probe term.
lexicon = index2.getLexicon()
total = index2.getCollectionStatistics().getNumberOfTokens()

for w in words:
    if w not in lexicon:
        print(f'term {w} not in the index')
        continue
    freq = lexicon[w].getFrequency()
    print(f'term {w}: {freq}/{total}={freq/total}')

term free: 6215/6957297=0.0008933066965518361
term ask: 17583/6957297=0.0025272746010411803
term why: 26/6957297=3.7370835253978662e-06
term what: 2211/6957297=0.00031779583364056473


In [None]:
index2.getLexicon()["israel"].toString()

'term1418 Nt=910 TF=1569 maxTF=2147483647 @{0 5519232 6} TFf=1569'

SEARCH ENGINES (TF_IDF, BM25)

In [None]:
# Retriever over index2 using the TF_IDF weighting model; print its first
# five results for a sample query.
tfidf = pt.BatchRetrieve(index2, wmodel="TF_IDF")
print(tfidf.search('+israel').head(5))

  qid   docid      docno  rank     score    query
0   1   93546  1544892_4     0  8.780097  +israel
1   1  128262  2174097_0     1  8.763924  +israel
2   1  313072  4009854_2     2  8.138918  +israel
3   1   82102  3705248_1     3  8.064638  +israel
4   1  362941    80655_3     4  7.920074  +israel


In [None]:
# Same sample query as above, this time scored with the BM25 weighting model.
bm25 = pt.BatchRetrieve(index2, wmodel="BM25")
print(bm25.search('+israel').head(5))

  qid   docid      docno  rank      score    query
0   1   93546  1544892_4     0  16.083493  +israel
1   1  128262  2174097_0     1  16.053869  +israel
2   1  313072  4009854_2     2  14.908974  +israel
3   1   82102  3705248_1     3  14.772908  +israel
4   1  362941    80655_3     4  14.508093  +israel


In [None]:
# docno -> pre-processed 'text' value, used to inspect retrieved documents.
id_to_text = {x['docno']: x['text'] for x in new_corpus}
display(tfidf.search('israel').head(5))

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,93546,1544892_4,0,8.780097,israel
1,1,128262,2174097_0,1,8.763924,israel
2,1,313072,4009854_2,2,8.138918,israel
3,1,82102,3705248_1,3,8.064638,israel
4,1,362941,80655_3,4,7.920074,israel


In [None]:
id_to_text['1544892_4']

['israel', 'us', 'us', 'israel']

EVALUATION

In [None]:
from pyterrier.measures import *

# Topics of the test split, taken from the 'text' variant.
adhoc = antique_test.get_topics(variant='text')

# Compare the two lexical retrievers on the same topics and qrels.
retrievers = [tfidf, bm25]
measures = [AP@100, P@5, P@10]
results = pt.Experiment(
    retrievers,
    adhoc,
    antique_test.get_qrels(),
    eval_metrics=measures,
)
display(results)

[INFO] [starting] https://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt
[INFO] [finished] https://ciir.cs.umass.edu/downloads/Antique/antique-test-queries.txt: [00:00] [11.4kB] [36.1MB/s]
[INFO] [starting] https://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel
[INFO] [finished] https://ciir.cs.umass.edu/downloads/Antique/antique-test.qrel: [00:00] [150kB] [633kB/s]


Unnamed: 0,name,AP@100,P@5,P@10
0,BR(TF_IDF),0.403388,0.796,0.709
1,BR(BM25),0.40404,0.799,0.709


In [None]:
# Run TF-IDF over all test topics and evaluate that single result set.
res = tfidf.transform(adhoc[['qid', 'query']])
# pt.Evaluate replaces the deprecated pt.Utils.evaluate (which emitted a
# warning when this cell ran); arguments are unchanged.
eval_res = pt.Evaluate(res, antique_test.get_qrels(), metrics=[AP@100, P@5, P@10])
display(eval_res)

  eval_res = pt.Utils.evaluate(res, antique_test.get_qrels(), metrics=[AP@100, P@5, P@10])


{'AP@100': 0.40338804703690867, 'P@5': 0.796, 'P@10': 0.7090000000000002}

SEARCH ENGINE NEURALE

In [None]:
import os

#Pyterrier
!pip install --upgrade -q python-terrier

#Libraries that support Neural IR models
!pip install --upgrade -q git+https://github.com/Georgetown-IR-Lab/OpenNIR
!pip install --upgrade -q git+https://github.com/terrierteam/pyterrier_t5

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.3/114.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.1/158.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#!pip install onir

PREPARAZIONE DEL RE-RANKER KNRM

In [None]:
# Initialise PyTerrier; skipped when an earlier cell has already started it.
import pyterrier as pt
if not pt.started():
    pt.init(tqdm='notebook')

# OpenNIR bindings for PyTerrier: neural rankers and re-rankers.
import onir_pt

# text_field names the result column holding the document text. The pipelines
# in this notebook attach it with pt.text.get_text(antique_test, 'text'), so
# the column is 'text' (it was 'abstract', which no cell here produces).
knrm = onir_pt.reranker('knrm', 'wordvec_hash', text_field='text')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
config file not found: config
[02;37m[2024-01-22 15:16:53,759][WordvecHashVocab][DEBUG] [0m[37m[starting] downloading https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip[0m




[02;37m[2024-01-22 15:17:08,522][onir.util.download][DEBUG] [0m[37mdownloaded https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip [14.40s] [682M] [48.7MB/s][0m
[02;37m[2024-01-22 15:17:08,530][WordvecHashVocab][DEBUG] [0m[37m[finished] downloading https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip [14.77s][0m
[02;37m[2024-01-22 15:17:08,530][WordvecHashVocab][DEBUG] [0m[37m[starting] extracting vecs[0m
[02;37m[2024-01-22 15:17:26,691][WordvecHashVocab][DEBUG] [0m[37m[finished] extracting vecs [18.16s][0m
[02;37m[2024-01-22 15:17:26,693][WordvecHashVocab][DEBUG] [0m[37m[starting] loading vecs into memory[0m
[02;37m[2024-01-22 15:19:56,589][WordvecHashVocab][DEBUG] [0m[37m[finished] loading vecs into memory [02:30][0m
[02;37m[2024-01-22 15:19:57,049][WordvecHashVocab][DEBUG] [0m[37m[starting] writing cached at /root/data/onir/vocab/wordvec_hash/fasttext-wiki-news-300d-1M.p[0m
[02;37m[2024-01-22 15:2

K-NRM

In [None]:
# First-stage retriever: BM25 with a rank cutoff of 100 results per query.
br = pt.BatchRetrieve(index2, wmodel='BM25') % 100
# KNRM re-ranker reading the document text from the 'text' column.
# NOTE(review): no fit/training call is visible in this notebook, so this
# presumably runs with untrained weights - confirm.
knrm = onir_pt.reranker('knrm', 'wordvec_hash', text_field='text')

# The re-ranking pipeline itself is assembled in a later cell.


[02;37m[2024-01-22 15:20:30,625][WordvecHashVocab][DEBUG] [0m[37m[starting] reading cached at /root/data/onir/vocab/wordvec_hash/fasttext-wiki-news-300d-1M.p[0m
[02;37m[2024-01-22 15:20:40,242][WordvecHashVocab][DEBUG] [0m[37m[finished] reading cached at /root/data/onir/vocab/wordvec_hash/fasttext-wiki-news-300d-1M.p [9.62s][0m


In [None]:
br.search('israel')

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,93546,1544892_4,0,16.083493,israel
1,1,128262,2174097_0,1,16.053869,israel
2,1,313072,4009854_2,2,14.908974,israel
3,1,82102,3705248_1,3,14.772908,israel
4,1,362941,80655_3,4,14.508093,israel
...,...,...,...,...,...,...
95,1,284193,765133_8,95,12.181057,israel
96,1,302778,4388827_1,96,12.164056,israel
97,1,93869,3194779_0,97,12.131498,israel
98,1,126554,3365553_4,98,12.131498,israel


In [None]:
topics = antique_test.get_topics(variant='text')

In [None]:
# BM25 candidates -> attach the document 'text' from the dataset -> KNRM re-scoring.
pipeline = br >> pt.text.get_text(antique_test, 'text') >> knrm

Comparazione tra BM25 e KNRM

In [None]:
# Evaluate plain BM25 against BM25 re-ranked by KNRM on the test topics,
# reporting MAP, nDCG and precision at 10.
pt.Experiment(
    [br, pipeline],
    topics,
    antique_test.get_qrels(),
    names=['BM25', 'BM25 >> KNRM'],
    eval_metrics=["map", 'ndcg', 'P.10']
)

[INFO] [starting] building docstore
docs_iter: 100%|██████████████████████| 403666/403666 [4.06s<0ms, 99438.53doc/s]
[INFO] [finished] docs_iter: [4.06s] [403666doc] [99411.17doc/s]
[INFO] [finished] building docstore [4.07s]


[02;37m[2024-01-22 15:21:15,594][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:21:16,234][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [8ms<?, ?it/s]

[02;37m[2024-01-22 15:21:24,862][onir_pt][DEBUG] [0m[37m[finished] batches: [8.63s] [4831it] [560.02it/s][0m


Unnamed: 0,name,map,ndcg,P.10
0,BM25,0.403895,0.528329,0.709
1,BM25 >> KNRM,0.138719,0.335346,0.1365


Verranno utilizzate diverse pipeline pesate, combinando un modello preaddestrato (BM25) e un modello tunato (KNRM) con pesi variabili.

I risultati verranno valutati utilizzando diverse metriche di valutazione.

In [None]:
# Interpolation weights for the two scorers (they add up to 1).
br_weight = .9
knrm_weight = .1

# Append PerQueryMaxMinScoreTransformer to each pipeline so that the scores
# are normalised per query before they are added together.
normalized_br = br >> pt.pipelines.PerQueryMaxMinScoreTransformer()
normalized_pipeline = pipeline >> pt.pipelines.PerQueryMaxMinScoreTransformer()

# Weighted linear combination of the normalised KNRM and BM25 scores.
sum_pipeline = knrm_weight*normalized_pipeline + br_weight*normalized_br

In [None]:
import numpy as np

# Nine fixed blends of normalised KNRM and BM25 scores; the number in the name
# is the KNRM weight in tenths. These are the ones evaluated in the next cell.
sum_pipeline_1 = .1*normalized_pipeline + .9*normalized_br
sum_pipeline_2 = .2*normalized_pipeline + .8*normalized_br
sum_pipeline_3 = .3*normalized_pipeline + .7*normalized_br
sum_pipeline_4 = .4*normalized_pipeline + .6*normalized_br
sum_pipeline_5 = .5*normalized_pipeline + .5*normalized_br
sum_pipeline_6 = .6*normalized_pipeline + .4*normalized_br
sum_pipeline_7 = .7*normalized_pipeline + .3*normalized_br
sum_pipeline_8 = .8*normalized_pipeline + .2*normalized_br
sum_pipeline_9 = .9*normalized_pipeline + .1*normalized_br

# Finer sweep: 100 evenly spaced KNRM weights between 0 and 1.
# Each weight is converted to a plain Python float first. With a NumPy scalar
# on the left of `*`, NumPy's own multiplication is tried before the
# transformer's operator overload, which is what triggered the warning on the
# original loop line.
sum_pipelines = [
    float(w) * normalized_pipeline + (1 - float(w)) * normalized_br
    for w in np.linspace(0, 1, 100)
]

  sum_pipelines.append(i*normalized_pipeline + (1-i)*normalized_br)


In [None]:
# Evaluate BM25, BM25 >> KNRM and the nine weighted sums side by side.
weighted_sums = [
    sum_pipeline_1,
    sum_pipeline_2,
    sum_pipeline_3,
    sum_pipeline_4,
    sum_pipeline_5,
    sum_pipeline_6,
    sum_pipeline_7,
    sum_pipeline_8,
    sum_pipeline_9,
]
# One label per weighted sum, numbered like the variables above.
weighted_names = [f'Weighted SUM .{tenth}' for tenth in range(1, 10)]

pt.Experiment(
    [br, pipeline, *weighted_sums],
    topics,
    antique_test.get_qrels(),
    names=['BM25', 'BM25 >> KNRM', *weighted_names],
    eval_metrics=["map", 'ndcg', 'P.10'],
)

[02;37m[2024-01-22 15:21:45,030][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:21:45,031][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:21:51,854][onir_pt][DEBUG] [0m[37m[finished] batches: [6.82s] [4831it] [708.22it/s][0m
[02;37m[2024-01-22 15:22:02,070][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:22:02,071][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [53ms<?, ?it/s]

[02;37m[2024-01-22 15:22:09,735][onir_pt][DEBUG] [0m[37m[finished] batches: [7.66s] [4831it] [630.41it/s][0m
[02;37m[2024-01-22 15:22:29,040][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:22:29,042][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:22:36,574][onir_pt][DEBUG] [0m[37m[finished] batches: [7.53s] [4831it] [641.45it/s][0m
[02;37m[2024-01-22 15:22:55,673][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:22:55,674][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:23:03,061][onir_pt][DEBUG] [0m[37m[finished] batches: [7.39s] [4831it] [654.10it/s][0m
[02;37m[2024-01-22 15:23:21,802][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:23:21,804][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:23:29,428][onir_pt][DEBUG] [0m[37m[finished] batches: [7.62s] [4831it] [633.73it/s][0m
[02;37m[2024-01-22 15:23:48,317][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:23:48,319][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:23:55,802][onir_pt][DEBUG] [0m[37m[finished] batches: [7.48s] [4831it] [645.61it/s][0m
[02;37m[2024-01-22 15:24:14,186][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:24:14,188][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:24:21,712][onir_pt][DEBUG] [0m[37m[finished] batches: [7.52s] [4831it] [642.14it/s][0m
[02;37m[2024-01-22 15:24:41,145][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:24:41,147][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [18ms<?, ?it/s]

[02;37m[2024-01-22 15:24:48,589][onir_pt][DEBUG] [0m[37m[finished] batches: [7.44s] [4831it] [649.22it/s][0m
[02;37m[2024-01-22 15:25:07,182][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:25:07,183][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:25:14,610][onir_pt][DEBUG] [0m[37m[finished] batches: [7.43s] [4831it] [650.53it/s][0m
[02;37m[2024-01-22 15:25:33,252][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:25:33,253][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:25:40,715][onir_pt][DEBUG] [0m[37m[finished] batches: [7.46s] [4831it] [647.53it/s][0m


Unnamed: 0,name,map,ndcg,P.10
0,BM25,0.403895,0.528329,0.709
1,BM25 >> KNRM,0.138719,0.335346,0.1365
2,Weighted SUM .1,0.396671,0.52525,0.7
3,Weighted SUM .2,0.383116,0.519345,0.6805
4,Weighted SUM .3,0.360074,0.508694,0.6395
5,Weighted SUM .4,0.32468,0.489525,0.5675
6,Weighted SUM .5,0.279876,0.460477,0.48
7,Weighted SUM .6,0.229331,0.418677,0.363
8,Weighted SUM .7,0.19105,0.384141,0.27
9,Weighted SUM .8,0.165684,0.359885,0.2145


Effettueremo il re-ranking con Vanilla BERT, sfruttando un modello pre-addestrato ma non sottoposto a fine-tuning.



In [None]:
# onir_pt is already imported by an earlier cell.
#import onir_pt

# Load a re-ranker from a published checkpoint, reading documents from 'text'.
# NOTE(review): the archive name ("scibert-medmarco") suggests a SciBERT model
# already trained on a MS MARCO subset - confirm this matches the
# "not tuned" description in the surrounding text.
vbert = onir_pt.reranker.from_checkpoint('https://macavaney.us/scibert-medmarco.tar.gz', text_field='text')

# BM25 candidates -> attach document text -> BERT re-scoring.
bert_pipeline = br >> pt.text.get_text(antique_test, 'text') >> vbert



[02;37m[2024-01-22 15:25:56,666][onir.util.download][DEBUG] [0m[37mdownloaded https://macavaney.us/scibert-medmarco.tar.gz [6.22s] [499M] [131MB/s][0m




[02;37m[2024-01-22 15:26:14,995][onir.util.download][DEBUG] [0m[37mdownloaded https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/pytorch_models/scibert_scivocab_uncased.tar [14.32s] [411M] [28.7MB/s] [md5 hash verified][0m


extracting: 411MB [1.59s, 259MB/s]
extracting: 821MB [9.35s, 87.8MB/s]


In [None]:
# Evaluate plain BM25 against BM25 re-ranked by the BERT checkpoint,
# reporting MAP, nDCG and precision at 10.
pt.Experiment(
    [br, bert_pipeline],
    topics,
    antique_test.get_qrels(),
    names=['BM25', 'BM25 >> BERT'],
    eval_metrics=["map", 'ndcg', 'P.10']
)

[02;37m[2024-01-22 15:26:54,787][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:26:54,947][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:29:29,951][onir_pt][DEBUG] [0m[37m[finished] batches: [02:35] [4831it] [31.17it/s][0m


Unnamed: 0,name,map,ndcg,P.10
0,BM25,0.403895,0.528329,0.709
1,BM25 >> BERT,0.369351,0.553099,0.6225


Utilizzeremo un modello pre-trained e un modello Vanilla BERT tuned

In [None]:
!pip install -q sentence_transformers ipdb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone


In [None]:
from sentence_transformers import CrossEncoder, SentenceTransformer
from pyterrier.transformer import TransformerBase

# Cross-encoder: scores a (query, document) pair jointly.
# NOTE(review): the model name indicates STS-B (sentence similarity) training
# rather than passage ranking - confirm it is the intended checkpoint.
crossmodel = CrossEncoder('cross-encoder/stsb-roberta-base')
# Bi-encoder: embeds queries and documents separately.
biencoder_model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [7ms<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [7ms<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [40ms<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [35ms<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [42ms<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [25ms<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [42ms<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [35ms<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [60ms<?, ?B/s]

config.json:   0%|          | 0.00/612 [44ms<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [66ms<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [48ms<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [34ms<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [22ms<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [41ms<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [19ms<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [32ms<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [59ms<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [41ms<?, ?B/s]

modules.json:   0%|          | 0.00/349 [67ms<?, ?B/s]

Applicheremo modelli Cross-Encoder e Bi-Encoder al nostro dataset.

I risultati di questi modelli vengono quindi utilizzati per ottenere i punteggi di rilevanza tra le query e i documenti utilizzando PyTerrier.

In [None]:
def _crossencoder_apply(df, column='text'):
    """Score every row of ``df`` with the cross-encoder.

    Args:
        df: result frame with a 'query' column and the document column.
        column: name of the column holding the document text.

    Returns:
        Whatever ``crossmodel.predict`` returns for the list of
        (query, document) pairs, one pair per row in row order.
    """
    pairs = [
        (query, document)
        for query, document in zip(df['query'].values, df[column].values)
    ]
    return crossmodel.predict(pairs)


from functools import partial

# Scorer bound to the 'text' column, wrapped as a batched PyTerrier transformer.
crossencoder_apply_title = partial(_crossencoder_apply, column='text')
cross_encT = pt.apply.doc_score(crossencoder_apply_title, batch_size=64)


from sentence_transformers.util import cos_sim


def _biencoder_apply(df):
    """Score every (query, text) row of ``df`` with the bi-encoder.

    Args:
        df: result frame with 'query' and 'text' columns.

    Returns:
        A 1-D tensor with one cosine similarity per row, in row order.
    """
    query_embs = biencoder_model.encode(df['query'].values)
    doc_embs = biencoder_model.encode(df['text'].values)
    # cos_sim() yields the full query-by-document matrix; the score that
    # belongs to row i (query i vs. document i) sits on the diagonal.
    # Taking row 0 instead scored every document against the first row's query,
    # which is only correct while a batch holds a single query.
    return cos_sim(query_embs, doc_embs).diagonal()


bi_encT = pt.apply.doc_score(_biencoder_apply, batch_size=64)

In [None]:
def _min_max(scored_pipeline):
    """Append PerQueryMaxMinScoreTransformer to ``scored_pipeline``."""
    return scored_pipeline >> pt.pipelines.PerQueryMaxMinScoreTransformer()


def _blend_with_bm25(normalized_reranker):
    """Combine a normalised re-ranker with normalised BM25, weight .5 each."""
    return .5*normalized_reranker + .5*normalized_br


# BM25 candidates -> attach document text -> neural scorer.
bi_pipeline = br >> pt.text.get_text(antique_test, 'text') >> bi_encT
cross_pipeline = br >> pt.text.get_text(antique_test, 'text') >> cross_encT

# Normalised variants, so that scores from different models can be summed.
normalized_br = _min_max(br)
normalized_bert_pipeline = _min_max(bert_pipeline)
normalized_bi_pipeline = _min_max(bi_pipeline)
normalized_cross_pipeline = _min_max(cross_pipeline)

# Equal-weight combinations of each neural re-ranker with BM25.
bert_sum_pipeline = _blend_with_bm25(normalized_bert_pipeline)
bi_sum_pipeline = _blend_with_bm25(normalized_bi_pipeline)
cross_sum_pipeline = _blend_with_bm25(normalized_cross_pipeline)

Effettueremo una comparazione delle prestazioni di diversi modelli (BM25, VBERT, Bi-Encoder, Cross-Encoder) in un contesto di recupero dell'informazione utilizzando PyTerrier.

I modelli sono valutati sia individualmente che in combinazione con il modello di base BM25.

In [None]:
# Final comparison: BM25 alone, each neural re-ranker on top of BM25, and the
# equal-weight combination of each re-ranker with BM25. The entries of
# `names` follow the same order as the pipelines in the first list.
pt.Experiment(
    [
        br,
        bert_pipeline,
        bert_sum_pipeline,
        bi_pipeline,
        bi_sum_pipeline,
        cross_pipeline,
        cross_sum_pipeline
    ],
    topics,
    antique_test.get_qrels(),
    names=[
        'BM25',
        'VBERT',
        '.5*VBERT + .5BM25',
        'BiEnc',
        '.5*BiEnc + .5BM25',
        'CrossEnc',
        '.5*CrossEnc + .5BM25'
      ],
    eval_metrics=["map", 'ndcg', 'P.10']
)

[02;37m[2024-01-22 15:30:24,789][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:30:24,793][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:33:00,200][onir_pt][DEBUG] [0m[37m[finished] batches: [02:35] [4831it] [31.09it/s][0m
[02;37m[2024-01-22 15:33:10,169][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2024-01-22 15:33:10,173][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/4831 [7ms<?, ?it/s]

[02;37m[2024-01-22 15:35:44,551][onir_pt][DEBUG] [0m[37m[finished] batches: [02:34] [4831it] [31.29it/s][0m


pt.apply:   0%|          | 0/198 [20ms<?, ?row/s]

pt.apply:   0%|          | 0/198 [7ms<?, ?row/s]

pt.apply:   0%|          | 0/198 [7ms<?, ?row/s]

pt.apply:   0%|          | 0/198 [15ms<?, ?row/s]

Unnamed: 0,name,map,ndcg,P.10
0,BM25,0.403895,0.528329,0.709
1,VBERT,0.369351,0.553099,0.6225
2,.5*VBERT + .5BM25,0.435593,0.564131,0.7405
3,BiEnc,0.348178,0.547101,0.5745
4,.5*BiEnc + .5BM25,0.426706,0.561012,0.733
5,CrossEnc,0.260948,0.453477,0.4005
6,.5*CrossEnc + .5BM25,0.394003,0.536019,0.6635
