# Information Retrieval - Notebook 4

## Requirements

In [1]:
# !pip install rank_bm25 spacy Sense2Vec
# !wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from rank_bm25 import BM25Okapi
# import spacy
# from sense2vec import Sense2Vec
tqdm.pandas()

In [2]:
# this turns on the autotimer, so that every cell has a timing information below
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime
# stop using:
# %unload_ext autotime

time: 235 µs (started: 2024-04-04 21:03:07 +02:00)


## Simplifying the data + writing it into a file

In [None]:
#origdocs = pd.read_csv('our.msmarco.docs.tsv',sep='\t',usecols=[1,2,3])
#origdocs['title'].fillna('-', inplace=True)
#origdocs['body'].fillna('-', inplace=True)
#origdocs

In [None]:
#ourdocs = pd.DataFrame(columns = ['docid', 'text'])
#ourdocs['docid']=origdocs.docid
#ourdocs['text']=origdocs.title+' '+origdocs.body
#ourdocs

In [7]:
#del origdocs # saving memory

time: 185 µs (started: 2024-03-27 23:55:50 +01:00)


In [3]:
# ~ 30 seconds
ourdocs = pd.read_csv('/bigstorage/pavlo/testing/our.text.msmarco.docs_2.tsv',sep='\t', usecols=['docid','text'])

time: 12.6 s (started: 2024-04-04 21:03:17 +02:00)


In [4]:
ourdocs

Unnamed: 0,docid,text
0,D2981241,What do you call a group of lions? Lions Vocab...
1,D687756,". The A Priori Argument ( also, Rationalizatio..."
2,D913099,Everything You Need To Learn How To Cook Veget...
3,D328017,"What is the difference between latitude, longi..."
4,D1636347,When was the pulley invented? Answers.com ® Wi...
...,...,...
92560,D3379210,Top 39 Doctor insights on: Can An Iud Cause Ha...
92561,D3068739,How to get back your DirecTV cancellation fees...
92562,D1590402,Certification FAQs Fingerprinting 1. Where can...
92563,D2175490,Greenhouse gas emissions by Canadian economic ...


time: 11.2 ms (started: 2024-04-04 21:03:31 +02:00)


## Vectorize

In [5]:
# Long: takes > 3min
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode')
X = vectorizer.fit_transform(ourdocs.text)
print('Vectorization is done', X.shape)

Vectorization is done (92565, 2067446)
time: 3min 27s (started: 2024-04-04 21:03:35 +02:00)


In [7]:
queries = pd.read_csv('/bigstorage/pavlo/testing/our.msmarco.queries.tsv',sep='\t',usecols=[1,2]) 
training_queries=queries.iloc[:500]
testing_queries=queries.iloc[500:]
training_queries

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
...,...,...
495,133970,definition of dietary fiber
496,79788,can you start up a video record?
497,791583,what is rheumatoid spondylosis
498,732078,what is coleman fuel made out of


time: 11.5 ms (started: 2024-04-04 21:07:46 +02:00)


In [8]:
train_qvs = vectorizer.transform(training_queries['query'])
test_qvs = vectorizer.transform(testing_queries['query'])
train_qvs.shape

(500, 2067446)

time: 77.2 ms (started: 2024-04-04 21:07:54 +02:00)


### 🚧 Todo:

- What happens for words that are in a query and not in the documents?

If they are not present in the learned vocabulary (based on the documents) then they won't be represented in the resulting document-term matrix.

- Why did we use $transform$ and not $fit_transform$?

We used $fit_transform$ on the docs and the corresponding document-term matrix was created alongside with the learned weights. As we have already learned the weights, we can apply directly $transform$ to the training and testing queries.

In [9]:
train_qvs[0].shape

(1, 2067446)

time: 5.51 ms (started: 2024-04-04 21:13:41 +02:00)


In [11]:
gold = pd.read_csv('/bigstorage/pavlo/testing/our.msmarco.gold.tsv',sep='\t',usecols=[1,3,4,5])
gold

Unnamed: 0,qid,docid,rank,score
0,310290,D579750,1,-5.11498
1,310290,D579754,2,-5.57703
2,310290,D2380815,3,-5.84852
3,310290,D822566,4,-5.95002
4,310290,D2249695,5,-6.08326
...,...,...,...,...
99995,257942,D253854,96,-6.32693
99996,257942,D3056621,97,-6.32837
99997,257942,D1323491,98,-6.32871
99998,257942,D2722485,99,-6.33100


time: 110 ms (started: 2024-04-04 21:13:48 +02:00)


## Scores

In [12]:
qid = 257942
# understanding the score column:
gold[(gold.qid==qid)]

Unnamed: 0,qid,docid,rank,score
99900,257942,D1451841,1,-5.66844
99901,257942,D1361861,2,-5.75280
99902,257942,D2627169,3,-5.76717
99903,257942,D3074165,4,-5.86465
99904,257942,D1361864,5,-5.95433
...,...,...,...,...
99995,257942,D253854,96,-6.32693
99996,257942,D3056621,97,-6.32837
99997,257942,D1323491,98,-6.32871
99998,257942,D2722485,99,-6.33100


time: 15.3 ms (started: 2024-04-04 21:17:42 +02:00)


In [13]:
# getting the top 10 documents for each query
qgold = gold[(gold.qid==qid) & (gold['rank']<=10)].docid
qgold

99900    D1451841
99901    D1361861
99902    D2627169
99903    D3074165
99904    D1361864
99905    D2531901
99906    D1624776
99907    D2926659
99908    D1805809
99909     D885919
Name: docid, dtype: object

time: 10.7 ms (started: 2024-04-04 21:17:50 +02:00)


In [14]:
# getting the relevance score for the top document for the query
gold[(gold.qid==qid) & (gold.docid==qgold.iloc[0])].score

99900   -5.66844
Name: score, dtype: float64

time: 35.3 ms (started: 2024-04-04 21:17:52 +02:00)


In [15]:
# getting the average relevance score for all top results of test queries
def get_avg_score(testing_queries, top=10):
    return gold[(gold.qid.isin(testing_queries.qid)) & (gold['rank']<=top)].score.mean()

for top in [1, 2, 3, 4, 5, 10, 100]:
    print(top, get_avg_score(testing_queries, top))



1 -5.197412539999999
2 -5.27601475
3 -5.33337584
4 -5.378957215000001
5 -5.416540648
10 -5.543673964
100 -6.049410218599999
time: 42.5 ms (started: 2024-04-04 21:17:55 +02:00)


### p@10

In [71]:
def pAt10(qid, k=10, docs=ourdocs, gold=gold):
    id = training_queries[training_queries.qid==qid].index[0] # get the index of the query in the training_queries
    xqv = X*train_qvs[id].T # cosine similarity between query and all documents
    predki = np.argpartition(xqv.A.flat, -k)[-k:] # top k
    query = training_queries[training_queries.qid==qid]['query'].values[0]
    sampledocs = [query] + ourdocs.loc[predki].text.to_list()
    embeddings = model.encode(sampledocs)
    similarities = cos_sim(embeddings[0], embeddings[1:])
    df = docs.loc[predki].copy(deep=True)
    df['similarity'] = similarities.flatten()
    df = df.sort_values(by='similarity', ascending=False)
    intersection = np.intersect1d(df.loc[predki].docid.head(10),gold[gold.qid==qid].docid)
    return len(intersection)/10

time: 740 µs (started: 2024-04-04 23:03:19 +02:00)


## Case Study

### Can we improve the relevance score by using a dense embedding?

In [16]:
training_queries[training_queries.qid==251898]

Unnamed: 0,qid,query
111,251898,how long does getting a doctorate take


time: 12.6 ms (started: 2024-04-04 21:18:16 +02:00)


In [17]:
gold[gold.qid==251898]

Unnamed: 0,qid,docid,rank,score
36200,251898,D2865964,1,-4.74293
36201,251898,D3557816,2,-4.90695
36202,251898,D2723985,3,-4.95911
36203,251898,D1951655,4,-4.97272
36204,251898,D1709749,5,-5.02176
...,...,...,...,...
36295,251898,D2531901,96,-5.56896
36296,251898,D2956542,97,-5.57138
36297,251898,D301873,98,-5.57262
36298,251898,D2952336,99,-5.57504


time: 23.2 ms (started: 2024-04-04 21:18:33 +02:00)


In [59]:
qid = 729561
gold[gold.qid==qid].docid

# Show lines of ourdocs where docid is in gold[gold.qid==qid].docid
ourdocs[ourdocs.docid.isin(gold[(gold.qid == qid) & (gold['rank'] <= 20)].docid)]
# gold[(gold.qid == qid) & (gold['rank'] <= 10)]
df = ourdocs[ourdocs.docid.isin(gold[(gold.qid == qid) & (gold['rank'] <= 20)].docid)].copy()
df['counter'] = range(1, len(df) + 1)
df

Unnamed: 0,docid,text,counter
737,D2696063,What tv channel is syfy on if you have dish ne...,1
2277,D2516893,Chapter 12-Nervous System 129 terms rotilla10C...,2
2322,D588822,What channel is NBC on XFINITY? Comcast Xfinit...,3
2955,D1422234,What channel is the yes network on comcast in ...,4
10801,D831199,Chapter 7: Nerve Cells Chapter 7: Nerve Cells8...,5
10953,D214463,What channel is wgn on Verizon fios? C.skater ...,6
15313,D1642934,Types of Receptors 38 terms brittney_harvell T...,7
15749,D1137136,What channel is tnt on dish network? Answers.c...,8
22362,D893699,What channel is wgn on Verizon fios? C.skater ...,9
22632,D1203964,What channel is CBS for DISH Network? Answers....,10


time: 71.6 ms (started: 2024-04-04 23:01:11 +02:00)


### 🚧 Todo:
- Complete the xxx in the lines 
- Comment each line

In [60]:
def show_doc(qid, rank=0, show_gold=False, k=10, similarities=None, docs=ourdocs, gold=gold):
    """
    qid: query id
    rank: rank of the document to show. if rank=0, show the top k documents
    show_gold: if True, show the gold documents. if False, show the predicted documents
    k: number of documents to show
    docs: documents DataFrame
    gold: gold DataFrame
    """
    print(f"query: {training_queries[training_queries.qid==qid]['query'].values[0]}")
    id = training_queries[training_queries.qid==qid].index[0] # get the index of the query in the training_queries
    xqv = X*train_qvs[id].T # cosine similarity between query and all documents
    predki = np.argpartition(xqv.A.flat, -k)[-k:] # top k
    if show_gold:
        if rank == 0:
            print(f"top {k} gold documents for query {qid}:")
            df = ourdocs[ourdocs.docid.isin(gold[(gold.qid == qid) & (gold['rank'] <= k)].docid)].copy() # Create a copy to avoid modifying the original DataFrame
            pred10i = np.argpartition(xqv.A.flat, -10)[-10:] # top 10
            pred100i = np.argpartition(xqv.A.flat, -100)[-100:] # top 100
            df['in_top_10_pred'] = df['docid'].isin(docs.loc[pred10i].docid)
            df['in_top_100_pred'] = df['docid'].isin(docs.loc[pred100i].docid)
            display(df)
        else:
            print(f"gold document {rank} for query {qid}:")
            display(ourdocs[ourdocs.docid==gold[(gold.qid==qid) & (gold['rank']==rank)].docid.iloc[0]].text.values[0][:999])
    else: # show predictions
        if rank == 0:
            print(f"top {k} predicted documents for query {qid}:")
           
            qgold10 = gold[(gold.qid == qid) & (gold['rank'] <= 10)].docid
            qgold100 = gold[(gold.qid == qid) & (gold['rank'] <= 100)].docid
    
            df = docs.loc[predki].copy()  # Create a copy to avoid modifying the original DataFrame
            df['in_qgold10'] = df['docid'].isin(qgold10)
            df['in_qgold100'] = df['docid'].isin(qgold100)
            if similarities is not None:
                df['similarity'] = similarities
                df = df.sort_values(by='similarity', ascending=False)
            display(df)
        else:
            print(f"predicted document {rank} for query {qid}:")
            display(ourdocs[ourdocs.docid==docs.loc[predki].docid.iloc[rank]].text.values[0][:999])


time: 7.17 ms (started: 2024-04-04 23:01:18 +02:00)


#### Call 1

In [21]:
show_doc(729561,rank=7)

query: what is channeling
predicted document 7 for query 729561:


". Arc Resistant Switchgear ANSI/IEEE C37.20.7 defines switchgear arc resistance in two basic categories: ANSI type 1 Arc resistance from the front of gear only ANSI type 2 Arc resistance provided from the front, sides and rear A suffix may be added to either of these two types to further define the type of protection provided: A: Basic design B: Arc resistance is maintained even while opening designated low voltage compartments C: Arc resistance is maintained even when opening designated adjacent compartments D: Special designation that supplements the Type 1 designation, but identifies additional arc resistance in certain structures Eaton offers arc resistant medium voltage (MV),and low voltage (LV) switchgear and arc resistant MV motor control. Eaton also offers a unique “arc preventative” LV motor control center (Flash Gard) that protects the operator by reducing the likelihood that an arc would occur. Arc Resistant MV Switchgear Eaton's VCP-W arc-resistant vacuum switchgear (5 k V

time: 364 ms (started: 2024-04-04 21:20:24 +02:00)


#### Call 2

In [24]:
show_doc(729561)

query: what is channeling
top 10 predicted documents for query 729561:


Unnamed: 0,docid,text,in_qgold10,in_qgold100
39754,D858701,Polar Bears Polar Bears Spending the dark wint...,False,False
25371,D461511,Earrings dream meaning Earrings dream meaning ...,False,False
15563,D2167062,What to do with lemon curd. Are you channeling...,False,False
64517,D3400862,FAQ FAQHow do I select the right replacement t...,False,False
49665,D1681032,Behind the gates at Ramtha's school By Lisa Pe...,False,False
86431,D567760,"Roosevelt Roosevelt [ roh -z uh -velt, -v uh l...",False,False
65542,D873198,"The Magician The Magician February 27, 2014 by...",False,False
56830,D1636027,. Arc Resistant Switchgear ANSI/IEEE C37.20.7 ...,False,False
67662,D685247,What is Channeling? What is Channeling? By SHE...,False,True
34985,D1690699,- Are you channeling your best self with this ...,False,False


time: 359 ms (started: 2024-04-04 21:20:58 +02:00)


#### Call 3

In [25]:
show_doc(729561,show_gold=True, k=20)

query: what is channeling
top 20 gold documents for query 729561:


Unnamed: 0,docid,text,in_top_10_pred,in_top_100_pred
737,D2696063,What tv channel is syfy on if you have dish ne...,False,False
2277,D2516893,Chapter 12-Nervous System 129 terms rotilla10C...,False,False
2322,D588822,What channel is NBC on XFINITY? Comcast Xfinit...,False,False
2955,D1422234,What channel is the yes network on comcast in ...,False,False
10801,D831199,Chapter 7: Nerve Cells Chapter 7: Nerve Cells8...,False,False
10953,D214463,What channel is wgn on Verizon fios? C.skater ...,False,False
15313,D1642934,Types of Receptors 38 terms brittney_harvell T...,False,False
15749,D1137136,What channel is tnt on dish network? Answers.c...,False,False
22362,D893699,What channel is wgn on Verizon fios? C.skater ...,False,False
22632,D1203964,What channel is CBS for DISH Network? Answers....,False,False


time: 323 ms (started: 2024-04-04 21:21:43 +02:00)


#### 2nd case:

In [26]:
show_doc(251898)

query: how long does getting a doctorate take
top 10 predicted documents for query 251898:


Unnamed: 0,docid,text,in_qgold10,in_qgold100
50610,D2749717,How long does it take to become a Physical The...,False,True
56982,D2865964,How long does it take to get a post-doctoral d...,True,True
83787,D1951655,How long does it take to get a Doctorate of Nu...,True,True
67623,D2723985,How long would it take to get a doctorate's de...,True,True
66313,D2956542,How long does it take to finish a doctorate? E...,False,True
86925,D2703712,How Long Does It Take To Become A Psychologist...,False,True
52957,D1402632,How to get doctorate degree How to get doctora...,False,True
42225,D2782441,How many years does a doctorate degree require...,False,True
65033,D1736725,How long does it take to get a doctoral degree...,True,True
91275,D3090762,How Long Does It Take to Get a Doctorate in En...,False,True


time: 319 ms (started: 2024-04-04 21:22:03 +02:00)


#### TODO: 3rd case:
- Find yourself another case where the retrieved top 10 contains interesting documents but the order is not good and the top 1 is not relevant

The queries ID = 947184, 651254 and 685501 meet those requirements.


In [76]:
show_doc(685501)

query: what is a good substitute for coconut oil
top 10 predicted documents for query 685501:


Unnamed: 0,docid,text,in_qgold10,in_qgold100
29239,D795578,Why and How to Use Coconut Oil to Replace Butt...,False,True
70763,D3350443,Is Coconut Oil Good for Frying on High Tempera...,False,True
1798,D1879579,What Is the Difference Between Coconut Oil & C...,False,True
25483,D423625,How to Use Coconut Oil 1 Select the right kind...,False,True
62124,D2921380,How to Use Coconut Oil 1 Select the right kind...,False,True
17026,D2152717,How Much Coconut Oil Should I Eat Daily? How M...,False,True
28043,D3082321,How to Substitute Coconut Oil for Vegetable Oi...,False,True
3729,D795576,What is a Good Substitute for Coconut Milk and...,True,True
8131,D1401032,How Do I Store Coconut Oil? How Do I Store Coc...,False,True
72564,D1403176,The Differences Between Refined and Virgin Coc...,False,True


time: 329 ms (started: 2024-04-04 23:07:50 +02:00)


# Dense embeddings - MixedBread

#### Sample: https://www.mixedbread.ai/blog/mxbai-embed-large-v1 

#### Model Load

In [63]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# 1. load model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# For retrieval you need to pass this prompt.
query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread'

sampledocs = [
    query,
    "A man is eating food.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
]

time: 1.57 s (started: 2024-04-04 23:01:46 +02:00)


#### Encode

In [64]:
# 2. Encode
embeddings = model.encode(sampledocs)

similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities)

similarities: tensor([[0.7920, 0.6369, 0.1651, 0.3621]])
time: 61.5 ms (started: 2024-04-04 23:01:49 +02:00)


### 1st Case - Ms-Marco

In [65]:
k=10
id = training_queries[training_queries.qid==qid].index[0] # get the index of the query in the training_queries
xqv = X*train_qvs[id].T # cosine similarity between query and all documents
predki = np.argpartition(xqv.A.flat, -k)[-k:] # top k
ourdocs.loc[predki].text


39754    Polar Bears Polar Bears Spending the dark wint...
25371    Earrings dream meaning Earrings dream meaning ...
15563    What to do with lemon curd. Are you channeling...
64517    FAQ FAQHow do I select the right replacement t...
49665    Behind the gates at Ramtha's school By Lisa Pe...
86431    Roosevelt Roosevelt [ roh -z uh -velt, -v uh l...
65542    The Magician The Magician February 27, 2014 by...
56830    . Arc Resistant Switchgear ANSI/IEEE C37.20.7 ...
67662    What is Channeling? What is Channeling? By SHE...
34985    - Are you channeling your best self with this ...
Name: text, dtype: object

time: 372 ms (started: 2024-04-04 23:01:52 +02:00)


In [66]:
query = f"Represent this sentence for searching relevant passages: {training_queries[training_queries.qid==qid]['query'].values[0]}"
print(query)

Represent this sentence for searching relevant passages: what is channeling
time: 1.11 ms (started: 2024-04-04 23:02:02 +02:00)


In [67]:
sampledocs = [query] + ourdocs.loc[predki].text.to_list()
embeddings = model.encode(sampledocs) # this takes > a second per document although the context size is only 512 tokens

time: 395 ms (started: 2024-04-04 23:02:12 +02:00)


In [68]:
similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities[0])

similarities: tensor([0.3053, 0.4345, 0.3230, 0.3587, 0.5007, 0.3244, 0.4571, 0.3515, 0.8457,
        0.2805])
time: 2.98 ms (started: 2024-04-04 23:02:14 +02:00)


In [69]:
df = ourdocs.loc[predki].copy()  # Create a copy to avoid modifying the original DataFrame
df['similarity'] = similarities[0]
df = df.sort_values(by='similarity', ascending=False)
df

Unnamed: 0,docid,text,similarity
67662,D685247,What is Channeling? What is Channeling? By SHE...,0.845656
49665,D1681032,Behind the gates at Ramtha's school By Lisa Pe...,0.500672
65542,D873198,"The Magician The Magician February 27, 2014 by...",0.457076
25371,D461511,Earrings dream meaning Earrings dream meaning ...,0.434478
64517,D3400862,FAQ FAQHow do I select the right replacement t...,0.358695
56830,D1636027,. Arc Resistant Switchgear ANSI/IEEE C37.20.7 ...,0.351459
86431,D567760,"Roosevelt Roosevelt [ roh -z uh -velt, -v uh l...",0.324389
15563,D2167062,What to do with lemon curd. Are you channeling...,0.323024
39754,D858701,Polar Bears Polar Bears Spending the dark wint...,0.305322
34985,D1690699,- Are you channeling your best self with this ...,0.280519


time: 22.5 ms (started: 2024-04-04 23:02:24 +02:00)


In [70]:
show_doc(729561, similarities=similarities[0])


query: what is channeling
top 10 predicted documents for query 729561:


Unnamed: 0,docid,text,in_qgold10,in_qgold100,similarity
67662,D685247,What is Channeling? What is Channeling? By SHE...,False,True,0.845656
49665,D1681032,Behind the gates at Ramtha's school By Lisa Pe...,False,False,0.500672
65542,D873198,"The Magician The Magician February 27, 2014 by...",False,False,0.457076
25371,D461511,Earrings dream meaning Earrings dream meaning ...,False,False,0.434478
64517,D3400862,FAQ FAQHow do I select the right replacement t...,False,False,0.358695
56830,D1636027,. Arc Resistant Switchgear ANSI/IEEE C37.20.7 ...,False,False,0.351459
86431,D567760,"Roosevelt Roosevelt [ roh -z uh -velt, -v uh l...",False,False,0.324389
15563,D2167062,What to do with lemon curd. Are you channeling...,False,False,0.323024
39754,D858701,Polar Bears Polar Bears Spending the dark wint...,False,False,0.305322
34985,D1690699,- Are you channeling your best self with this ...,False,False,0.280519


time: 378 ms (started: 2024-04-04 23:02:39 +02:00)


### p@10

In [72]:
print(f"P@10 for {qid} when k = 10: ",pAt10(qid, k=10))

P@10 for 729561 when k = 10:  0.1
time: 713 ms (started: 2024-04-04 23:04:58 +02:00)


### 2nd Case

In [73]:
qid = 251898
show_doc(qid)


query: how long does getting a doctorate take
top 10 predicted documents for query 251898:


Unnamed: 0,docid,text,in_qgold10,in_qgold100
50610,D2749717,How long does it take to become a Physical The...,False,True
56982,D2865964,How long does it take to get a post-doctoral d...,True,True
83787,D1951655,How long does it take to get a Doctorate of Nu...,True,True
67623,D2723985,How long would it take to get a doctorate's de...,True,True
66313,D2956542,How long does it take to finish a doctorate? E...,False,True
86925,D2703712,How Long Does It Take To Become A Psychologist...,False,True
52957,D1402632,How to get doctorate degree How to get doctora...,False,True
42225,D2782441,How many years does a doctorate degree require...,False,True
65033,D1736725,How long does it take to get a doctoral degree...,True,True
91275,D3090762,How Long Does It Take to Get a Doctorate in En...,False,True


time: 322 ms (started: 2024-04-04 23:05:15 +02:00)


In [74]:
k=10
id = training_queries[training_queries.qid==qid].index[0] # get the index of the query in the training_queries
xqv = X*train_qvs[id].T # cosine similarity between query and all documents
predki = np.argpartition(xqv.A.flat, -k)[-k:] # top k
ourdocs.loc[predki].text
query = f"Represent this sentence for searching relevant passages: {training_queries[training_queries.qid==qid]['query'].values[0]}"
sampledocs = [query] + ourdocs.loc[predki].text.to_list()
embeddings = model.encode(sampledocs)
similarities = cos_sim(embeddings[0], embeddings[1:])
show_doc(qid, similarities=similarities[0])


query: how long does getting a doctorate take
top 10 predicted documents for query 251898:


Unnamed: 0,docid,text,in_qgold10,in_qgold100,similarity
66313,D2956542,How long does it take to finish a doctorate? E...,False,True,0.843658
91275,D3090762,How Long Does It Take to Get a Doctorate in En...,False,True,0.840919
65033,D1736725,How long does it take to get a doctoral degree...,True,True,0.83736
56982,D2865964,How long does it take to get a post-doctoral d...,True,True,0.813156
42225,D2782441,How many years does a doctorate degree require...,False,True,0.804683
67623,D2723985,How long would it take to get a doctorate's de...,True,True,0.773761
83787,D1951655,How long does it take to get a Doctorate of Nu...,True,True,0.756096
52957,D1402632,How to get doctorate degree How to get doctora...,False,True,0.7416
86925,D2703712,How Long Does It Take To Become A Psychologist...,False,True,0.721173
50610,D2749717,How long does it take to become a Physical The...,False,True,0.667144


time: 934 ms (started: 2024-04-04 23:05:23 +02:00)


### p@10

In [75]:
print(f"P@10 for {qid} when k = 10: ",pAt10(qid, k=10))

P@10 for 251898 when k = 10:  1.0
time: 669 ms (started: 2024-04-04 23:05:44 +02:00)


### 🚧 Todo:
- Do your 3rd case and comment on the results.

In [45]:
queries.head(30)

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
5,400614,is a personality disorder a mental illness
6,79798,can you stick weld aluminum
7,92082,cholesterol and triglycerides are because of t...
8,120204,define detailed design (software)
9,133639,definition of cosmic


time: 8.66 ms (started: 2024-04-04 22:45:55 +02:00)


In [79]:
k=10
id = training_queries[training_queries.qid==685501].index[0] # get the index of the query in the training_queries
xqv = X*train_qvs[id].T # cosine similarity between query and all documents
predki = np.argpartition(xqv.A.flat, -k)[-k:] # top k
ourdocs.loc[predki].text
query = training_queries[training_queries.qid==685501]['query'].values[0]
sampledocs = [query] + ourdocs.loc[predki].text.to_list()
embeddings = model.encode(sampledocs)
similarities = cos_sim(embeddings[0], embeddings[1:])
show_doc(685501, similarities=similarities[0])

query: what is a good substitute for coconut oil
top 10 predicted documents for query 685501:


Unnamed: 0,docid,text,in_qgold10,in_qgold100,similarity
28043,D3082321,How to Substitute Coconut Oil for Vegetable Oi...,False,True,0.845558
29239,D795578,Why and How to Use Coconut Oil to Replace Butt...,False,True,0.816008
3729,D795576,What is a Good Substitute for Coconut Milk and...,True,True,0.813952
25483,D423625,How to Use Coconut Oil 1 Select the right kind...,False,True,0.746064
62124,D2921380,How to Use Coconut Oil 1 Select the right kind...,False,True,0.746064
1798,D1879579,What Is the Difference Between Coconut Oil & C...,False,True,0.719958
17026,D2152717,How Much Coconut Oil Should I Eat Daily? How M...,False,True,0.714263
72564,D1403176,The Differences Between Refined and Virgin Coc...,False,True,0.688196
8131,D1401032,How Do I Store Coconut Oil? How Do I Store Coc...,False,True,0.6852
70763,D3350443,Is Coconut Oil Good for Frying on High Tempera...,False,True,0.682068


time: 959 ms (started: 2024-04-04 23:10:13 +02:00)


#### Analysis Results

1. None of the documents listed were marked as relevant in the top 10 results (in_qgold10=False for all but one entry), yet all were marked as relevant in the top 100 results (in_qgold100=True for all entries). This suggests that the criteria for relevance become broader as the result set expands or that some relevant documents are ranked lower and only appear as relevant when considering a larger set of top results. Notably, the document with docid of D795576 is an exception being marked as relevant in both sets, indicating a high relevance to the query.


2. The similarity scores do not strictly dictate whether a document is within the top 10 for relevance (in_qgold10). For instance, the document with the highest similarity score (0.845558) was not considered relevant in the top 10 but was in the top 100. This observation could suggest that the threshold for relevance in the top 10 is very strict or based on additional criteria not captured solely by the similarity score.

In [78]:
print(f"P@10 for {685501} when k = 10: ",pAt10(685501, k=10))

P@10 for 685501 when k = 10:  1.0
time: 625 ms (started: 2024-04-04 23:09:56 +02:00)


### 🚧 Todo:
- Do an evaluation on our 3 cases to see whether the relevance score improves.

#### Summary + Evaluation of Results

In [82]:
print(f"P@10 for {729561} when k = 10: ",pAt10(729561, k=10))
print(f"P@10 for {251898} when k = 10: ",pAt10(251898, k=10))
print(f"P@10 for {685501} when k = 10: ",pAt10(685501, k=10))
print("----------------------------------------------------")
print(f"P@50 for {729561} when k = 50: ",pAt10(729561, k=50))
print(f"P@50 for {251898} when k = 50: ",pAt10(251898, k=50))
print(f"P@50 for {685501} when k = 50: ",pAt10(685501, k=50))
print("----------------------------------------------------")
print(f"P@100 for {729561} when k = 100: ",pAt10(729561, k=100))
print(f"P@100 for {251898} when k = 100: ",pAt10(251898, k=100))
print(f"P@100 for {685501} when k = 100: ",pAt10(685501, k=100))

P@10 for 729561 when k = 10:  0.1
P@10 for 251898 when k = 10:  1.0
P@10 for 685501 when k = 10:  1.0
----------------------------------------------------
P@50 for 729561 when k = 50:  0.0
P@50 for 251898 when k = 50:  0.5
P@50 for 685501 when k = 50:  0.7
----------------------------------------------------
P@100 for 729561 when k = 100:  0.0
P@100 for 251898 when k = 100:  0.5
P@100 for 685501 when k = 100:  0.5
time: 15.3 s (started: 2024-04-04 23:23:12 +02:00)


#### Results

Given the results indicating precision at different cutoffs (P@10, P@50, and P@100) for the three queries (729561, 251898, and 685501), we can interpret the retrieval system's performance:

##### P@10 Results

- **Query 729561**: Precision at 10 (P@10) is 0.1, suggesting only 1 out of the top 10 documents is relevant, indicating low precision for this query within the top 10 results.

- **Query 251898 & Query 685501**: Both queries achieve a P@10 of 1.0, meaning all top 10 documents are relevant for these queries, indicating excellent precision within the top 10 results.

##### P@50 Results

- **Query 729561**: Precision at 50 (P@50) is 0.0, indicating no relevant documents are found within the top 50, suggesting the query might be challenging or poorly represented in the data.

- **Query 251898**: P@50 is 0.5, indicating 25 out of the top 50 documents are relevant, showing a significant drop in precision from the top 10 to the top 50, which might be due to a dilution of relevant documents as more results are considered.

- **Query 685501**: P@50 is 0.7, meaning 35 out of the top 50 documents are relevant, indicating relatively high precision even when expanding the set of results to the top 50.

##### P@100 Results

- **Query 729561**: P@100 remains 0.0, consistent with P@50, underscoring challenges with finding relevant documents for this query even within the top 100 results.

- **Query 251898 & Query 685501**: Both queries have a P@100 of 0.5, indicating 50 out of the top 100 documents are relevant. This maintains the precision level seen at P@50 for these queries but shows a significant decrease from P@10, reflecting the challenge in maintaining high precision as more results are included.

##### General Interpretation

- **Consistency Across Different Ks**: For query 729561, the precision is consistently low or nonexistent across different cutoffs, suggesting specific difficulties in retrieving relevant documents for this query. For queries 251898 and 685501, there's a notable drop in precision from P@10 to P@50 and P@100, indicating that while the system effectively identifies relevant documents within the top 10, its ability to do so diminishes as the number of considered documents increases.

- **Query Specificity and System Effectiveness**: The variation in performance across different queries suggests that the retrieval system's effectiveness might vary significantly based on the query's specificity and the availability of relevant documents. Queries 251898 and 685501 seem well-served within the top 10 results, but for broader result sets, the precision decreases.

### Check out the tokenization

In [84]:
from transformers import AutoModel, AutoTokenizer
model_id = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id) #.cuda()

time: 505 ms (started: 2024-04-04 23:29:05 +02:00)


In [85]:
long_words = [
    "pneumonoultramicroscopicsilicovolcanoconiosis",  # A lung disease caused by inhaling silica dust
    "pseudopseudohypoparathyroidism",             # A rare genetic disorder
    "hippopotomonstrosesquippedaliophobia",         # Fear of long words (interestingly enough)
    "anticonstitutionallyestablishmentarianism",  # Opposition to establishing a constitution
    "supercalifragilisticexpialidocious",          # From Mary Poppins (not actually that long!)
    "floccinaucinihilipilification",              # The act of estimating something as worthless
    "electroencephalogram",                         # Recording electrical activity of the brain
    "rhinencephalospheropneumonectomy",            # Surgical removal of part of the brain and lung
    "incomprehensibilities",                       # The state of being impossible to understand
    "honorificabilitudinitatibus",                 # (Shakespearean) A long word for "worthiness"
]

' '.join(tokenizer.tokenize('Hello wondrous information retrieval world: Oh, an unseenword!'+' '.join(long_words)))

'hello won ##dro ##us information retrieval world : oh , an unseen ##word ! p ##ne ##um ##ono ##ult ##ram ##ic ##ros ##copic ##sil ##ico ##vo ##lc ##ano ##con ##ios ##is pseudo ##pse ##ud ##oh ##yp ##opa ##rath ##yr ##oid ##ism hip ##pop ##oto ##mons ##tro ##ses ##qui ##pped ##ali ##op ##ho ##bia anti ##con ##sti ##tu ##tion ##ally ##est ##ab ##lish ##ment ##arian ##ism super ##cal ##if ##rag ##ilis ##tic ##ex ##pia ##lid ##oc ##ious fl ##oc ##cina ##uc ##ini ##hil ##ip ##ili ##fication electro ##ence ##pha ##logram rhine ##nce ##pha ##los ##pher ##op ##ne ##um ##one ##ct ##omy inc ##omp ##re ##hen ##si ##bilities honor ##ific ##abi ##lit ##udi ##ni ##tat ##ib ##us'

time: 4.33 ms (started: 2024-04-04 23:29:10 +02:00)


### 🚧 Todo:

- Write a function that gives a list of tokens for a given words that are split up by the tokenizer

In [89]:
#Tokenizes a list of words and returns a list of tokens for each word.
def tokenizer_splitter(tokenizer, words):
    return [tokenizer.tokenize(word) for word in words]

time: 1.1 ms (started: 2024-04-04 23:34:08 +02:00)


In [90]:
tokenized_long_words = tokenizer_splitter(tokenizer, long_words)

# Display the result
for word, tokens in zip(long_words, tokenized_long_words):
    print(f"Word: {word}")
    print(f"Tokens: {tokens}\n")

Word: pneumonoultramicroscopicsilicovolcanoconiosis
Tokens: ['p', '##ne', '##um', '##ono', '##ult', '##ram', '##ic', '##ros', '##copic', '##sil', '##ico', '##vo', '##lc', '##ano', '##con', '##ios', '##is']

Word: pseudopseudohypoparathyroidism
Tokens: ['pseudo', '##pse', '##ud', '##oh', '##yp', '##opa', '##rath', '##yr', '##oid', '##ism']

Word: hippopotomonstrosesquippedaliophobia
Tokens: ['hip', '##pop', '##oto', '##mons', '##tro', '##ses', '##qui', '##pped', '##ali', '##op', '##ho', '##bia']

Word: anticonstitutionallyestablishmentarianism
Tokens: ['anti', '##con', '##sti', '##tu', '##tion', '##ally', '##est', '##ab', '##lish', '##ment', '##arian', '##ism']

Word: supercalifragilisticexpialidocious
Tokens: ['super', '##cal', '##if', '##rag', '##ilis', '##tic', '##ex', '##pia', '##lid', '##oc', '##ious']

Word: floccinaucinihilipilification
Tokens: ['fl', '##oc', '##cina', '##uc', '##ini', '##hil', '##ip', '##ili', '##fication']

Word: electroencephalogram
Tokens: ['electro', '##ence

### Other things that we could look into (if we had the time and the computational resources):

- Easy:
  - Use the mixedbread reranking feature https://www.mixedbread.ai/blog/mxbai-rerank-v1 (see below)


- Hard
  - Complete text embeddings beyond 512 tokens
  - And then saving and sharing precomputed embeddings
  - Using only this embedding on all documents and compare it with tf-idf/bm25


In [93]:
def query_info(qid, k=10, training_queries=training_queries, ourdocs=ourdocs, gold=gold):
    # Directly retrieve the query vector for the given qid
    qv = train_qvs[training_queries[training_queries.qid==qid].index[0]].T
    # Compute cosine similarity between the query vector and all document vectors
    xqv = X * qv
    # Get indices of top k documents based on similarity
    predki = np.argpartition(xqv.A.flat, -k)[-k:]
    # Retrieve the query text
    query = training_queries.loc[training_queries.qid==qid, 'query'].values[0]
    # Retrieve the text of the predicted top k documents
    sampledocs = ourdocs.loc[predki, 'text'].to_list()
    return query, sampledocs


time: 1.1 ms (started: 2024-04-04 23:41:04 +02:00)


In [95]:
from mixedbread_ai.client import MixedbreadAI

mxbai = MixedbreadAI(api_key="{MIXEDBREAD_API_KEY}")
query1, sampledocs1 = query_info(729561)
res1 = mxbai.reranking(
    model="mixedbread-ai/mxbai-rerank-large-v1",
    query=query1,
    input=sampledocs1,
    top_k=10,
    return_input=False
)

print(res1.data)
#Does not work as it requires API key

UnauthorizedError: status_code: 401, body: type='unauthorized_error' details=None message='Invalid API key. We are unable to map the provided API key to a valid user or organization.' url='https://www.mixedbread.ai/api-reference/authentication'

time: 978 ms (started: 2024-04-04 23:41:17 +02:00)


### Additional stuff - can savely be ignored:

In [98]:
def p_at(qid, p=10, queries=queries, gold=gold, docs=ourdocs):
    """
    takes a query id and returns the precision at 10 for that query
    gold contains the 100 best documents for each query
    """
    query = queries[queries.qid==qid]['query']
    # print(query)
    qv = vectorizer.transform(query) # query vector
    xqv = X*qv.T # cosine similarity between query and all documents
    pred10i = np.argpartition(xqv.A.flat, -p)[-p:] # top 10
    # display(docs.loc[pred10i],gold[gold.qid==qid].docid,docs.loc[pred10i].isin(gold[gold.qid==qid].docid).docid)
    qgold = gold[(gold.qid==qid) & (gold['rank']<=p)].docid
    intersection = np.intersect1d(docs.loc[pred10i].docid, qgold) # intersection of top p and gold
    return len(intersection)/p

print(p_at(251898))
print(p_at(687888))

0.4
0.8
time: 664 ms (started: 2024-04-04 23:43:21 +02:00)


In [99]:
training_at_10 = training_queries.qid.progress_apply(p_at)

  0%|          | 0/500 [00:00<?, ?it/s]

time: 2min 44s (started: 2024-04-04 23:43:27 +02:00)


In [100]:
# how many of the 500 test queries have at least one relevant document in the top 10?
len(training_at_10[training_at_10>0])

430

time: 4.21 ms (started: 2024-04-04 23:47:52 +02:00)


In [101]:
# average number of relevant documents in the top 10
training_at_10[training_at_10>0].mean()

0.2744186046511628

time: 6.24 ms (started: 2024-04-04 23:47:55 +02:00)


In [102]:
training_at_10[training_at_10>0]

0      0.8
1      0.1
2      0.1
3      0.3
4      0.3
      ... 
495    0.1
496    0.2
497    0.5
498    0.2
499    0.6
Name: qid, Length: 430, dtype: float64

time: 10.9 ms (started: 2024-04-04 23:47:57 +02:00)


## Reranking

In [103]:
# %pip install sentence_transformers -U 

time: 574 µs (started: 2024-04-04 23:48:16 +02:00)


In [104]:
from sentence_transformers import CrossEncoder

# Load the model, here we use our base sized model
model = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

time: 1.47 s (started: 2024-04-04 23:48:31 +02:00)


In [105]:
# Example query and documents
query = "Who wrote 'To Kill a Mockingbird'?"
documents = [
    "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.",
    "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.",
    "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.",
    "Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.",
    "The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.",
    "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan."
]

time: 757 µs (started: 2024-04-04 23:48:43 +02:00)


In [106]:
results = model.rank(query, documents, return_documents=True, top_k=3)

time: 140 ms (started: 2024-04-04 23:48:55 +02:00)


In [107]:
results

[{'corpus_id': 0,
  'score': 0.99463475,
  'text': "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature."},
 {'corpus_id': 2,
  'score': 0.98541903,
  'text': "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961."},
 {'corpus_id': 3,
  'score': 0.5803452,
  'text': 'Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.'}]

time: 4.32 ms (started: 2024-04-04 23:48:57 +02:00)
