# Information Retrieval Notebook 3

In [199]:
#!pip install rank_bm25 spacy Sense2Vec
#!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rank_bm25 import BM25Okapi
import spacy
from sense2vec import Sense2Vec
tqdm.pandas()

time: 14 ms (started: 2024-03-28 17:47:20 +01:00)


In [200]:
# this turns on the autotimer, so that every cell has a timing information below
try:
    %load_ext autotime
except:
    #!pip install ipython-autotime
    %load_ext autotime
# stop using:
# %unload_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.94 ms (started: 2024-03-28 17:47:20 +01:00)


## Getting the best combinations from last time and writing them into files

In [201]:
#origdocs = pd.read_csv('our_msmarco/our.msmarco.docs.tsv',sep='\t',usecols=[1,2,3])
#origdocs['title'].fillna('-', inplace=True)
#origdocs['body'].fillna('-', inplace=True)
#origdocs

time: 797 µs (started: 2024-03-28 17:47:20 +01:00)


In [202]:
#docs = pd.DataFrame(columns = ['docid', 'text'])
#docs['docid']=origdocs.docid
#docs['text']=origdocs.title+' '+origdocs.body
#docs

time: 710 µs (started: 2024-03-28 17:47:21 +01:00)


In [203]:
#del origdocs # saving memory

time: 521 µs (started: 2024-03-28 17:47:21 +01:00)


In [204]:
#docs.to_csv('our.text.msmarco.docs.tsv',sep='\t', columns=['docid','text'])

time: 608 µs (started: 2024-03-28 17:47:21 +01:00)


## Reading back in just for checking the files - or for restarting here

In [205]:
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rank_bm25 import BM25Okapi

tqdm.pandas()

time: 4.85 ms (started: 2024-03-28 17:47:21 +01:00)


In [206]:
# this is a different doc, no longer distinguishing title and body
docs = pd.read_csv("/bigstorage/pavlo/testing/our.text.msmarco.docs_2.tsv",sep='\t',usecols=[1,2]) 
docs

Unnamed: 0,docid,text
0,D2981241,What do you call a group of lions? Lions Vocab...
1,D687756,". The A Priori Argument ( also, Rationalizatio..."
2,D913099,Everything You Need To Learn How To Cook Veget...
3,D328017,"What is the difference between latitude, longi..."
4,D1636347,When was the pulley invented? Answers.com ® Wi...
...,...,...
92560,D3379210,Top 39 Doctor insights on: Can An Iud Cause Ha...
92561,D3068739,How to get back your DirecTV cancellation fees...
92562,D1590402,Certification FAQs Fingerprinting 1. Where can...
92563,D2175490,Greenhouse gas emissions by Canadian economic ...


time: 15.1 s (started: 2024-03-28 17:47:21 +01:00)


### Pre-tokenization for BM25

In [207]:
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode')
tokenized_corpus = docs.text.progress_apply(vectorizer.build_analyzer())

  0%|          | 0/92565 [00:00<?, ?it/s]

time: 3min 5s (started: 2024-03-28 17:47:37 +01:00)


### Training & Testing Queries

In [208]:
# use only col 1 if you have memory problems and do BM25 only
queries = pd.read_csv('/bigstorage/pavlo/testing/our.msmarco.queries.tsv',sep='\t',usecols=[1,2]) 
training_queries=queries.iloc[:500]
testing_queries=queries.iloc[500:]
training_queries

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
...,...,...
495,133970,definition of dietary fiber
496,79788,can you start up a video record?
497,791583,what is rheumatoid spondylosis
498,732078,what is coleman fuel made out of


time: 21.5 ms (started: 2024-03-28 17:50:42 +01:00)


### Gold

In [209]:
gold = pd.read_csv('/bigstorage/pavlo/testing/our.msmarco.gold.tsv',sep='\t',usecols=[1,3,4,5])
gold

Unnamed: 0,qid,docid,rank,score
0,310290,D579750,1,-5.11498
1,310290,D579754,2,-5.57703
2,310290,D2380815,3,-5.84852
3,310290,D822566,4,-5.95002
4,310290,D2249695,5,-6.08326
...,...,...,...,...
99995,257942,D253854,96,-6.32693
99996,257942,D3056621,97,-6.32837
99997,257942,D1323491,98,-6.32871
99998,257942,D2722485,99,-6.33100


time: 80.1 ms (started: 2024-03-28 17:50:42 +01:00)


## Redoing the vectorization for my two best results

### 🚧 Todo:

#### Use TfidfVectorizer, BM25Okapi, and our own BM25 function to measure whether there are significant differences.

#### TF-IDF

In [210]:
def pAt10(qid):
    query = queries[queries.qid==qid]['query']
    qv = vectorizer.transform(query)
    xqv = X*qv.T
    pred10i = np.argpartition(xqv.A.flat, -10)[-10:]
    intersection = np.intersect1d(docs.loc[pred10i].docid,gold[gold.qid==qid].docid)
    return len(intersection)/10

time: 665 µs (started: 2024-03-28 17:50:43 +01:00)


In [211]:
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode')
X = vectorizer.fit_transform(docs.text)
print(len(vectorizer.get_feature_names_out()),'features, for example',vectorizer.get_feature_names_out()[44444:44449])
tfidfresults = training_queries.qid.progress_apply(pAt10)
tfidfresults.mean()

2067446 features, for example ['0highest' '0highs' '0highways' '0hihow' '0hill']


  0%|          | 0/500 [00:00<?, ?it/s]

0.9348

time: 7min 8s (started: 2024-03-28 17:50:43 +01:00)


#### BM25Okapi

In [212]:
def pAt10Bm25(qid):
    tquery = queries[queries.qid==qid]['query'].apply(vectorizer.build_analyzer())
    doc_scores = bm25.get_scores(tquery.tolist()[0])
    pred10i = np.argpartition(doc_scores, -10)[-10:]
    intersection = np.intersect1d(docs.loc[pred10i].docid,gold[gold.qid==qid].docid)
    return len(intersection)/10

time: 9.2 ms (started: 2024-03-28 17:57:52 +01:00)


In [213]:
bm25 = BM25Okapi(tokenized_corpus)
bm25results_Okapi = training_queries.qid.progress_apply(pAt10Bm25)
bm25results_Okapi.mean()

  0%|          | 0/500 [00:00<?, ?it/s]

0.9848

time: 3min 32s (started: 2024-03-28 17:57:52 +01:00)


#### BM25

In [214]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import gc

class BM25Score(BaseEstimator, ClassifierMixin):
    def __init__(self, k1=1.5, b=0.75): # vectorized_docs: word counts
        self.k1 = k1
        self.b = b
        # self.vectorized_docs will be set in the fit method


    def fit(self, X, y=None):
        self.vectorized_docs = X
        self.n_d = self.vectorized_docs.sum(axis=1).A.flatten()
        self.avgdl = np.mean(self.n_d)
        self.n_docs = self.vectorized_docs.shape[0]

        # Calculate IDF
        document_frequency = (self.vectorized_docs > 0).sum(axis=0).A.flatten()
        self.idf = np.log((self.n_docs - document_frequency + 0.5) / (document_frequency + 0.5) + 1)
        return self

    def predict(self, X):
        # bigger batches => faster but more memory heavy
        n_queries = X.shape[0]
        batch_size = 50
        final_scores = []

        for batch_idx in range(0, n_queries, batch_size):
            end_idx = min(batch_idx + batch_size, n_queries)
            batch_vectorized_queries = X[batch_idx:end_idx]
            idx_tokens = np.flatnonzero(batch_vectorized_queries.sum(axis=0))
            batch_vectorized_queries = batch_vectorized_queries[:, idx_tokens]
            vectorized_docs = self.vectorized_docs[:, idx_tokens].toarray()
            idf = self.idf[idx_tokens]

            scores = idf * (
                (vectorized_docs * (self.k1 + 1)) /
                (vectorized_docs + self.k1 * (1 - self.b + self.b * (self.n_d[:, None] / self.avgdl)))
            )
            final_scores.append(((batch_vectorized_queries > 0).astype(np.float32) @ scores.T))

            del vectorized_docs, scores
            gc.collect()

        return np.concatenate(final_scores, axis=0)


time: 4.1 ms (started: 2024-03-28 18:01:24 +01:00)


In [215]:
def evaluate_model_bm25_optimized(scores, docs, queries, gold_standard):
    # Assuming 'scores' is a 2D array: rows are queries, columns are documents.
    p_at_10_scores = []
    
    for idx, row in enumerate(queries.itertuples()):
        query_id = getattr(row, 'qid')
        
        # Get scores for the current query
        query_scores = scores[idx]
        
        # Efficiently find the indices of the top 10 scores for this query
        top10_indices = np.argpartition(query_scores, -10)[-10:]
        top10_indices_sorted = top10_indices[np.argsort(query_scores[top10_indices])[::-1]]
        
        # Assuming doc_ids is a list of document IDs that aligns with columns in 'scores'
        top10_docids = [docs.iloc[idx]['docid'] for idx in top10_indices_sorted]
        
        # Get the relevant docids for the query_id from gold_standard
        relevant_docids = gold_standard[gold_standard['qid'] == query_id]['docid'].values
        
        # Calculate P@10 for the query
        relevant_in_top_10 = np.intersect1d(top10_docids, relevant_docids)
        p_at_10 = len(relevant_in_top_10) / 10.0
        p_at_10_scores.append(p_at_10)
    
    # Calculate the average P@10
    average_p_at_10 = np.mean(p_at_10_scores)
    return average_p_at_10

time: 4.31 ms (started: 2024-03-28 18:01:24 +01:00)


In [216]:
# Vectorizing the text
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', binary=True, use_idf=False, norm=None)
X_docs = vectorizer.fit_transform(docs.text)

time: 4min (started: 2024-03-28 18:01:25 +01:00)


In [217]:
# Instantiation of BM25Score + fitting + prediction
bm25 = BM25Score()
bm25.fit(X_docs)

time: 7.04 s (started: 2024-03-28 18:05:25 +01:00)


In [218]:
queries["query"]

0                                          what is a jpe
1                             price for asphalt driveway
2                     what causes pressure skin bruising
3          how long drive from flagstaff to grand canyon
4      average number of bowel movements per day for ...
                             ...                        
995                              cell voltage mv meaning
996                                   what an ip address
997                       what is daily max citizens atm
998        do i need a florida commercial driver license
999                    what does tv on cameras stand for
Name: query, Length: 1000, dtype: object

time: 14.3 ms (started: 2024-03-28 18:05:32 +01:00)


In [219]:
# Transforming queries
X_queries = vectorizer.transform(queries["query"])

time: 11.7 ms (started: 2024-03-28 18:05:32 +01:00)


In [220]:
# Predict the scores for the queries
bm25_scores = bm25.predict(X_queries)

time: 1min 31s (started: 2024-03-28 18:05:33 +01:00)


In [221]:
# Compute the mean P@10 score
mean_p_at_10 = evaluate_model_bm25_optimized(bm25_scores, docs, training_queries, gold)
print(f"Mean BM25 P@10: {mean_p_at_10}")

Mean BM25 P@10: 0.8626
time: 832 ms (started: 2024-03-28 18:07:05 +01:00)


### Conclusion - Results

The p@10 performances are:

- TF-IDF: 93.48
- BM25Okapi: 98.48
- BM25: 86.26

# 🔎 Manual Error Mining

- Let's look at where things go wrong

### 🚧 Todo:
- What's the lowest p@10 we got? 

- What's the 10 questions that got the worst score, from worst to slightly better?

#### Lowest p@10

In [222]:
lowest_p_at_10 = bm25results_Okapi.min()
print('Lowest p@10:',lowest_p_at_10)

Lowest p@10: 0.1
time: 827 µs (started: 2024-03-28 18:07:06 +01:00)


#### Top 10 worst questions - TF-IDF

In [223]:
tfidfresults

0      1.0
1      1.0
2      0.9
3      1.0
4      0.7
      ... 
495    1.0
496    1.0
497    1.0
498    1.0
499    0.8
Name: qid, Length: 500, dtype: float64

time: 5.86 ms (started: 2024-03-28 18:07:06 +01:00)


In [224]:
worst10tfidf = tfidfresults.sort_values(ascending=True).head(10)
wors10tfidf_index = worst10tfidf.index
training_queries.loc[wors10tfidf_index]

Unnamed: 0,qid,query
285,127145,define skin doctor
172,729561,what is channeling
65,71027,can lyme disease cause coughing
37,417380,is mark applier?
270,424296,is spain bigger than italy
335,850892,what is the the bug std
21,1049686,who sang almost paradise
256,99399,cooking time for roasted beef short ribs
300,903134,what tests or procedures do they have for chec...
92,393188,in a democracy the idea of the consent of the ...


time: 16.4 ms (started: 2024-03-28 18:07:06 +01:00)


#### Top 10 worst questions - BM25Okapi

In [225]:
bm25results_Okapi

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
495    1.0
496    1.0
497    1.0
498    1.0
499    1.0
Name: qid, Length: 500, dtype: float64

time: 6.91 ms (started: 2024-03-28 18:07:06 +01:00)


In [226]:
worst10bm25Okapi = bm25results.sort_values(ascending=True).head(10)
worst10bm25Okapi_index = worst10bm25Okapi.index
training_queries.loc[worst10bm25Okapi_index]

Unnamed: 0,qid,query
172,729561,what is channeling
398,195543,goes how long cost and valid
37,417380,is mark applier?
308,651254,what does the name connor mean
285,127145,define skin doctor
443,11059,actresses who died of lung cancer
167,859358,what is uplifting in geography
184,1078582,wired definition
456,868366,what kind of food should heart attached person
455,567103,what are the airports in south virginia


time: 7.58 ms (started: 2024-03-28 18:07:07 +01:00)


#### Top 10 worst questions - BM25

In [227]:
bm25_scores.shape

(1000, 92565)

time: 3.41 ms (started: 2024-03-28 18:07:07 +01:00)


In [228]:
queries

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
...,...,...
995,89597,cell voltage mv meaning
996,1167043,what an ip address
997,737304,what is daily max citizens atm
998,156934,do i need a florida commercial driver license


time: 16.3 ms (started: 2024-03-28 18:07:07 +01:00)


In [229]:
def pred_docs(qid):
    # Get scores for the current query
    idx = queries.loc[queries.qid == qid].index[0]
    query_scores = bm25_scores[idx]
    
    # Efficiently find the indices of the top 10 scores for this query
    pred_indices = np.argpartition(query_scores, -10)    
    pred_indices_sorted = pred_indices[np.argsort(query_scores[pred_indices])[::-1]]
    
    # Extracting docids
    pred_docids = [docs.iloc[idx]['docid'] for idx in pred_indices_sorted]
    return pred_docids


time: 10.7 ms (started: 2024-03-28 18:07:07 +01:00)


In [230]:
def list_scores_BM25(bm25_scores, docs, training_queries, gold):

    # Function to extract the list with top 10 scores for the BM25 model, not the mean of them
    p_at_10_scores = []
        
    for idx, row in enumerate(training_queries.itertuples()):
        query_id = getattr(row, 'qid')
        
        # Get scores for the current query
        query_scores = bm25_scores[idx]
        
        # Efficiently find the indices of the top 10 scores for this query
        top10_indices = np.argpartition(query_scores, -10)[-10:]
        top10_indices_sorted = top10_indices[np.argsort(query_scores[top10_indices])[::-1]]
        
        # Extracting docids
        top10_docids = [docs.iloc[idx]['docid'] for idx in top10_indices_sorted]
        
        # Get the relevant docids for the query_id from gold_standard
        relevant_docids = gold[gold['qid'] == query_id]['docid'].values
        
        # Calculate P@10 for the query
        relevant_in_top_10 = np.intersect1d(top10_docids, relevant_docids)
        p_at_10 = len(relevant_in_top_10) / 10.0
        p_at_10_scores.append(p_at_10)

    return p_at_10_scores

p_at_10_scores = list_scores_BM25(bm25_scores, docs, training_queries, gold)

time: 887 ms (started: 2024-03-28 18:07:08 +01:00)


In [231]:
# Sort scores in ascending order and get the indices of the 10 worst scores
worst10bm25_index = np.argsort(p_at_10_scores)[:10]

print(worst10bm25_index)

[486  37 285 289 172 242 155 443 270 398]
time: 706 µs (started: 2024-03-28 18:07:09 +01:00)


In [232]:
training_queries.iloc[worst10bm25_index]

Unnamed: 0,qid,query
486,401719,is aids a disease
37,417380,is mark applier?
285,127145,define skin doctor
289,743838,what is endolymhatic system
172,729561,what is channeling
242,601391,what could cause pain in your liver
155,888314,what pressure
443,11059,actresses who died of lung cancer
270,424296,is spain bigger than italy
398,195543,goes how long cost and valid


time: 12 ms (started: 2024-03-28 18:07:09 +01:00)


### 🚧 Todo:
- Write a function showDoc that takes qid, rank, and predicted as parameters
    - If predicted=True, shows the predicted doc of rank to the query qid
    - If predicted=False, shows the gold doc
    - Prints the first 999 characters of the texts
- For the worst query
    - Look at the 10 best gold vs 10 best predicted 
    - Hypothetize why the results are so bad for the worst query

#### ShowDoc - BM25

In [233]:
def showDoc(qid, rank, predicted=False):
    if predicted:
        predicted_docs = pred_docs(qid)
        relevant_doc = predicted_docs[rank-1]
        text = docs[docs.docid == relevant_doc]["text"].values[0][:999]
        return text
    else:
        # Filtering according to qid and rank and extracting docid
        gold_docid = gold[(gold['qid'] == qid) & (gold['rank'] == rank)]["docid"].values[0]
        text = docs[docs["docid"] == gold_docid]["text"].values[0][:999]
        return text

print("Gold:\n", showDoc(729561,1))
print("\nPredicted:\n", showDoc(729561,2, predicted=True))

Gold:
 What channel is NBC on XFINITY? Comcast Xfinity (product) NBC Comcast Products and Services TV Channels Television What channel is NBC on XFINITY?3 Answers Greg Monti, Radio broadcast engineer fascinated by over-the-air television Answered Jun 4, 2016 · Upvoted by Ryan Ace, XFINITY Sales Professional at Comcast There is no standard channel number on which NBC is carried on Xfinity cable systems. Why? Because NBC is a television network transmitted by a local affiliate station in each market, much like ABC, CBS, Fox, Ion, My Network TV, PBS, TBN, Telemundo, The CW and Univision. NBC is technically neither a basic nor a premium cable channel. It is up to the local, over-the-air NBC affiiate stations and the cable companies that serve its broadcast area how the channel will be carried. Usually, the cable operator is either forced (by a must-carry FCC rule) or negotiates to carry the station on the cable channel number that most closely matches the virtual over-the-air channel that 

#### Worst Query

In [234]:
worst_query = training_queries.loc[worst10bm25Okapi_index].iloc[0]
worst_query_id = worst_query['qid']
worst_query_best_gold_docs = gold[gold['qid'] == worst_query_id].sort_values('rank').head(10)

print("-------------------------------------------------")
print("Worst Query:")
print("\n", worst_query)
print("-------------------------------------------------")
print("\n")
print("-------------------------------------------------")
print("Best Gold Documents:")
for i in range(10):
    print(f"\nRank {i+1}:")
    print(showDoc(729561, i+1))
print("-------------------------------------------------")
print("\n")
print("-------------------------------------------------")
print("Best Predicted Documents:")
for i in range(10):
    print(f"\nRank {i+1}:")
    print(showDoc(729561, i+1, predicted=True))
print("-------------------------------------------------")

-------------------------------------------------
Worst Query:

 qid                  729561
query    what is channeling
Name: 172, dtype: object
-------------------------------------------------


-------------------------------------------------
Best Gold Documents:

Rank 1:
What channel is NBC on XFINITY? Comcast Xfinity (product) NBC Comcast Products and Services TV Channels Television What channel is NBC on XFINITY?3 Answers Greg Monti, Radio broadcast engineer fascinated by over-the-air television Answered Jun 4, 2016 · Upvoted by Ryan Ace, XFINITY Sales Professional at Comcast There is no standard channel number on which NBC is carried on Xfinity cable systems. Why? Because NBC is a television network transmitted by a local affiliate station in each market, much like ABC, CBS, Fox, Ion, My Network TV, PBS, TBN, Telemundo, The CW and Univision. NBC is technically neither a basic nor a premium cable channel. It is up to the local, over-the-air NBC affiiate stations and the cable c

. Arc Resistant Switchgear ANSI/IEEE C37.20.7 defines switchgear arc resistance in two basic categories: ANSI type 1 Arc resistance from the front of gear only ANSI type 2 Arc resistance provided from the front, sides and rear A suffix may be added to either of these two types to further define the type of protection provided: A: Basic design B: Arc resistance is maintained even while opening designated low voltage compartments C: Arc resistance is maintained even when opening designated adjacent compartments D: Special designation that supplements the Type 1 designation, but identifies additional arc resistance in certain structures Eaton offers arc resistant medium voltage (MV),and low voltage (LV) switchgear and arc resistant MV motor control. Eaton also offers a unique “arc preventative” LV motor control center (Flash Gard) that protects the operator by reducing the likelihood that an arc would occur. Arc Resistant MV Switchgear Eaton's VCP-W arc-resistant vacuum switchgear (5 k V


#### Conclusion

The poor performance for the given query "what is channeling" with the BM25Okapi model, could be due to several factors:

1. **Ambiguity of the Query**: The term "channeling" can have multiple meanings. It could refer to spiritualism (communicating with spirits), broadcasting (TV channels), or even diverting water in hydrology. If the documents in the corpus address different meanings, this can dilute the relevance of results.

2. **Lack of Context**: Without additional context or terms, the model may struggle to understand the intent behind the query, leading to a broad range of results that may not align with user expectations.

3. **Vocabulary Mismatch**: If the language used in relevant documents is vastly different from the query terms (synonyms, technical jargon, etc.), the model might not rank these documents highly, even if they are topically relevant.

4. **Document Quality and Content**: If the corpus includes many documents with high keyword overlap but low-quality content (such as keyword-stuffed pages), the model could erroneously give these documents a higher score over more relevant ones.

5. **Model Parameters**: The BM25Okapi model's efficiency also depends on its tuning parameters. The k1 and b parameters influence term frequency and document length normalization, which, if not optimized for the corpus and query types, can result in less accurate scoring.

Improvements might involve refining the model with more appropriate parameter values or enhancing the query preprocessing with NLP techniques to better capture user intent.

### 🚧 Todo: Can we characterize these difficult cases?
- Do they have specicific problems?
- Do we know when we are doing badly?
    - Are the distances between query vector and the best documents bigger than average?
    

#### Specific Problems:

As previously mentioned:

- Vagueness/Ambiguity of Query: When a search term has several potential interpretations, this can confuse the search mechanism, yielding a spread of results that may not be pertinent.
- Insufficient Detail/Lack of Context: A query that doesn't provide enough specific information can pull up a scattered selection of documents, diluting the overall relevance.
- Indexing and Content Quality Issues: If documents are not indexed properly or the content quality is poor, it could hinder the search system's ability to identify and rank the most pertinent documents effectively.

#### Identifying Poor Performance:

- Evaluating Vector Space Distances: In models that utilize vector spaces, such as those involving TF-IDF, BM25Okapi and BM25 or word embeddings, the comparison of distances between vectors can reveal important information. When the distance between a query's vector and the vectors of documents it retrieves is found to be larger than the usual, this may suggest the model's difficulty in locating documents that are closely related to the query.

# 🚀 Spacy

- Look at https://github.com/explosion/sense2vec/blob/master/README.md

In [235]:
import spacy
#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg') # or the smaller md model!!!

time: 5.04 s (started: 2024-03-28 18:07:33 +01:00)


### 🚧 Todo:
- Explain what's going on here:

In [236]:
sent1 = nlp("I am happy")
sent2 = nlp("I am sad")
sent3 = nlp("I am joyful")
sent1.similarity(sent2), sent1.similarity(sent3)

(0.9740256438192407, 0.9844193900834773)

time: 45.1 ms (started: 2024-03-28 18:07:38 +01:00)


#### Explanation

- We are using the natural language processing library Spacy to compare the semantic similarity of sentences. Three sentences have been processed to create document objects, and the semantic similarity between the first sentence ("I am happy") and the other two sentences ("I am sad", "I am joyful") is being calculated.

- The similarity method computes how semantically similar two objects are (in this case, sentences). The similarity is a number between 0 and 1, where 1 means exactly the same and 0 means completely different. The similarity scores obtained indicate that "I am happy" is more similar to "I am joyful" (0.984) than to "I am sad,"(0.974) which makes intuitive sense since "happy" and "joyful" are synonyms, while "sad" is an antonym of "happy."

### Let's try sense2vec

- Depending on your machine, download one of the two versions of sense2vec from https://github.com/explosion/sense2vec/blob/master/README.md
  - s2v_reddit_2019_lg 	4 GB 	Reddit comments 2019 (01-07) 	part 1, part 2, part 3
      - cat s2v_reddit_2019_lg.tar.gz.* > s2v_reddit_2019_lg.tar.gz
  - s2v_reddit_2015_md 	573 MB 	Reddit comments 2015 	part 1
- Unzip
- Try it, and understand what's going on:


In [237]:
s2v = Sense2Vec().from_disk("/bigstorage/pavlo/testing/s2v_reddit_2015_md")

time: 17.7 s (started: 2024-03-28 18:07:38 +01:00)


In [238]:
seeds = "natural language processing, machine learning, artificial intelligence".split(',')
seed_keys = [s2v.get_best_sense(seed.strip()) for seed in seeds]
seed_keys

['natural_language_processing|NOUN',
 'machine_learning|NOUN',
 'artificial_intelligence|NOUN']

time: 7.53 ms (started: 2024-03-28 18:07:56 +01:00)


In [239]:
most_similar = s2v.most_similar(seed_keys, n=10)
most_similar

[('deep_learning|NOUN', 0.909),
 ('computer_vision|NOUN', 0.9051),
 ('neural_nets|NOUN', 0.8968),
 ('neural_networks|NOUN', 0.8784),
 ('Machine_learning|NOUN', 0.8629),
 ('genetic_algorithms|NOUN', 0.8488),
 ('data_analysis|NOUN', 0.8483),
 ('big_data|NOUN', 0.8476),
 ('complexity_theory|NOUN', 0.8428),
 ('data_science|NOUN', 0.8413)]

time: 2 s (started: 2024-03-28 18:07:56 +01:00)


### 🚧 Todo: What is it that you couldn't do in Word2Vec?

#### One line Answer:

- Just one line of answer.
- Answer: Sense2Vec differentiates the meanings of words that have multiple interpretations by tagging each word with its corresponding grammatical role.

#### Detailed Answer

Word2Vec and Sense2Vec are both tools for creating word embeddings, but they approach the representation of words differently. The 2 main differences to highlight are the following:

- Distinguish Between Different Senses of a Word: Word2Vec typically treats each word as a single entity without distinguishing between different meanings. For example, the word "bank" would have the same embedding whether it's used in the context of a financial institution or the land alongside a river. Sense2Vec, however, can distinguish between "bank|NOUN" and "bank|VERB", providing different embeddings for each sense, which allows for more nuanced understanding and usage.

- Contextual Similarity: Word2Vec embeddings are static and do not change based on sentence context. Sense2Vec leverages part-of-speech tags and syntactic dependencies which can give a better sense of how words are used in different contexts, thus providing a contextual similarity that Word2Vec does not.

### 🚧 Todo:
- Try also the following functions: 
    - Similarity, get_other_senses, get_freq, s2v[query]
    - Most_similar is very slow. check this to speed things up (optional): https://towardsdatascience.com/how-to-build-a-fast-most-similar-words-method-in-spacy-32ed104fe498

#### Similarity Score

In [240]:
similarity_score = s2v.similarity("natural_language_processing|NOUN", "machine_learning|NOUN")
print(f"Similarity: {similarity_score}")

Similarity: 0.8986966013908386
time: 641 µs (started: 2024-03-28 18:07:58 +01:00)


#### Other Senses

In [241]:
other_senses = s2v.get_other_senses("goal|NOUN")
print(f"Other senses: {other_senses}")

Other senses: ['GOAL|ORG', 'GOAL|VERB', 'GOAL|PERSON', 'Goal|PRODUCT', 'Goal|ORG', 'Goal|VERB', 'Goal|PERSON', 'Goal|ADJ', 'goal|VERB']
time: 11.4 ms (started: 2024-03-28 18:07:59 +01:00)


#### Frequency

In [242]:
frequency = s2v.get_freq("artificial_intelligence|NOUN")
print(f"Frequency: {frequency}")

Frequency: 2586
time: 1.19 ms (started: 2024-03-28 18:07:59 +01:00)


#### Vector

In [243]:
vector = s2v["machine_learning|NOUN"]
print(f"Vector: {vector}")

Vector: [-0.05336883  0.12930974 -0.71271354 -0.5335285   0.41187796 -0.41128084
  0.56169933  0.16269535  0.13582484 -0.2628744   0.04485684  0.26824757
 -0.07436837 -0.14284399 -0.13258563 -0.39240766  0.15834026  0.12557378
  0.44947657 -0.0558163   0.28013074  0.05871433 -0.2624601  -0.38285995
 -0.12845571  0.17524104 -0.3379021  -0.40019804 -0.38391396 -0.02481981
  0.1439937   0.3706888  -0.02673293 -0.32380423 -0.07437262 -0.51619464
 -0.11755529  0.52363133 -0.21070772 -0.25421444 -0.12879154  0.07283065
  0.11947857  0.16317505  0.05901384  0.02164277 -0.21194217 -0.18807572
 -0.30689004  0.39276177  0.19204976 -0.44071558  0.28767404 -0.30080068
 -0.20588249  0.06826613  0.37439212  0.10536108 -0.12716247  0.3068472
  0.19814798 -0.14777422  0.14253798  0.23994699 -0.1717181  -0.5000647
 -0.7780123  -0.23943943 -0.26456913 -0.25027943  0.10723579 -0.04512829
  0.02671034 -0.10834537  0.17228423 -0.13542852 -0.20428821 -0.5055861
 -0.11388295  0.42077857 -0.43394044 -0.062427

### 🚧 Todo:
- Try whether expanding your query by adding similar terms to the 10 worst queries improves the results


In [244]:
analyzer = vectorizer.build_analyzer()

augmented_queries = []
for query in training_queries.loc[worst10bm25Okapi_index]['query']:
    query_tokens = analyzer(query)
    augmented_query_tokens = []
    for token in query_tokens:
        token_key = s2v.get_best_sense(token)
        if token_key is not None:
            similar_tokens = s2v.most_similar(token_key, n=3)
            augmented_query_tokens.extend([similar_token[0].split("|")[0] for similar_token in similar_tokens])
    augmented_query = " ".join(augmented_query_tokens)
    augmented_queries.append(augmented_query)

time: 1min 14s (started: 2024-03-28 18:07:59 +01:00)


#### Previous Queries

In [245]:
training_queries.loc[worst10bm25Okapi_index]

Unnamed: 0,qid,query
172,729561,what is channeling
398,195543,goes how long cost and valid
37,417380,is mark applier?
308,651254,what does the name connor mean
285,127145,define skin doctor
443,11059,actresses who died of lung cancer
167,859358,what is uplifting in geography
184,1078582,wired definition
456,868366,what kind of food should heart attached person
455,567103,what are the airports in south virginia


time: 11.5 ms (started: 2024-03-28 18:09:14 +01:00)


#### Augmented Queries

In [246]:
aug_queries = pd.DataFrame(augmented_queries, columns=["query"])
qid_index = training_queries.loc[worst10bm25Okapi_index].qid.values
augmented_queries_df = pd.DataFrame({
    "qid": qid_index,
    "query": augmented_queries
})

augmented_queries_df

Unnamed: 0,qid,query
0,729561,just_what even_what exactly_what 's seems obvi...
1,195543,comes turns puts what really actually long as ...
2,417380,'s seems obviously chosen replies Serious
3,651254,just_what even_what exactly_what does't n't me...
4,127145,defining redefine defined normal_skin scalp su...
5,11059,actors/actresses actors male_actors only_peopl...
6,859358,just_what even_what exactly_what 's seems obvi...
7,1078582,hardwired hardwire rewired defintion exact_def...
8,868366,just_what even_what exactly_what sort type oth...
9,567103,just_what even_what exactly_what those arn't o...


time: 22.8 ms (started: 2024-03-28 18:09:15 +01:00)


In [247]:
updated_queries = training_queries.copy(deep=True)
query_ids = list(worst10bm25Okapi_index)
id_to_new_text = dict(zip(query_ids, augmented_queries))
updated_queries.loc[query_ids, 'query'] = updated_queries.loc[query_ids].index.map(id_to_new_text)
updated_queries

Unnamed: 0,qid,query
0,687888,what is a jpe
1,480210,price for asphalt driveway
2,591004,what causes pressure skin bruising
3,260536,how long drive from flagstaff to grand canyon
4,39422,average number of bowel movements per day for ...
...,...,...
495,133970,definition of dietary fiber
496,79788,can you start up a video record?
497,791583,what is rheumatoid spondylosis
498,732078,what is coleman fuel made out of


time: 26 ms (started: 2024-03-28 18:09:15 +01:00)


#### New Results

In [248]:
bm25 = BM25Okapi(tokenized_corpus)
bm25results_Okapi = updated_queries.qid.progress_apply(pAt10Bm25)
bm25results_Okapi.mean()

  0%|          | 0/500 [00:00<?, ?it/s]

0.9848

time: 3min 21s (started: 2024-03-28 18:09:15 +01:00)


#### Conclusion

Unfortunately, the results don't seem to improve, maybe it should be considered further preprocessing and different tokenization.

### 🚧 Todo:
- Try misspelling a word and see whether you can fix that with sense2vec


#### Misspelling example 1

In [249]:
misspelled_word1 = "juputer"
correct_word1 = s2v.get_best_sense(misspelled_word1)
print(f"Corrected word: {correct_word1}")

Corrected word: None
time: 3.69 ms (started: 2024-03-28 18:12:37 +01:00)


#### Mispelling example 2

In [250]:
misspelled_word1 = "helo"
correct_word1 = s2v.get_best_sense(misspelled_word1)
print(f"Corrected word: {correct_word1}")

Corrected word: helo|NOUN
time: 6.23 ms (started: 2024-03-28 18:12:37 +01:00)


### 🚧 Todo:
- Try embeddings for a few queries (all would take too long except if you have a GPU)
    - Are the gold top 10 similar to the query itself?
    - Check whether the gold top 10 answers for our most difficult question are really closer to the question than the currently predicted top10
         - How to get every doc as a vector: 
             - https://spacy.io/api/doc#vector "A real-valued meaning representation. Defaults to an average of the token vectors."
        - Every doc has a similarity function taking another doc as argument: 
            - https://spacy.io/api/doc#similarity

In [252]:
selected_queries = training_queries.iloc[180:190]

for _,query_row in selected_queries.iterrows():
    query_text = query_row['query']
    query_id = query_row['qid']
    query_doc = nlp(query_text)

    # Calculate similarity with gold top 10 documents
    gold_top10_docs = gold[gold['qid'] == query_id].sort_values('rank').head(10)
    gold_similarities = []
    for _,doc in gold_top10_docs.iterrows():
        doc_text = docs[docs.docid == doc.docid].text.values[0]
        doc_doc = nlp(doc_text)
        similarity_score = query_doc.similarity(doc_doc)
        gold_similarities.append(similarity_score)

    tquery = queries[queries.qid==query_id]['query'].apply(analyzer)
    doc_scores = bm25.get_scores(tquery.tolist()[0])
    predicted_top10_docs = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:10]
    predicted_similarities = []
    
    for idx in predicted_top10_docs:
        doc_text = docs['text'].iloc[idx]
        doc_doc = nlp(doc_text)
        similarity_score = query_doc.similarity(doc_doc)
        predicted_similarities.append(similarity_score)

    print("------------------------------------------------------")
    print(f"Query ID: {query_id}")
    print(f"\nGold Similarities: {gold_similarities}")
    print(f"\nPredicted Similarities: {predicted_similarities}")
    print("------------------------------------------------------")


------------------------------------------------------
Query ID: 333029

Gold Similarities: [0.7988719456653133, 0.8074835170757475, 0.8061850299920799, 0.8281570546926511, 0.7970559246407134, 0.8460853807976707, 0.8088189764537648, 0.8221287150061221, 0.7968105130911767, 0.7717767822698696]

Predicted Similarities: [0.7988719456653133, 0.7717767822698696, 0.7679191089382106, 0.8460853807976707, 0.8221287150061221, 0.7377280067248264, 0.7652416593718304, 0.8095978050844249, 0.8181364722218312, 0.7694116320314736]
------------------------------------------------------
------------------------------------------------------
Query ID: 260773

Gold Similarities: [0.7408109784683735, 0.7408109784683735, 0.7521725627572964, 0.7336286553121919, 0.7543014738153314, 0.7542932695831854, 0.780779705913727, 0.7935853926474039, 0.7838247672017876, 0.7641256643339809]

Predicted Similarities: [0.7408109784683735, 0.7408109784683735, 0.7521725627572964, 0.7543014738153314, 0.7336286553121919, 0.754293