In [88]:
!pip install python_terrier
!pip install krovetzstemmer

import numpy as np
import math as mt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')


import pyterrier as pt
if not pt.started():
    pt.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA COLLECTING

In [89]:
msmarco_document = pt.datasets.get_dataset("msmarco_document")
msmarcov2_document = pt.datasets.get_dataset("msmarcov2_document")
msmarco_passage = pt.datasets.get_dataset("msmarco_passage")
msmarcov2_passage = pt.datasets.get_dataset("msmarcov2_passage")
vaswani = pt.datasets.get_dataset("vaswani")

# RETRIEVAL

KROVETZSTEMMER 
Its effectiveness is comparable to the Porter stemmer. It has a lower false positive rate, but somewhat higher false negative

In [90]:
from krovetzstemmer import Stemmer
krovetz = Stemmer()
krovetz.stem('utilities') 


'utility'

### Tools

In [91]:
#Check InQuery Stop List
def remove_stopwords(token_text):
    stops = stopwords.words('english')
    new_tokentext = []
    for word in token_text:
        if word not in stops:
            new_tokentext.append(word)
    return new_tokentext

In [92]:
#Majuscule et ponctuation, CHECK NUMBERS et mot d'une lettre
def preproc(coll):
    for docno in coll.keys():
        tmp = remove_stopwords(RegexpTokenizer(r'\w+').tokenize(coll[docno].lower())) # lower + remove punc + remove stopwords
        tmp2 = []
        for j in range(len(tmp)):
            if(len(tmp[j])>1 and not tmp[j].isnumeric() ): # + d'une lettre et pas un nombre
              tmp2.append(krovetz.stem(tmp[j]))
        coll[docno] = tmp2

In [93]:
# 1000 mots les plus fréquents des documents + requetes
def top_vocab(coll, queries, top_terms=1000):
    list_words = []
    for docno in coll.keys():
        list_words += coll[docno]
    
    for q_id in queries.keys():
        list_words += queries[q_id]
            
    return dict(Counter(list_words).most_common(top_terms))

In [94]:
# Reformule les docs/requetes selon les 1000 mots les + fréquents
def truncate(coll, queries, top_terms=1000):
    vocabulary = list(top_vocab(coll, queries, top_terms).keys())
    for docno in coll.keys():
        tmp = coll[docno]
        new_doc = []
        for word in tmp:
            if word in vocabulary:
                new_doc.append(word)
        coll[docno] = new_doc
        
    for q_id in queries.keys():
        tmp = queries[q_id]
        new_query = []
        for word in tmp:
            if word in vocabulary:
                new_query.append(word)
        queries[q_id] = new_query

In [95]:
#Term frequency pour un doc de la collection
def tf_dict(coll, docno):
    tf = dict()
    for word in coll[docno]:
        if word not in tf.keys():
            tf[word] = 1
        else:
            tf[word] += 1
    return tf

In [96]:
#Term frequency pour tous les docs de la collection (dictionnaire de dictionnaire {1 : {mot : 3, ...}} )
def dict_alldocs(coll):
    dictTF_alldocs = dict()
    for docno in coll.keys():
        dictTF_alldocs[docno] = tf_dict(coll, docno)
    return dictTF_alldocs

In [97]:
#Nb mot pour un document
def term_occ(coll, docno):
    return len(coll[docno])

## UNEXPANDED RETRIEVAL

In [98]:
#P_ml, number of times "w" occurs in document D divided by the number of term occurrences in D
def p_ml(dictTF_alldocs, coll, w, docno):
    if w not in coll[docno]:
        return 0
    else:
        return dictTF_alldocs[docno][w] / term_occ(coll, docno)

In [99]:
#Same but for the entire collection
def p_ml_coll(dictTF_alldocs, coll, w):
    sum_tf = 0
    for docno in coll.keys():
        if w in coll[docno]:
            sum_tf += dictTF_alldocs[docno][w]
            
    return sum_tf / sum([term_occ(coll, docno) for docno in coll.keys()])

In [100]:
def smoothing(dictTF_alldocs, coll, w, docno, lamb): # P(w|D)
    return lamb * p_ml(dictTF_alldocs, coll, w, docno) + (1 - lamb) * p_ml_coll(dictTF_alldocs, coll, w)

In [101]:
def dirichlet_smoothing(dictTF_alldocs, coll, w, docno, mu=1000):
    lamb = term_occ(coll, docno) / (term_occ(coll, docno) + mu)
    return smoothing(dictTF_alldocs, coll, w, docno, lamb)

In [102]:
#P(Q|D)
def query_likelihood_retr(dictTF_alldocs, coll, query, docno, prob_func = dirichlet_smoothing): # P(Q|D)
    return np.prod(np.array([prob_func(dictTF_alldocs, coll, term, docno) for term in query]))

# TESTTTSTETSTSTTTTEEESSSTT

In [103]:
# ONLY FOR MINI TESTS ; OFFICIAL DATASET TO USE IS MSMARCO_DOCUMENT
queries_l = vaswani.get_topics()['query'].to_list()
id_queries = vaswani.get_topics()['qid'].to_list()

queries = dict()
i=0
for id_q in id_queries:
    queries[id_q] = queries_l[i]
    i += 1

coll = dict()
for doc in vaswani.get_corpus_iter():
    coll[doc['docno']] = doc['text']

preproc(coll)
preproc(queries)
truncate(coll, queries, top_terms=1000)
dictTF_alldocs = dict_alldocs(coll)

In [104]:
#Meilleur doc parmi les 20 premiers pour la query 1.
l = []
for docno in coll.keys():
  if(int(docno)>20):
    break
  l.append((docno, query_likelihood_retr(dictTF_alldocs, coll, queries['1'], docno)))
l.sort(key=lambda a: a[1])

print(l)


[('8', 1.495074748165239e-19), ('2', 1.5367674491879992e-19), ('16', 1.5367674491879992e-19), ('5', 1.5391070203266714e-19), ('3', 1.5581111077572958e-19), ('9', 1.5602524261486503e-19), ('14', 1.5689098554154208e-19), ('17', 1.5689098554154208e-19), ('18', 1.581730497311069e-19), ('11', 1.5925961214279465e-19), ('13', 1.6018228779665836e-19), ('20', 1.614584288219249e-19), ('1', 1.6355273642495733e-19), ('12', 1.6584473027568297e-19), ('4', 1.6700447902065813e-19), ('15', 1.6935189394043833e-19), ('19', 1.6935189394043833e-19), ('6', 1.7298653826164043e-19), ('7', 2.506528215089573e-19), ('10', 2.515032758261039e-19)]


## 2.3 EXPANDED RETRIEVAL

### 2.3.1 Relevance models

In [105]:
#P(D|Q) = P(Q|D)*P(D)
def bayes(dictTF_alldocs, coll, query, docno, prob_func = dirichlet_smoothing): 
    return query_likelihood_retr(dictTF_alldocs, coll, query, docno, prob_func) * 1/len(coll) # uniformity for P(D), we assume that we have same probs for all documents

In [106]:
print(bayes(dictTF_alldocs, coll, queries['1'], '1'))

1.431032779989127e-23


In [107]:
# P(w|Q) = sum D in R ( P(w|D)*P(D|Q) ) où R est le top50 documents
def relevance_model(dictTF_alldocs, coll, w, query, prob_func= dirichlet_smoothing, lamb = 0.9):
    #50 Meilleurs doc parmi les 200 premiers de la coll pour la query.
    # A voir si on garde ce dict des fréquences ou si on change pour avoir celui du top 50
    score_top50 = []
    for docno in coll.keys():
      if(int(docno)>10):
        break
      score_top50.append((docno, bayes(dictTF_alldocs, coll, query, docno)))
    score_top50.sort(key=lambda a: a[1])
    score_top50 = dict(score_top50[:10])
    top50=dict()
    for d in score_top50:
      top50[d[0]] = coll[d[0]]
    return sum([smoothing(dictTF_alldocs, top50, w, docno, lamb) * bayes(dictTF_alldocs, top50, query, docno) for docno in top50.keys()])


### 2.3.2 Relevance model retrieval

In [108]:
# PAS SUR, A VERIFIER
def rel_mod_retrieval(dictTF_alldocs, coll, query, docno, prob_func = dirichlet_smoothing, lamb = 0.2):  
  model = []
  rel_model = []
  for w in coll[docno]:
    model.append(smoothing(dictTF_alldocs, coll, w, docno, lamb))
    rel_model.append(relevance_model(dictTF_alldocs, coll, w, query))

  #cross entropy
  m = len(model)
  return -1/m * sum([ model[i] * np.log(rel_model[i] + 1e-15) for i in range(m)])



In [109]:
print(rel_mod_retrieval(dictTF_alldocs, coll, queries['1'], '1'))

1.0557408529270536


# 3. EXPANSION PREDICTION TASK

### Tools

In [110]:
def all_P_wQ(vocab, dictTF_top50, top50, query):
    return np.array([relevance_model(dictTF_top50, top50, w,query) for w in vocab])

def P_wColl(term, coll):
    return p_ml_coll(term, coll) # P(w|coll) = frequency of term in the entire collection, as well as P_ML(w|coll)

def all_P_wcoll(vocab, coll):
    return np.array([P_wColl(w, coll) for w in vocab])

## 3.1 CLARITY METHOD

### Weighted clarity scores

In [111]:
def u(w, query, gamma):  # u == func which defines each weight of w
    if w in query:
        return gamma
    else:
        return 1

In [None]:
def clarity(u, dictTF_top50, top50, coll, query, gamma): # u == func which defines each weight of w
    vocabulary = list(top_vocab(coll, {}).keys())
    P_WQ = all_P_wQ(vocabulary,dictTF_top50, top50, query)
    P_WColl = all_P_wcoll(vocabulary, query)
    u_W = np.array([u(w, query, gamma) for w in vocabulary])
    E_AU = np.sum(u_W * P_WQ)
    return np.sum((u_W * P_WQ / E_AU) * np.log2(P_WQ / P_WColl))

In [None]:
from scipy.stats import spearmanr # Rank correlation

gammas = np.logspace(0.1, 1e7, 29)
clarities = np.array([np.array([clarity(u, dictTF_alldocs, coll, coll, query, gamma) for gamma in gammas]) for query in queries])
qldr = np.array([np.array([query_likelihood_retr(query, doc, dirichlet_smoothing) for query in queries]) for doc in coll])

## 3.2 Overlap Method

On prend les n ( = 100 dans l'article) meilleurs documents pour la requête non étendu et les 100 meilleurs pour la requête étendue et on garde l'intersection.
Si jai bien comprus

Pour une requete ce serait :

In [124]:
#1 Meilleurs doc parmi les 2 premiers pour la query 1 avec unexpanded expanded. (POUR TESTS)
unexpanded = []
expanded = []
for docno in coll.keys():
  if(int(docno)>2):
    break
  unexpanded.append((docno, query_likelihood_retr(dictTF_alldocs, coll, queries['1'], docno)))
  expanded.append((docno, rel_mod_retrieval(dictTF_alldocs, coll, queries['1'], docno)))



In [126]:
unexpanded.sort(key=lambda a: a[1])
expanded.sort(key=lambda a: a[1])

expanded = expanded[:1]
unexpanded = unexpanded[:1]

overlap = []
for d1 in unexpanded:
  for d2 in expanded:
    if d1[0] == d2[0]: # On regarde si un doc de expanded est dans unexpanded, si oui on le garde
      overlap.append(d1[0])
      break

print(unexpanded)
print(expanded)
print(overlap)

[('2', 1.5367674491879992e-19)]
[('2', 0.5491576932354653)]
['2']
