In [1]:
#!pip install python_terrier
#!pip install krovetzstemmer

import numpy as np
import math as mt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd

#nltk.download('stopwords')


import pyterrier as pt
if not pt.started():
    pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



# DATA COLLECTING

In [3]:
msmarco_document = pt.datasets.get_dataset("msmarco_document")
msmarcov2_document = pt.datasets.get_dataset("msmarcov2_document")
msmarco_passage = pt.datasets.get_dataset("msmarco_passage")
msmarcov2_passage = pt.datasets.get_dataset("msmarcov2_passage")
vaswani = pt.datasets.get_dataset("vaswani")

# RETRIEVAL

KROVETZSTEMMER 
Its effectiveness is comparable to the Porter stemmer. It has a lower false positive rate, but somewhat higher false negative

In [4]:
from krovetzstemmer import Stemmer
krovetz = Stemmer()
krovetz.stem('utilities') 

'utility'

### Tools

In [5]:
#Check InQuery Stop List
def remove_stopwords(token_text):
    stops = stopwords.words('english')
    new_tokentext = []
    for word in token_text:
        if word not in stops:
            new_tokentext.append(word)
    return new_tokentext

In [6]:
#Majuscule et ponctuation, CHECK NUMBERS et mot d'une lettre
def preproc(coll):
    for docno in coll.keys():
        tmp = remove_stopwords(RegexpTokenizer(r'\w+').tokenize(coll[docno].lower())) # lower + remove punc + remove stopwords
        tmp2 = []
        for j in range(len(tmp)):
            if(len(tmp[j])>1 and not tmp[j].isnumeric() ): # + d'une lettre et pas un nombre
                tmp2.append(krovetz.stem(tmp[j]))
        coll[docno] = tmp2

In [28]:
# 1000 mots les plus fréquents des documents + requetes
def top_vocab(coll, queries, top_terms=1000):
    list_words = []
    for docno in coll.keys():
        list_words += coll[docno]
    
    for q_id in queries.keys():
        list_words += queries[q_id]
    
    if top_terms==None:
        return list_words
    else:
        return dict(Counter(list_words).most_common(top_terms))

In [8]:
# Reformule les docs/requetes selon les 1000 mots les + fréquents
def truncate(coll, queries, top_terms=1000):
    vocabulary = list(top_vocab(coll, queries, top_terms).keys())
    for docno in coll.keys():
        tmp = coll[docno]
        new_doc = []
        for word in tmp:
            if word in vocabulary:
                new_doc.append(word)
        coll[docno] = new_doc
        
    for q_id in queries.keys():
        tmp = queries[q_id]
        new_query = []
        for word in tmp:
            if word in vocabulary:
                new_query.append(word)
        queries[q_id] = new_query

In [9]:
#Term frequency pour un doc de la collection
def tf_dict(coll, docno):
    tf = dict()
    for word in coll[docno]:
        if word not in tf.keys():
            tf[word] = 1
        else:
            tf[word] += 1
    return tf

In [10]:
#Term frequency pour tous les docs de la collection (dictionnaire de dictionnaire {1 : {mot : 3, ...}} )
def dict_alldocs(coll):
    dictTF_alldocs = dict()
    for docno in coll.keys():
        dictTF_alldocs[docno] = tf_dict(coll, docno)
    return dictTF_alldocs

In [11]:
#Nb mot pour un document
def term_occ(coll, docno):
    return len(coll[docno])

## UNEXPANDED RETRIEVAL

In [12]:
#P_ml, number of times "w" occurs in document D divided by the number of term occurrences in D
def p_ml(dictTF_alldocs, coll, w, docno):
    if w not in coll[docno]:
        return 0
    else:
        return dictTF_alldocs[docno][w] / term_occ(coll, docno)

In [13]:
#Same but for the entire collection
def p_ml_coll(dictTF_alldocs, coll, w):
    sum_tf = 0
    for docno in coll.keys():
        if w in coll[docno]:
            sum_tf += dictTF_alldocs[docno][w]
            
    return sum_tf / sum([term_occ(coll, docno) for docno in coll.keys()])

In [14]:
def smoothing(dictTF_alldocs, coll, w, docno, lamb): # P(w|D)
    return lamb * p_ml(dictTF_alldocs, coll, w, docno) + (1 - lamb) * p_ml_coll(dictTF_alldocs, coll, w)

In [15]:
def dirichlet_smoothing(dictTF_alldocs, coll, w, docno, mu=1000):
    lamb = term_occ(coll, docno) / (term_occ(coll, docno) + mu)
    return smoothing(dictTF_alldocs, coll, w, docno, lamb)

In [16]:
#P(Q|D)
def query_likelihood_retr(dictTF_alldocs, coll, query, docno, prob_func = dirichlet_smoothing): # P(Q|D)
    return np.prod(np.array([prob_func(dictTF_alldocs, coll, term, docno) for term in query]))

# TESTTTSTETSTSTTTTEEESSSTT

In [17]:
# ONLY FOR MINI TESTS ; OFFICIAL DATASET TO USE IS MSMARCO_DOCUMENT
queries_l = vaswani.get_topics()['query'].to_list()
id_queries = vaswani.get_topics()['qid'].to_list()

queries = dict()
i=0
for id_q in id_queries:
    queries[id_q] = queries_l[i]
    i += 1

coll = dict()
for doc in vaswani.get_corpus_iter():
    coll[doc['docno']] = doc['text']

preproc(coll)
preproc(queries)
truncate(coll, queries, top_terms=1000)
dictTF_alldocs = dict_alldocs(coll)

## 2.3 EXPANDED RETRIEVAL

### 2.3.1 Relevance models

In [18]:
#P(D|Q) = P(Q|D)*P(D)
def bayes(dictTF_alldocs, coll, query, docno, prob_func = dirichlet_smoothing): 
    return query_likelihood_retr(dictTF_alldocs, coll, query, docno, prob_func) * 1/len(coll) # uniformity for P(D), we assume that we have same probs for all documents

In [19]:
#print(bayes(dictTF_alldocs, coll, queries['1'], '1'))

In [20]:
# FAIRE INDEX ET RETRIEVING ICI (et bien dire que c'est un ajustement, dire dans le CR que ça manque dans le papier (la manière d'indexer et la manière de retrieve))

# VASWANI
coll_df = coll.copy()

for docno in coll_df.keys():
    coll_df[docno] = " ".join(coll_df[docno])

df = pd.DataFrame(coll_df.items(), columns=['docno', 'text'])

indexer = pt.DFIndexer("./index")
#indexref = indexer.index(df["text"], df["docno"])

# index = pt.IndexFactory.of(indexref)
index = pt.IndexFactory.of("./index/data.properties")

bm25 = pt.BatchRetrieve(index, wmodel="BM25") % 50

'\nqueries_df = queries.copy()\n\nfor qid in queries_df.keys():\n    queries_df[qid] = " ".join(queries_df[qid])\n\ndf_queries = pd.DataFrame(queries_df.items(), columns=[\'qid\', \'query\'])\n\ntop_docs = bm25.transform(df_queries)\n'

In [36]:
# P(w|Q) = sum D in R ( P(w|D)*P(D|Q) ) où R est le top50 documents
def relevance_model(dictTF_alldocs, coll, w, query, prob_func=smoothing, lamb=0.9, retriever=bm25):
    top50 = bm25.transform(query)['docno'].to_list()[:50]
    
    return np.sum(np.array([prob_func(dictTF_alldocs, coll, w, docno, lamb) * bayes(dictTF_alldocs, coll, query, docno) for docno in top50]))

In [38]:
print(relevance_model(dictTF_alldocs, coll, coll['1'][0], queries['1']))

  res = self.transformer.transform(topics_and_res)


3.4999567273853614e-26


### 2.3.2 Relevance model retrieval

In [41]:
# PAS SUR, A VERIFIER
def rel_mod_retrieval(dictTF_alldocs, coll, query, docno, prob_func=smoothing, lamb=0.2):  
    model = []
    rel_model = []
    for w in coll[docno]:
        model.append(prob_func(dictTF_alldocs, coll, w, docno, lamb))
        rel_model.append(relevance_model(dictTF_alldocs, coll, w, query))
    
    #cross entropy
    ce = np.sum(np.array(model) * np.log(np.array(rel_model)))
    m = len(model)
    return -1/m * ce

In [42]:
print(rel_mod_retrieval(dictTF_alldocs, coll, queries['1'], '1'))

  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)


1.7077890952863548


# 3. EXPANSION PREDICTION TASK

### Tools

In [51]:
def all_P_wQ(vocab, dictTF, coll, query):
    return np.array([relevance_model(dictTF, coll, w, query) for w in vocab])

def P_wColl(dictTF, term, coll):
    return p_ml_coll(dictTF, coll, term) # P(w|coll) = frequency of term in the entire collection, as well as P_ML(w|coll)

def all_P_wcoll(dictTF, vocab, coll):
    return np.array([P_wColl(dictTF, w, coll) for w in vocab])

## 3.1 CLARITY METHOD

### Weighted clarity scores

In [52]:
def u(w, query, gamma):  # u == func which defines each weight of w
    if w in query:
        return gamma
    else:
        return 1

In [53]:
def clarity(u, dictTF, coll, query, gamma): # u == func which defines each weight of w
    vocabulary = list(top_vocab(coll, {}, top_terms=1000).keys())
    P_WQ = all_P_wQ(vocabulary, dictTF, coll, query)
    P_WColl = all_P_wcoll(dictTF, vocabulary, coll)
    u_W = np.array([u(w, query, gamma) for w in vocabulary])
    E_AU = np.sum(u_W * P_WQ)
    return np.sum((u_W * P_WQ / E_AU) * np.log2(P_WQ / P_WColl))

In [None]:
from scipy.stats import spearmanr # Rank correlation, chosen arbitrarly

queries_df = queries.copy()

for qid in queries_df.keys():
    queries_df[qid] = " ".join(queries_df[qid])

df_queries = pd.DataFrame(queries_df.items(), columns=['qid', 'query'])

top_docno = bm25.transform(df_queries)['docno'].to_list()[:50]
top_docs = dict()
for docno in top_docno:
    top_docs[docno] = coll[docno]

dictTF_topdocs = dict()
for docno in top_docno:
    dictTF_topdocs[docno] = dictTF_alldocs[docno]

gammas = np.logspace(0.1, 1e7, 29)
clarities = np.array([np.array([clarity(u, dictTF_topdocs, top_docs, query, gamma) for gamma in gammas]) for query in queries])

  return _nx.power(base, y)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  res = self.transformer.transform(topics_and_res)
  r

In [None]:
qldr = np.array([np.array([query_likelihood_retr(dictTF_topdocs, top_docs, query, docno, prob_func = dirichlet_smoothing) for docno in top_docs.keys()]) for query in queries])

In [None]:
res = spearmanr(clarities, b=qldr, axis=None)
R = res.statistic

## 3.2 Overlap Method

On prend les n ( = 100 dans l'article) meilleurs documents pour la requête non étendu et les 100 meilleurs pour la requête étendue et on garde l'intersection.
Si jai bien comprus

Pour une requete ce serait :

In [124]:
#1 Meilleurs doc parmi les 2 premiers pour la query 1 avec unexpanded expanded. (POUR TESTS)
unexpanded = []
expanded = []
for docno in coll.keys():
    if(int(docno)>2):
        break
    unexpanded.append((docno, query_likelihood_retr(dictTF_alldocs, coll, queries['1'], docno)))
    expanded.append((docno, rel_mod_retrieval(dictTF_alldocs, coll, queries['1'], docno)))



In [126]:
unexpanded.sort(key=lambda a: a[1])
expanded.sort(key=lambda a: a[1])

expanded = expanded[:1]
unexpanded = unexpanded[:1]

overlap = []
for d1 in unexpanded:
    for d2 in expanded:
        if d1[0] == d2[0]: # On regarde si un doc de expanded est dans unexpanded, si oui on le garde
            overlap.append(d1[0])
        break

print(unexpanded)
print(expanded)
print(overlap)

[('2', 1.5367674491879992e-19)]
[('2', 0.5491576932354653)]
['2']
