In [57]:
import numpy as np
import math as mt
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords

from porter import stem

# Make sure the NLTK stopword corpus read by remove_stopwords is available locally.
nltk.download('stopwords')

import pyterrier as pt
# Initialise PyTerrier only when it has not been started yet, so this cell can be re-run.
if not pt.started():
    pt.init()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gameselo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA COLLECTION

In [161]:
# Dataset handles requested from PyTerrier by name.
# NOTE(review): only `vaswani` is used by the cells visible in this notebook; the
# MS MARCO handles are presumably kept for the full-scale run — confirm.
msmarco_document = pt.datasets.get_dataset("msmarco_document")
msmarcov2_document = pt.datasets.get_dataset("msmarcov2_document")
msmarco_passage = pt.datasets.get_dataset("msmarco_passage")
msmarcov2_passage = pt.datasets.get_dataset("msmarcov2_passage")
vaswani = pt.datasets.get_dataset("vaswani")

In [162]:
# ONLY FOR MINI TESTS; THE OFFICIAL DATASET TO USE IS MSMARCO_DOCUMENT

# Fetch the topics once instead of calling get_topics() once per column.
topics = vaswani.get_topics()
queries_l = topics['query'].to_list()
id_queries = topics['qid'].to_list()

# qid -> query text (pairs are matched by position, as in the topics frame).
queries = dict(zip(id_queries, queries_l))

# docno -> document text
coll = {doc['docno']: doc['text'] for doc in vaswani.get_corpus_iter()}

In [163]:
# Sanity check: show the entry stored for query id '1'.
print(queries['1'])

measurement of dielectric constant of liquids by the use of microwave techniques


In [164]:
# Sanity check: show the entry stored for document number '1'.
print(coll['1'])

compact memories have flexible capacities  a digital data storage
system with capacity up to bits and random and or sequential access
is described


# RETRIEVAL

### Tools

In [165]:
def remove_stopwords(token_text, stops=None):
    """Return the tokens of ``token_text`` that are not stopwords.

    Parameters
    ----------
    token_text : iterable of str
        Tokens to filter. Order and duplicates are preserved.
    stops : iterable of str, optional
        Words to discard. When omitted, NLTK's English stopword list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The tokens that do not appear in ``stops``.
    """
    if stops is None:
        stops = stopwords.words('english')
    # A set gives constant-time membership tests; scanning the stopword list
    # once per token was needlessly quadratic.
    stop_set = set(stops)
    return [word for word in token_text if word not in stop_set]

In [166]:
def preproc(coll):
    """Tokenise, clean and stem every text of ``coll`` in place.

    Each string value is lower-cased, split into runs of word characters
    (which drops punctuation), filtered with ``remove_stopwords`` and stemmed
    with ``stem``. The value is then replaced by the resulting list of stems.

    Values that are not strings are left untouched, so running this function
    again on an already preprocessed mapping is a no-op instead of raising
    ``AttributeError`` on ``list.lower``.

    Parameters
    ----------
    coll : dict
        Maps an identifier (docno or qid) to a text. Modified in place.
    """
    tokenizer = RegexpTokenizer(r'\w+')  # built once, not once per document
    for docno, text in coll.items():
        if not isinstance(text, str):
            continue  # already preprocessed
        tokens = remove_stopwords(tokenizer.tokenize(text.lower()))
        coll[docno] = [stem(token) for token in tokens]
        
# Preprocess documents and queries in place: each text is replaced by its list of stems.
preproc(coll)
preproc(queries)

In [167]:
# Show document '1' and query '1' after preprocessing.
print(coll['1'])
print(queries['1'])

['compact', 'memori', 'flexibl', 'capac', 'digit', 'data', 'storag', 'system', 'capac', 'bit', 'random', 'sequenti', 'access', 'describ']
['measur', 'dielectr', 'constant', 'liquid', 'us', 'microwav', 'techniqu']


In [168]:
def top_vocab(coll, queries=None, top_terms=1000):
    """Count term occurrences and keep the most frequent terms.

    Parameters
    ----------
    coll : dict
        Maps a docno to its list of tokens.
    queries : dict, optional
        Maps a qid to its list of tokens. When omitted, only the documents
        contribute to the counts.
    top_terms : int, optional
        Number of terms to keep (``None`` keeps every term).

    Returns
    -------
    dict
        ``term -> number of occurrences`` for the ``top_terms`` most frequent
        terms, ordered from most to least frequent.
    """
    # Update the counter text by text instead of first concatenating every
    # token of the corpus into one throw-away list.
    counts = Counter()
    for tokens in coll.values():
        counts.update(tokens)
    for tokens in (queries or {}).values():
        counts.update(tokens)
    return dict(counts.most_common(top_terms))

In [169]:
# print(top_vocab(coll, queries))

In [170]:
def truncate(coll, queries, top_terms=1000):
    """Drop, in place, every token that is not among the most frequent terms.

    The vocabulary is the ``top_terms`` most frequent terms returned by
    ``top_vocab(coll, queries, top_terms)``, computed before any text is
    modified.

    Parameters
    ----------
    coll : dict
        Maps a docno to its list of tokens. Modified in place.
    queries : dict
        Maps a qid to its list of tokens. Modified in place.
    top_terms : int, optional
        Size of the vocabulary to keep.
    """
    # A set makes each membership test constant-time; testing against a list
    # cost O(vocabulary size) for every token of the corpus.
    vocabulary = set(top_vocab(coll, queries, top_terms))
    for texts in (coll, queries):
        for key, tokens in texts.items():
            texts[key] = [word for word in tokens if word in vocabulary]
        
# Restrict documents and queries, in place, to the 1000 most frequent terms.
truncate(coll, queries, top_terms=1000)

In [171]:
# Show document '1' and query '1' after vocabulary truncation.
print(coll['1'])
print(queries['1'])

['memori', 'digit', 'data', 'storag', 'system', 'random', 'describ']
['measur', 'dielectr', 'constant', 'liquid', 'us', 'microwav', 'techniqu']


In [175]:
def tf_dict(coll, docno):
    """Map every distinct term of document ``docno`` to its occurrence count.

    The returned plain ``dict`` lists terms in order of first appearance.
    """
    return dict(Counter(coll[docno]))

In [176]:
def dict_alldocs(coll):
    """Build the term-frequency table of every document.

    Returns a dict ``docno -> {term: count}`` with one entry per key of
    ``coll``, each computed by ``tf_dict``.
    """
    return {docno: tf_dict(coll, docno) for docno in coll.keys()}

In [177]:
# Term-frequency table (docno -> {term: count}) for the whole collection, stored
# as a notebook-level variable; printed for document '1' as a sanity check.
dictTF_alldocs = dict_alldocs(coll)
print(dictTF_alldocs['1'])

{'memori': 1, 'digit': 1, 'data': 1, 'storag': 1, 'system': 1, 'random': 1, 'describ': 1}


In [178]:
def term_occ(coll, docno):
    """Return the number of term occurrences in document ``docno`` (its length)."""
    tokens = coll[docno]
    return len(tokens)

In [179]:
# Sanity check: number of tokens in document '1'.
print(term_occ(coll, '1'))

7


## UNEXPANDED RETRIEVAL

In [199]:
def p_ml(coll, term, docno):
    """Maximum-likelihood estimate of P(term | document).

    The estimate is the number of occurrences of ``term`` in document
    ``docno`` divided by the document length. It is computed from the ``coll``
    argument itself rather than from a notebook-level term-frequency table, so
    it stays correct when a different collection (e.g. a subset of retrieved
    documents) is passed in.

    Parameters
    ----------
    coll : dict
        Maps a docno to its list of tokens.
    term : str
        Term whose probability is estimated.
    docno : hashable
        Key of the document in ``coll``.

    Returns
    -------
    float
        ``tf(term, doc) / |doc|``; ``0.0`` when the term is absent or the
        document is empty.
    """
    tokens = coll[docno]
    if not tokens:
        return 0.0  # empty document: avoid dividing by zero
    return tokens.count(term) / len(tokens)

In [200]:
def p_ml_coll(term, coll):
    """Maximum-likelihood estimate of P(term | collection).

    The estimate is the total number of occurrences of ``term`` over all
    documents divided by the total number of tokens in the collection. Both
    totals are taken from the ``coll`` argument in a single pass, without
    relying on a notebook-level term-frequency table.

    Parameters
    ----------
    term : str
        Term whose probability is estimated.
    coll : dict
        Maps a docno to its list of tokens.

    Returns
    -------
    float
        Collection frequency of ``term``; ``0.0`` when the collection holds
        no token at all.
    """
    term_count = 0
    total_terms = 0
    for tokens in coll.values():
        term_count += tokens.count(term)
        total_terms += len(tokens)
    if total_terms == 0:
        return 0.0  # empty collection: avoid dividing by zero
    return term_count / total_terms

In [201]:
def smoothing(coll, term, doc, lamb): # P(w|D)
    """Linearly interpolate the document and collection language models.

    Returns ``lamb * p_ml(coll, term, doc) + (1 - lamb) * p_ml_coll(term, coll)``.
    """
    doc_part = lamb * p_ml(coll, term, doc)
    coll_part = (1 - lamb) * p_ml_coll(term, coll)
    return doc_part + coll_part

In [202]:
def dirichlet_smoothing(coll, term, doc, mu=1000):
    """Smoothed P(term | doc) with a document-length-dependent mixing weight.

    The weight given to the document model is ``|doc| / (|doc| + mu)``, where
    ``|doc|`` is ``term_occ(coll, doc)``; the rest goes to the collection model
    through ``smoothing``.
    """
    doc_len = term_occ(coll, doc)
    return smoothing(coll, term, doc, doc_len / (doc_len + mu))

In [203]:
def query_likelihood_retr(coll, query, doc, prob_func): # P(Q|D)
    """Query likelihood: product of ``prob_func(coll, term, doc)`` over the query terms.

    ``prob_func`` is any callable taking ``(coll, term, doc)``. An empty query
    yields ``1.0`` (the empty product).
    """
    term_probs = []
    for term in query:
        term_probs.append(prob_func(coll, term, doc))
    return np.prod(np.array(term_probs))

In [204]:
# Example: likelihood of query '1' given document '1' with the Dirichlet-style smoothing.
print(query_likelihood_retr(coll, queries['1'], '1', dirichlet_smoothing))

1.563380651717102e-19


## EXPANDED RETRIEVAL

### Relevance models

In [205]:
def bayes(coll, query, doc, prob_func): # P(Q|D) * P(D), proportional to P(D|Q)
    """Score a document as query likelihood times a uniform document prior.

    The prior is ``1 / len(coll)`` (every document equally likely). The value
    is not divided by P(Q), so it is proportional to P(D|Q) rather than a
    normalised posterior.
    """
    likelihood = query_likelihood_retr(coll, query, doc, prob_func)
    return likelihood / len(coll)

In [206]:
# Example: query likelihood of query '1' for document '1', weighted by the uniform prior.
print(bayes(coll, queries['1'], '1', dirichlet_smoothing))

1.3679067737484488e-23


In [None]:
# RANKING DOCUMENTS TODO

In [21]:
# DOCUMENTS RETURNED BY THE RETRIEVAL STEP = retrieved_docs (top 50 docs)
# here coll = the top 50 documents ONLY

def relevance_model(coll, term, query, prob_func=None): # P(w|Q)
    """Relevance-model estimate of P(term | query).

    Computes ``sum_D P(term | D) * P(D | Q)`` over the documents of ``coll``,
    where ``P(term | D)`` is ``smoothing`` with a fixed weight of 0.9 and
    ``P(D | Q)`` is the ``bayes`` score of each document divided by the sum of
    the scores, so that the document posteriors add up to 1.

    Parameters
    ----------
    coll : dict
        Maps a docno to its list of tokens (the feedback documents).
    term : str
        Term whose probability is estimated.
    query : list of str
        Query tokens.
    prob_func : callable, optional
        ``(coll, term, doc) -> P(term | doc)`` used for the query likelihood.
        Defaults to ``dirichlet_smoothing``. The name was previously read
        without ever being defined, which raised ``NameError``.

    Returns
    -------
    float
        ``P(term | query)``; ``0.0`` when every document has a zero score or
        the collection is empty.
    """
    if prob_func is None:
        prob_func = dirichlet_smoothing
    doc_scores = [bayes(coll, query, doc, prob_func) for doc in coll]
    evidence = sum(doc_scores)
    if evidence == 0:
        return 0.0  # no document explains the query: nothing to normalise by
    weighted = sum(
        smoothing(coll, term, doc, 0.9) * score
        for doc, score in zip(coll, doc_scores)
    )
    return weighted / evidence

### Relevance model retrieval

In [None]:
# TODO

# EXPANSION PREDICTION TASK

### Tools

In [None]:
def all_P_wQ(vocab, query, collection=None):
    """Return the array of P(w | query) for every term ``w`` of ``vocab``.

    Parameters
    ----------
    vocab : iterable of str
        Terms to evaluate, in the order of the returned array.
    query : list of str
        Query tokens.
    collection : dict, optional
        Maps a docno to its list of tokens. When omitted, the notebook-level
        ``coll`` is used, which keeps the two-argument call working.

    Returns
    -------
    numpy.ndarray
        One ``relevance_model`` value per term of ``vocab``.
    """
    # relevance_model takes the collection as its first argument; it was
    # previously called with only (w, query).
    docs = coll if collection is None else collection
    return np.array([relevance_model(docs, w, query) for w in vocab])

def P_wColl(term, coll):
    """P(w|coll): frequency of ``term`` in the entire collection.

    This is the same quantity as the maximum-likelihood estimate returned by
    ``p_ml_coll``.
    """
    collection_prob = p_ml_coll(term, coll)
    return collection_prob

def all_P_wcoll(vocab, coll):
    """Return the array of P(w | coll) for every term ``w`` of ``vocab``, in order."""
    probs = []
    for w in vocab:
        probs.append(P_wColl(w, coll))
    return np.array(probs)

## CLARITY METHOD

### Weighted clarity scores

In [None]:
def u(w, query, gamma):
    """Weight of term ``w``: ``gamma`` when it occurs in ``query``, otherwise 1."""
    return gamma if w in query else 1

In [None]:
def clarity(u, coll, query, gamma): # u == func which defines each weight of w
    """Weighted clarity score of ``query`` with respect to ``coll``.

    Computes ``sum_w (u(w) * P(w|Q) / E) * log2(P(w|Q) / P(w|coll))`` over the
    most frequent terms of the collection, with ``E = sum_w u(w) * P(w|Q)``.

    Parameters
    ----------
    u : callable
        ``(w, query, gamma) -> weight`` of term ``w``.
    coll : dict
        Maps a docno to its list of tokens.
    query : list of str
        Query tokens.
    gamma : float
        Parameter forwarded to ``u``.

    Returns
    -------
    float
        The weighted clarity score; ``0.0`` when the normaliser ``E`` is zero
        (for instance with an empty vocabulary).
    """
    # top_vocab expects a `queries` mapping as second argument; an empty one
    # keeps the vocabulary restricted to the terms of the collection.
    vocabulary = list(top_vocab(coll, {}).keys())
    # Evaluate the relevance model on the same collection that was passed in.
    P_WQ = np.array([relevance_model(coll, w, query) for w in vocabulary])
    # Collection probabilities must be computed from `coll`, not from the query.
    P_WColl = all_P_wcoll(vocabulary, coll)
    u_W = np.array([u(w, query, gamma) for w in vocabulary])
    E_AU = np.sum(u_W * P_WQ)
    if E_AU == 0:
        return 0.0
    # Convention 0 * log(0) = 0: skip terms with a zero probability, which
    # would otherwise turn the whole sum into nan.
    valid = (P_WQ > 0) & (P_WColl > 0)
    weights = u_W[valid] * P_WQ[valid] / E_AU
    return np.sum(weights * np.log2(P_WQ[valid] / P_WColl[valid]))

In [None]:
from scipy.stats import spearmanr # Rank correlation

# 29 weights spread logarithmically from 0.1 to 1e7. np.logspace takes base-10
# exponents, so the bounds are -1 and 7; passing 0.1 and 1e7 as exponents
# overflows to inf.
gammas = np.logspace(-1, 7, 29)
# Iterate over the token lists (dict values): iterating the dict itself yields
# the query ids, not the queries. Rows = queries, columns = gammas.
clarities = np.array([np.array([clarity(u, coll, query, gamma) for gamma in gammas]) for query in queries.values()])
# query_likelihood_retr takes the collection as first argument. Rows = documents,
# columns = queries.
qldr = np.array([np.array([query_likelihood_retr(coll, query, doc, dirichlet_smoothing) for query in queries.values()]) for doc in coll])