In [3]:
import math as mt
from collections import Counter

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
# `entropy` is provided by scipy.stats (scipy.special only exposes entr /
# rel_entr / kl_div); entropy(pk, qk) computes the Kullback-Leibler divergence.
from scipy.stats import entropy

from porter import stem

# Fetch the English stop-word corpus read by `stopwords.words('english')`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gameselo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# RETRIEVAL

### Tools

In [None]:
def remove_stopwords(token_text):
    """Return the tokens of ``token_text`` that are not English stop words.

    Args:
        token_text: iterable of token strings.

    Returns:
        A new list with the surviving tokens, in their original order.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned from the start for every single token.
    stops = set(stopwords.words('english'))
    return [word for word in token_text if word not in stops]

In [None]:
def preproc(coll):
    """Normalise every document of ``coll`` in place.

    Each document is lower-cased, split into runs of word characters (which
    drops punctuation), stripped of stop words via ``remove_stopwords``,
    stemmed token by token with ``stem`` and stored back as one
    space-separated string.

    Args:
        coll: mutable sequence of document strings; it is modified in place.

    Returns:
        None.
    """
    # The tokenizer does not depend on the document, so build it only once.
    tokenizer = RegexpTokenizer(r'\w+')
    for i in range(len(coll)):
        tokens = remove_stopwords(tokenizer.tokenize(coll[i].lower()))
        # Stem each token itself. Indexing the token list with the document
        # counter `i` would stem the wrong word or raise IndexError.
        coll[i] = " ".join(stem(token) for token in tokens)
        
# Normalise the corpus in place.
# NOTE(review): `coll` is not defined in any cell visible here -- presumably it
# is loaded in an earlier cell; confirm it exists after Restart & Run All.
preproc(coll)

In [None]:
def top_vocab(coll, top_terms=1000):
    """Return the ``top_terms`` most frequent terms of the collection.

    Args:
        coll: iterable of documents. A document is either a
            whitespace-separated string or an iterable of tokens.
        top_terms: maximum number of terms to keep.

    Returns:
        dict mapping each kept term to its number of occurrences in the whole
        collection, ordered from most to least frequent.
    """
    counts = Counter()
    for doc in coll:
        # Count whole terms: feeding a raw string to Counter would count
        # individual characters instead.
        counts.update(doc.split() if isinstance(doc, str) else doc)
    return dict(counts.most_common(top_terms))

In [None]:
def truncate_coll(coll, top_terms=1000):
    """Drop from every document the terms outside the top vocabulary.

    The vocabulary is the key set of ``top_vocab(coll, top_terms)``. Each
    document is tokenised with ``word_tokenize``, filtered, and written back
    as a space-separated string.

    Args:
        coll: mutable sequence of document strings; it is modified in place.
        top_terms: vocabulary size forwarded to ``top_vocab``.

    Returns:
        None.
    """
    # A set makes each membership test O(1) instead of a scan over up to
    # `top_terms` list entries per token.
    vocabulary = set(top_vocab(coll, top_terms))
    for i in range(len(coll)):
        coll[i] = " ".join(
            word for word in word_tokenize(coll[i]) if word in vocabulary
        )
        
# Restrict every document of the corpus to the 1000 top-vocabulary terms, in place.
# NOTE(review): relies on `coll` existing from an earlier cell -- confirm.
truncate_coll(coll, top_terms=1000)

In [None]:
def dict_alldocs(coll):
    """Map every document of ``coll`` to the result of ``tf_dict(doc)``.

    The document itself is used as the dictionary key, so identical documents
    share a single entry.

    NOTE(review): ``tf_dict`` is not defined in the visible cells; presumably
    it returns a term -> frequency mapping -- confirm.
    """
    return {doc: tf_dict(doc) for doc in coll}

In [None]:
def term_occ(doc):
    """Return the sum of all values stored in ``dictTF_alldocs[doc]``.

    ``dictTF_alldocs`` is read from the notebook's global namespace.
    """
    term_counts = dictTF_alldocs[doc]
    return sum(term_counts.values())

## UNEXPANDED RETRIEVAL

In [None]:
def query_likelihood_retr(query, doc, prob_func): # P(Q|D)
    """Query likelihood P(Q|D): product of P(q|D) over the query terms.

    Args:
        query: the query, either a whitespace-separated string or an iterable
            of terms. Repeated terms contribute once per occurrence.
        doc: the document, passed through unchanged to ``prob_func``.
        prob_func: callable ``(term, doc) -> probability``.

    Returns:
        The product of the per-term probabilities (1.0 for an empty query).
    """
    # The product runs over the *query* terms; iterating over `doc` would score
    # the document against its own characters and ignore the query entirely.
    terms = query.split() if isinstance(query, str) else query
    return np.prod(np.array([prob_func(term, doc) for term in terms]))

In [None]:
def p_ml(term, doc):
    """Maximum-likelihood estimate of P(term | doc).

    Computed as the count stored for ``term`` in ``dictTF_alldocs[doc]``
    divided by ``term_occ(doc)``.

    Args:
        term: the term to score.
        doc: the document, used as key into ``dictTF_alldocs``.

    Returns:
        The relative frequency of ``term`` in ``doc``; 0.0 when the term does
        not occur in the document or the document has no terms.
    """
    total = term_occ(doc)
    if total == 0:
        return 0.0
    # Unseen terms must score 0 rather than raise KeyError: smoothing relies on
    # being able to evaluate query terms that are missing from the document.
    return dictTF_alldocs[doc].get(term, 0) / total

In [None]:
def p_ml_coll(term, coll):
    """Maximum-likelihood estimate of P(term | collection).

    Computed as the summed count of ``term`` over all documents of ``coll``
    divided by the summed ``term_occ`` of those documents.

    Args:
        term: the term to score.
        coll: iterable of documents, each a key of ``dictTF_alldocs``.

    Returns:
        The relative frequency of ``term`` in the collection; 0.0 when the
        collection holds no term occurrences at all.
    """
    # Most documents do not contain a given term, so a missing key counts as 0
    # instead of raising KeyError.
    term_count = sum(dictTF_alldocs[doc].get(term, 0) for doc in coll)
    total = sum(term_occ(doc) for doc in coll)
    return term_count / total if total else 0.0

In [None]:
def smoothing(term, doc, lamb): # P(w|D)
    """Linearly interpolate the document and collection estimates of P(w|D).

    Returns ``lamb * p_ml(term, doc) + (1 - lamb) * p_ml_coll(term, coll)``,
    where ``coll`` is read from the notebook's global namespace.
    """
    doc_part = lamb * p_ml(term, doc)
    coll_part = (1 - lamb) * p_ml_coll(term, coll)
    return doc_part + coll_part

In [None]:
def dirichlet_smoothing(term, doc, mu=1000):
    """Dirichlet-prior smoothed estimate of P(term | doc).

    Expressed through ``smoothing`` with a document-dependent interpolation
    weight ``|D| / (|D| + mu)``, where ``|D|`` is ``term_occ(doc)``.

    Args:
        term: the term to score.
        doc: the document.
        mu: Dirichlet prior mass; larger values lean more on the collection.

    Returns:
        The smoothed probability returned by ``smoothing``.
    """
    doc_len = term_occ(doc)
    lamb = doc_len / (doc_len + mu)
    # Pass the derived weight, not `mu`: handing `mu` (e.g. 1000) to
    # `smoothing` as lambda gives values far outside [0, 1].
    return smoothing(term, doc, lamb)

## EXPANDED RETRIEVAL

### Relevance models

In [None]:
def bayes(query, doc, prob_func): # returns P(D|Q)
    """Score P(D|Q) as the query likelihood times a uniform document prior.

    The prior is ``1 / len(coll)`` (every document equally likely), with
    ``coll`` read from the notebook's global namespace. The result is not
    divided by P(Q).
    """
    likelihood = query_likelihood_retr(query, doc, prob_func)
    n_docs = len(coll)
    return likelihood / n_docs

In [None]:
# Documents returned by the retrieval step = `retrieved_docs` (top 50 docs).

def relevance_model(term, query, prob_func=None, docs=None, lamb=0.9): # P(w|Q)
    """Relevance-model estimate of P(w|Q).

    Sums ``smoothing(term, doc, lamb) * bayes(query, doc, prob_func)`` over
    the feedback documents.

    Args:
        term: the vocabulary term ``w`` to score.
        query: the query, forwarded to ``bayes``.
        prob_func: callable ``(term, doc) -> probability`` used by ``bayes``
            for the query likelihood. Defaults to ``smoothing`` with the same
            ``lamb``.
        docs: feedback documents to sum over. Defaults to the global
            ``retrieved_docs``.
        lamb: interpolation weight handed to ``smoothing`` (0.9 as before).

    Returns:
        The (unnormalised) relevance-model weight of ``term``.
    """
    if prob_func is None:
        # `prob_func` used to be an undefined name here (NameError); fall back
        # to the same smoothed document model used for P(w|D).
        def prob_func(t, d):
            return smoothing(t, d, lamb)
    if docs is None:
        docs = retrieved_docs
    return sum(
        smoothing(term, doc, lamb) * bayes(query, doc, prob_func)
        for doc in docs
    )

### Relevance model retrieval

In [None]:
# TODO

# EXPANSION PREDICTION TASK

### Tools

In [None]:
def all_P_wQ(vocab, query):
    """Return an array holding ``relevance_model(w, query)`` for each ``w`` in ``vocab``.

    The entries follow the iteration order of ``vocab``.
    """
    scores = []
    for w in vocab:
        scores.append(relevance_model(w, query))
    return np.array(scores)

def all_P_wcoll(vocab, coll):
    """Return an array holding ``p_ml_coll(w, coll)`` for each ``w`` in ``vocab``.

    Collection-side counterpart of ``all_P_wQ``; the entries follow the
    iteration order of ``vocab``.

    Args:
        vocab: iterable of terms.
        coll: the document collection forwarded to ``p_ml_coll``.

    Returns:
        1-D numpy array with one collection probability per vocabulary term.
    """
    return np.array([p_ml_coll(w, coll) for w in vocab])

## CLARITY METHOD

### Weighted clarity scores

In [None]:
def clarity(p_func, q_func, u, coll, query):
    """Unfinished draft of the (weighted) clarity score.

    NOTE(review): not runnable as written. ``np.sum()`` below is called
    without an argument and raises ``TypeError``; ``p_func`` and ``q_func``
    are never used; the final ``sum([])`` is a placeholder. The intended
    formula is not recoverable from this cell -- to be completed.
    """
    # Terms of the top vocabulary (default cut-off of `top_vocab`).
    vocabulary = list(top_vocab(coll).keys())
    # One relevance-model value per vocabulary term.
    P_WQ = all_P_wQ(vocabulary, query)
    # Wraps `u` in a one-element array; presumably meant to become one weight
    # per vocabulary term -- TODO confirm.
    u_W = np.array([u])
    # FIXME: np.sum needs an array argument; this line raises TypeError.
    E_AU = np.sum()
    # Placeholder: summing an empty list always gives 0.
    return sum([])