# Examples for the Probabilistic Model
## Creating our example corpus and query

In [1]:
import pandas as pd

In [2]:
document_1 = ['is', 'information', 'retrieval', 'the', 'study', 'of', 'retrieving', 'documents']
document_2 = ['documents', 'are', 'important', 'for', 'study']
document_3 = ['this', 'is', 'a', 'filler', 'text', 'lorem', 'ipsum']
corpus = [document_1, document_2, document_3]

query = ['documents', 'study', 'of', 'filler', 'information']

Creating vectors that have information on if a term is in a document:

In [3]:
def create_unique_vector(document, words):
    return [word in document for word in words]

words = document_1 + document_2 + document_3
unique_words = set(words)
vectors = [create_unique_vector(document, unique_words) for document in corpus] + [create_unique_vector(query, unique_words)]

df_vectors = pd.DataFrame(vectors, ['document_' + str(i + 1) for i in range(len(corpus))] + ['query'], [word for word in unique_words])
df_vectors

Unnamed: 0,filler,ipsum,is,lorem,the,retrieving,a,text,are,study,retrieval,for,important,this,of,information,documents
document_1,False,False,True,False,True,True,False,False,False,True,True,False,False,False,True,True,True
document_2,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,False,True
document_3,True,True,True,True,False,False,True,True,False,False,False,False,False,True,False,False,False
query,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,True


## Definition of necessary functions

The IDF component is defined as: $\sum_{k_i \in q \wedge k_i \in d_j} \log{\frac{N - n_i + 0.5}{n_i + 0.5}}$,
where:
- $N$ is the total number of documents in the corpus
- $n_i$ is the amount of documents in the corpus that contain the term "i" at least once

In [4]:
import math

def calculate_prob_idf(target, vectors):
    query_vector = vectors[len(vectors)-1]
    document_vectors = vectors[:len(vectors)-1]

    # create respective vectors for n_i as well as the intersecting positions
    df_vector = [sum([vector[index] > 0 for vector in document_vectors]) for index in range(len(document_vectors[0]))]
    relevant_vector = [target[index] > 0 and query_vector[index] > 0 for index in range(len(query_vector))]

    N = len(document_vectors)
    return [calculate_partial_idf(df_vector[index], N, relevant_vector[index] > 0) for index in range(len(relevant_vector))]

# only calculated when both query and document have the term
def calculate_partial_idf(df_value, N, toAdd):
    if not toAdd: return 0
    return math.log((N-df_value+0.5) / (df_value + 0.5), 2)

The TF component is defined as: $B_{ij} = \frac{(k_1 + 1) * f_{ij}}{k_1 * [(1-b) + b * \frac{len(d_j)}{avg\_doclen}] + f_{ij}}$,
where:
- $f_{ij}$ is the frequency of term "i" in document j
- $k_1$ is a constant, usually in the intervall of $[1.2, 2.0]$
- $b$ is a constant in the intervall of $[0, 1]$
- $len(d_j)$ is the length of document j
- $avg\_doclen$ is the average length of a document in the corpus

In [5]:
avg_doc_len = sum([len(document) for document in corpus]) / (len(corpus))

def calculate_prob_tf(k, b, document):
    return [((k+1)*document.count(term)) / (k*((1-b) + b*(len(document)/avg_doc_len)) + document.count(term)) for term in document]

In total we get: $\sum_{k_i \in q \wedge k_i \in d_j} B_{ij} * \log{\frac{N - n_i + 0.5}{n_i + 0.5}}$

## Calculating the results

IDF can be used for any variation of BMXX:

In [6]:
idf_values = [calculate_prob_idf(vectors[index], vectors) for index in range(len(vectors)-1)]

### BM25 variation

In [7]:
# k=1.2, b=0.75
tf_25_values = [calculate_prob_tf(1.2, 0.75, vectors[index]) for index in range(len(vectors)-1)]
bm25 = [sum([pair[0]*pair[1] for pair in zip(idf_values[index], tf_25_values[index])]) for index in range(len(idf_values))]

df_bm25 = pd.DataFrame(zip(idf_values, tf_25_values, bm25), ['document_' + str(i + 1) for i in range(len(corpus))], ['idf', 'tf25', 'bm25'])
df_bm25

Unnamed: 0,idf,tf25,bm25
document_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.7076326002587325, 1.7076326002587325, 1.661...",0.0
document_2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.8088386433710177, 1.8088386433710177, 1.808...",-2.134726
document_3,"[0.7369655941662062, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.6050026055237108, 1.6050026055237108, 1.605...",1.182832


### BM15 variation

In [8]:
# k=1.2, b=0
tf_15_values = [calculate_prob_tf(1.2, 0, vectors[index]) for index in range(len(vectors)-1)]
bm15 = [sum([pair[0]*pair[1] for pair in zip(idf_values[index], tf_15_values[index])]) for index in range(len(idf_values))]

df_bm15 = pd.DataFrame(zip(idf_values, tf_15_values, bm15), ['document_' + str(i + 1) for i in range(len(corpus))], ['idf', 'tf15', 'bm15'])
df_bm15

Unnamed: 0,idf,tf15,bm15
document_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.9411764705882355, 1.9411764705882355, 1.913...",0.0
document_2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[2.0000000000000004, 2.0000000000000004, 2.000...",-2.615039
document_3,"[0.7369655941662062, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.8780487804878052, 1.8780487804878052, 1.878...",1.384057


### BM11 variation

In [9]:
# k=1.2, b=1
tf_11_values = [calculate_prob_tf(1.2, 1, vectors[index]) for index in range(len(vectors)-1)]
bm11 = [sum([pair[0]*pair[1] for pair in zip(idf_values[index], tf_11_values[index])]) for index in range(len(idf_values))]

df_bm11 = pd.DataFrame(zip(idf_values, tf_11_values, bm11), ['document_' + str(i + 1) for i in range(len(corpus))], ['idf', 'tf11', 'bm11'])
df_bm11

Unnamed: 0,idf,tf11,bm11
document_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.6417910447761197, 1.6417910447761197, 1.591...",0.0
document_2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.7529880478087652, 1.7529880478087652, 1.752...",-2.011569
document_3,"[0.7369655941662062, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.5308151093439368, 1.5308151093439368, 1.530...",1.128158


### BM variation with k=0

In [10]:
# k=0, b=0
tf_k0_values = [calculate_prob_tf(0, 0, vectors[index]) for index in range(len(vectors)-1)]
bm_k0 = [sum([pair[0]*pair[1] for pair in zip(idf_values[index], tf_k0_values[index])]) for index in range(len(idf_values))]

df_bm_k0 = pd.DataFrame(zip(idf_values, tf_k0_values, bm_k0), ['document_' + str(i + 1) for i in range(len(corpus))], ['idf', 'tf_k0', 'bm_k0'])
df_bm_k0

Unnamed: 0,idf,tf_k0,bm_k0
document_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0
document_2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.736965594166206...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",-1.473931
document_3,"[0.7369655941662062, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.736966
