# Query Reformulation via Relevance Feedback

## Example Corpus

In [1]:
query = ["beautiful", "image", "generation", "stable", "diffusion"]
document_1 = ["image", "beautiful", "photoshop", "image", "beautiful"]
document_2 = ["stable", "diffusion", "spaces", "latent", "spaces", "generation"]
document_3 = ["generation", "beautiful", "photoshop", "diffusion"]
document_4 = ["photoshop", "image", "photoshop"]
corpus = [query, document_1, document_2, document_3, document_4]

In [2]:
words = set(query + document_1 + document_2 + document_3 + document_4)

## Creating our base weights as raw frequency

In [3]:
import pandas as pd
import numpy as np

In [4]:
vectors = [np.array([document.count(word) for word in words]) for document in corpus]

# term-document matrix
df_vectors = pd.DataFrame(vectors, ['query'] + ['document_' + str(i + 1) for i in range(len(corpus)-1)], [word for word in words])
df_vectors

Unnamed: 0,image,diffusion,photoshop,beautiful,spaces,latent,generation,stable
query,1,1,0,1,0,0,1,1
document_1,2,0,1,2,0,0,0,0
document_2,0,1,0,0,2,1,1,1
document_3,0,1,1,1,0,0,1,0
document_4,1,0,2,0,0,0,0,0


## Exploring the different variants

In [5]:
from IPython.display import display, Math

Predefined weights:

In [6]:
alpha = 0.9
beta = 0.5
gamma = 0.5

Predefined feedback:

In [7]:
query_vec = [vectors[0]]
relevant_docs_vec = [vectors[2], vectors[3]]
irrelevant_docs_vec = [vectors[1], vectors[4]]

Functions to create the necessary vectors as well as their representation in LaTeX:

Assume that: $\ weight = \omega, \ document\_vectors = D$
- For the relevant and irrelevant documents follows: $\frac{\omega}{|D|} * \sum_{\vec{d_j} \in D} \vec{d_j}$
- For the query follows due to $D=\{\vec{q}\}$: $\omega * \vec{q}$

In [8]:
def create_weighted_vector(documents, weight):
    return (weight / len(documents)) * np.array([sum([vector[index] for vector in documents]) for index in range(len(documents[0]))])

Creation of LaTeX specific strings:

In [9]:
def create_display_vector(vector):
    return '\\begin{bmatrix}' + ('\\\\'.join([str(entry) for entry in vector])) + '\\end{bmatrix}'

In [10]:
def create_display_component(vectors, weight, frac, simple):
    # no weight
    if simple : return create_display_vector(vectors)

    # weight
    # without fractional, i.e. |vectors| = 1
    if not frac:
        return '{0}'.format(weight) + '*' + create_display_vector(vectors)

    # with fractional, i.e. |vectors| > 1
    return '\\frac{' + '{0}'.format(weight) + '}{' + '{0}'.format(len(vectors)) + '}' + '*' + create_display_vector(create_weighted_vector(vectors, len(vectors)))

In [11]:
def create_relevance_display(query_vec, alpha, relevant_docs_vec, beta, irrelevant_docs_vec, gamma):
    # first display is with weight, second is the calculated vector with weights multiplied
    q_vec_display_1 = create_display_component(query_vec[0], alpha, False, False)
    q_vec_display_2 = create_display_component(create_weighted_vector(query_vec, alpha), 1, False, True)

    positive_vec_display_1 = create_display_component(relevant_docs_vec, beta, True, False)
    positive_vec_display_2 = create_display_component(create_weighted_vector(relevant_docs_vec, beta), 1, False, True)

    negative_vec_display_1 = create_display_component(irrelevant_docs_vec, gamma, True, False)
    negative_vec_display_2 = create_display_component(create_weighted_vector(irrelevant_docs_vec, gamma), 1, False, True)

    # calculate the total result
    result = create_display_component(np.round((alpha * query_vec[0] + create_weighted_vector(relevant_docs_vec, beta) - create_weighted_vector(irrelevant_docs_vec, gamma)), 2), 1, False, True)

    # add the strings together for display
    display_str = '\\vec{q}_m = ' + q_vec_display_1 + ' + ' + positive_vec_display_1 + ' - ' + negative_vec_display_1 + ' = ' + \
                  q_vec_display_2 + ' + ' + positive_vec_display_2 + ' - ' + negative_vec_display_2 + ' = ' + \
                  result

    return display_str

### Standard Rocchio

In [12]:
display(Math(create_relevance_display(query_vec, alpha, relevant_docs_vec, beta, irrelevant_docs_vec, gamma)))

<IPython.core.display.Math object>

### Ide* Regular

In [13]:
# by multiplying with the length we adjust to the raw weight
display(Math(create_relevance_display(query_vec, alpha, relevant_docs_vec, beta * len(relevant_docs_vec), irrelevant_docs_vec, gamma * len(irrelevant_docs_vec))))

<IPython.core.display.Math object>

### Ide "Dec Hi" Method

In [14]:
# create a tuple of (index of vector, sum of frequencies of vector) and select the one with the highest total frequency as most irrelevant
# (selection of most irrelevant depends on user, but for demonstration purposes we follow this strategy)
most_irrelevant_doc_index = max([(index, sum(irrelevant_docs_vec[index])) for index in range(len(irrelevant_docs_vec))], key=lambda x:x[1])[0]
most_irrelevant_doc = [irrelevant_docs_vec[most_irrelevant_doc_index]]

display(Math(create_relevance_display(query_vec, alpha, relevant_docs_vec, beta * len(relevant_docs_vec), most_irrelevant_doc, gamma)))

<IPython.core.display.Math object>