# Statistical Analysis

## Example Corpus

In [1]:
# Toy corpus: every document is a list of already tokenised terms.
document_1 = "lorem ipsum fill".split()
document_2 = "fill water water".split()
document_3 = "water lorem lorem".split()
document_4 = "ipsum fill".split()
corpus = [document_1, document_2, document_3, document_4]

In [2]:
# Vocabulary in first-seen order, without duplicates.
# A set() of strings iterates in a hash-dependent order that can change between kernel
# restarts; that would reshuffle the matrix columns and change which term wins a tie
# in the query expansion further down. dict.fromkeys keeps insertion order instead.
words = list(dict.fromkeys(word for document in corpus for word in document))
words_list = list(words)

## Creating our base weights as raw frequency

In [3]:
import pandas as pd
import numpy as np

In [4]:
# One raw-frequency vector per document, one entry per vocabulary term.
# The counts are taken in words_list order because the same list labels the columns
# below; this keeps values and labels aligned by construction.
vectors = [np.array([document.count(word) for word in words_list]) for document in corpus]

# term-document matrix (rows: documents, columns: terms)
df_vectors = pd.DataFrame(
    vectors,
    index=["document_" + str(number) for number in range(1, len(corpus) + 1)],
    columns=words_list,
)
df_vectors

Unnamed: 0,lorem,ipsum,fill,water
document_1,1,1,1,0
document_2,0,0,1,2
document_3,2,0,0,1
document_4,0,1,1,0


## Creating an association matrix

As a reminder: $c_{ij} = \sum_{d_k \in D} f_{ik} \times f_{jk}$, where $f_{ik}$ is the frequency of term $i$ in document $d_k$ and $D$ is the set of all documents.

In [5]:
def create_association_matrix(df, words):
    """Compute the term-term association matrix of a term-document table.

    Entry (i, j) is the scalar product of the i-th and j-th column of ``df``,
    i.e. the sum over all rows (documents) of ``f_ik * f_jk``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document and one column per term. Only the first
        ``len(words)`` columns are used.
    words : sized collection
        The vocabulary; only its length is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(words), len(words))``.

    Raises
    ------
    IndexError
        If ``df`` has fewer columns than ``words`` has entries.
    """
    term_count = len(words)
    if term_count > df.shape[1]:
        raise IndexError(
            "df has " + str(df.shape[1]) + " columns but " + str(term_count) + " words were given"
        )

    # All pairwise column products at once: (terms x documents) . (documents x terms).
    frequencies = df.iloc[:, :term_count].to_numpy()
    return np.dot(frequencies.T, frequencies).astype(float)

We normalize each entry as follows: $s_{ij} = \frac{c_{ij}}{c_{ii} + c_{jj} - c_{ij}}$

In [6]:
def normalize_association_matrix(matrix):
    """Normalize an association matrix entry-wise.

    Each entry becomes ``s_ij = c_ij / (c_ii + c_jj - c_ij)``.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square association matrix ``c``.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as ``matrix``. Entries whose
        denominator is zero (e.g. for a term that occurs in no document)
        are set to 0 instead of producing NaN.
    """
    association = np.asarray(matrix, dtype=float)
    self_association = np.diagonal(association)

    # denominator[i, j] = c_ii + c_jj - c_ij, built by broadcasting the diagonal
    denominator = self_association[:, np.newaxis] + self_association[np.newaxis, :] - association

    normalized_matrix = np.zeros(association.shape)
    # Divide only where it is defined; the remaining entries keep their initial 0.
    np.divide(association, denominator, out=normalized_matrix, where=denominator != 0)

    return normalized_matrix

Visualising the results as tables:

In [7]:
# Raw association counts, labelled with the vocabulary on both axes.
association_matrix = create_association_matrix(df_vectors, words_list)

df_association = pd.DataFrame(association_matrix, index=words_list, columns=words_list)
df_association

Unnamed: 0,lorem,ipsum,fill,water
lorem,5.0,1.0,1.0,2.0
ipsum,1.0,2.0,2.0,0.0
fill,1.0,2.0,3.0,2.0
water,2.0,0.0,2.0,5.0


In [8]:
# Normalized associations, rounded to two decimals for display.
normalized_association_matrix = normalize_association_matrix(association_matrix).round(2)

df_association_normalized = pd.DataFrame(
    normalized_association_matrix,
    index=words_list,
    columns=words_list,
)
df_association_normalized

Unnamed: 0,lorem,ipsum,fill,water
lorem,1.0,0.17,0.14,0.25
ipsum,0.17,1.0,0.67,0.0
fill,0.14,0.67,1.0,0.33
water,0.25,0.0,0.33,1.0


## Evaluate a query based on the found associations

Get the "best" term regarding a given word:

In [9]:
def get_highest_associated_term(df, word, words):
    """Return the vocabulary term with the highest association to ``word``.

    The association of ``word`` with itself is ignored. If several terms share
    the highest value, the one that comes first in ``words`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Association matrix whose rows and columns follow the order of ``words``.
    word : str
        Term to look up; must be contained in ``words``.
    words : list
        The vocabulary.
    """
    own_position = words.index(word)

    # association values of `word` with every other term (own entry removed)
    scores = df.iloc[own_position, :].tolist()
    scores.pop(own_position)

    # position of the first maximum within the shortened list
    best_offset, _ = max(enumerate(scores), key=lambda pair: pair[1])

    # positions at or after the removed entry are shifted by one in the vocabulary
    best_position = best_offset if best_offset < own_position else best_offset + 1
    return words[best_position]

The simple evaluation adds the "best" term for each word in the query:

In [10]:
def evaluate_simple(df, query, words):
    """Expand ``query`` with the most strongly associated term of each query word.

    For every word in ``query`` (in order) the term returned by
    ``get_highest_associated_term`` is appended, so the result may contain
    repeated terms. ``query`` itself is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Association matrix whose rows and columns follow the order of ``words``.
    query : list
        Words of the original query.
    words : list
        The vocabulary.

    Returns
    -------
    list
        A copy of ``query`` followed by one added term per query word.
    """
    expanded_query = query.copy()
    expanded_query.extend(get_highest_associated_term(df, term, words) for term in query)
    return expanded_query

The advanced evaluation adds a term to the query only if the sum of its associations with all words in the query reaches a given threshold:

In [11]:
def evaluate_advanced(df, query, words, threshold):
    """Expand ``query`` with every term whose summed association reaches ``threshold``.

    For each vocabulary term that is not part of the query, its association
    values with all query words are added up; the term is appended if that sum
    is greater than or equal to ``threshold``. ``query`` itself is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Association matrix whose rows and columns follow the order of ``words``.
    query : list
        Words of the original query; each must be contained in ``words``.
    words : list
        The vocabulary.
    threshold : float
        Minimum summed association a term needs in order to be added.

    Returns
    -------
    list
        A copy of ``query`` followed by the added terms in vocabulary order.
    """
    # Vocabulary positions of the query words, looked up once instead of per matrix cell.
    query_positions = [words.index(term) for term in query]
    excluded_positions = set(query_positions)

    expanded_query = query.copy()
    for position, candidate in enumerate(words):
        if position in excluded_positions:
            continue  # query words are never added again

        # summed similarity of the candidate to all query words
        row = df.iloc[position, :].tolist()
        summed_similarity = sum(row[column] for column in query_positions)
        if summed_similarity >= threshold:
            expanded_query.append(candidate)

    return expanded_query

Simple showcase of the evaluations:

In [12]:
query = ["ipsum", "water"]

# The threshold could be chosen in a more sophisticated way; here it is 90% of the
# largest entry of the respective matrix (4.5 for the raw counts with maximum 5,
# 0.9 for the normalized matrix with maximum 1).
threshold_ratio = 0.9

showcases = [
    ("Simple association: ", df_association),
    ("Normalized association: ", df_association_normalized),
]

for position, (title, df_showcase) in enumerate(showcases):
    if position > 0:
        print("-------------------------------------------")

    threshold = threshold_ratio * df_showcase.to_numpy().max()
    print(title)
    print("Simple: " + str(evaluate_simple(df_showcase, query, words_list)))
    print("Advanced: " + str(evaluate_advanced(df_showcase, query, words_list, threshold)))

Simple association: 
Simple: ['ipsum', 'water', 'fill', 'lorem']
Advanced: ['ipsum', 'water']
-------------------------------------------
Normalized association: 
Simple: ['ipsum', 'water', 'fill', 'fill']
Advanced: ['ipsum', 'water', 'fill']
