# Language Model

In [2]:
import numpy as np

# Given term-document matrix.
# Rows are terms and columns are documents: the rest of this file indexes it
# as [term_index, doc_index] and reports both with 1-based labels
# (row 0 -> term1, column 0 -> doc1). Entries are used as term counts.
term_document_matrix = np.array([
    [1, 1, 2, 1],  # term1
    [0, 2, 0, 1],  # term2
    [2, 0, 1, 0],  # term3
    [4, 0, 1, 2],  # term4
    [1, 2, 1, 0]  # term5
])

In [5]:
# Per-document totals: summing over axis 0 collapses the term rows and leaves
# one total count per document column.
doc_length = np.sum(term_document_matrix, axis=0)
# Total count over every term and every document in the collection.
collection_length = np.sum(term_document_matrix)
# Smoothing parameter: weight given to the background (collection) model when
# it is mixed into a document model in smoothed_prob.
mu = 6

def empirical_prob(term_index, doc_index):
    """Return the probability of a term under a document's empirical model.

    This is the term's count in the document divided by the document's total
    count. Both indices are 0-based positions into ``term_document_matrix``.
    A document whose total count is zero is not special-cased.
    """
    count_in_doc = term_document_matrix[term_index, doc_index]
    tokens_in_doc = doc_length[doc_index]
    return count_in_doc / tokens_in_doc

def background_prob(term_index):
    """Return the probability of a term under the background model.

    The background model is estimated from the entire collection: the term's
    count summed over all documents, divided by the collection's total count.
    ``term_index`` is a 0-based row of ``term_document_matrix``.
    """
    count_in_collection = np.sum(term_document_matrix[term_index])
    return count_in_collection / collection_length

def smoothed_prob(term_index, doc_index):
    """Return the probability of a term under a document's smoothed model.

    The document's term count is topped up with ``mu`` pseudo-counts spread
    according to the background model, and the document length is extended by
    the same ``mu``:

        (count(t, d) + mu * P(t | C)) / (|d| + mu)

    Both indices are 0-based positions into ``term_document_matrix``.
    """
    pseudo_count = mu * background_prob(term_index)
    numerator = term_document_matrix[term_index, doc_index] + pseudo_count
    denominator = doc_length[doc_index] + mu
    return numerator / denominator

def score_doc(query_indices, doc_index):
    """Return the score of a document for a given query.

    The score is the product of the smoothed probabilities of the query terms
    in the document. ``query_indices`` is an iterable of 0-based term indices;
    a term listed more than once contributes one factor per occurrence. An
    empty query yields 1.
    """
    term_probs = [smoothed_prob(term, doc_index) for term in query_indices]
    likelihood = 1
    for prob in term_probs:
        likelihood = likelihood * prob
    return likelihood

# Worked examples.
#
# Every printed label is derived from the 0-based matrix index it reports on
# (label = index + 1, the same convention used for the argmin/argmax answers),
# so the text cannot drift away from the numbers.
#
# NOTE(review): the labels used to be hard-coded as "t5 | d1", "t4 | C",
# "t2 | Theta_d3" and "term1 term3 term5", none of which matched the indices
# actually passed in. The indices, and therefore the numeric results, are kept
# exactly as they were. If the old labels described the intended questions,
# change the indices below instead. The result variable names are left
# unchanged in case other cells refer to them.

# Unsmoothed (empirical) probability of one term in one document.
empirical_term, empirical_doc = 1, 1
prob_term5_empirical = empirical_prob(empirical_term, empirical_doc)
print(f"P(t{empirical_term + 1} | d{empirical_doc + 1}) = {prob_term5_empirical:.3f}")

# Probability of one term in the background (collection) model.
background_term = 4
prob_term4_background = background_prob(background_term)
print(f"P(t{background_term + 1} | C) = {prob_term4_background:.3f}")

# Smoothed probability of one term in one document.
smoothed_term, smoothed_doc = 0, 3
prob_term2_smoothed_doc3 = smoothed_prob(smoothed_term, smoothed_doc)
print(f"P(t{smoothed_term + 1} | Theta_d{smoothed_doc + 1}) = {prob_term2_smoothed_doc3:.3f}")

# Term with the lowest smoothed probability in one document's language model.
lowest_prob_doc = 1
smoothed_probs_doc2 = [
    smoothed_prob(i, lowest_prob_doc) for i in range(term_document_matrix.shape[0])
]
term_lowest_prob_doc2 = np.argmin(smoothed_probs_doc2)
print(f"Lowest smoothed prob in LM doc{lowest_prob_doc + 1} is term{term_lowest_prob_doc2+1}")

# Rank all documents for a query and report the best one.
query_indices = [4, 1]
query_label = " ".join(f"term{i + 1}" for i in query_indices)
scores = [score_doc(query_indices, i) for i in range(term_document_matrix.shape[1])]
top_scoring_doc = np.argmax(scores)
print(f'Top scoring document for query "{query_label}" is doc{top_scoring_doc + 1}')

P(t5 | d1) = 0.400
P(t4 | C) = 0.182
P(t2 | Theta_d3) = 0.236
Lowest smoothed prob in LM doc2 is term3
Top scoring document for query "term1 term3 term5" is doc2
