## Normalization

In [13]:
def min_max_normalize(arr):
    if not arr:  # Check if the array is empty
        return []

    min_val = min(arr)
    max_val = max(arr)
    
    if min_val == max_val:  # Check if all values in array are the same
        return [0] * len(arr)

    # Apply min-max normalization and round to 3 decimal places
    return [round((x - min_val) / (max_val - min_val), 3) for x in arr]


In [14]:
#example of use
arr = [2,12,7]
print(min_max_normalize(arr))

[0.0, 1.0, 0.5]


## Kmean clustering

In [15]:
import numpy as np

def euclidean_distance(point1, point2):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

def classify_points(points, centroids):
    """Classify points based on the closest centroid using K-means clustering.
    In case of equidistant points, choose the cluster with the lowest index.

    Args:
    points (list of lists): The points to classify.
    centroids (list of lists): The centroids to use for classification.

    Returns:
    list: A list of indices indicating the closest centroid for each point.
    """
    classifications = []
    for point in points:
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        min_distance = min(distances)
        # Find indices of all centroids that have the minimum distance
        min_indices = [i for i, d in enumerate(distances) if d == min_distance]
        # Choose the centroid with the lowest index among those
        classification = min(min_indices)
        classifications.append(classification)
    return classifications


In [16]:
points = [(3,1,0,4), (1,3,4.5,5), (6,3,2,0)]
centroids = [(3,5,4,4.5), (1.5,2.5,1,2),(2,3.5,1,0)]

# Classify the points
classifications = classify_points(points, centroids)

# Output the classification result
for point, classification in zip(points, classifications):
    print(f"Point {point} is classified to centroid {centroids[classification]}")


Point (3, 1, 0, 4) is classified to centroid (1.5, 2.5, 1, 2)
Point (1, 3, 4.5, 5) is classified to centroid (3, 5, 4, 4.5)
Point (6, 3, 2, 0) is classified to centroid (2, 3.5, 1, 0)


## Dirichlet smoothing

In [17]:
def dirichlet_smoothing(term_doc_matrix, term_index, doc_index, mu=6):

    # Total count of the term in the entire collection
    term_count_in_collection = sum(row[term_index] for row in term_doc_matrix)

    # Length of the document (sum of counts of all terms in the document)
    doc_length = sum(term_doc_matrix[doc_index])

    # Count of the term in the document
    term_count_in_doc = term_doc_matrix[doc_index][term_index]

    # Total count of all terms in the collection
    total_term_count = sum(sum(row) for row in term_doc_matrix)

    # Calculate the probability of the term in the empirical language model of the document
    probability = (term_count_in_doc + mu * (term_count_in_collection / total_term_count)) / (doc_length + mu)

    return probability

# Example usage
# term_doc_matrix = [[2, 3], [0, 1], [1, 0]] # Example term-document matrix
# term_index = 0  # Index of the term
# doc_index = 1  # Index of the document
# probability = dirichlet_smoothing(term_doc_matrix, term_index, doc_index)
# print(probability)

In [18]:
term_document_matrix = np.array([
    [1, 1, 2, 1],  # term1
    [0, 2, 0, 1],  # term2
    [2, 0, 1, 0],  # term3
    [4, 0, 1, 2],  # term4
    [1, 2, 1, 0]   # term5
])

# Parameters
term_index = 4
doc_index = 0
mu = 6

# Probability of term5 in doc1 with Dirichlet smoothing
prob = dirichlet_smoothing(term_document_matrix, term_index, doc_index, mu)

print(f"Probability of term5 in doc1 is {prob:.3f}")

IndexError: index 4 is out of bounds for axis 0 with size 4

In [None]:
import numpy as np

# Term-document matrix as provided
term_document_matrix = np.array([
    [1, 1, 2, 1],  # term1
    [0, 2, 0, 1],  # term2
    [2, 0, 1, 0],  # term3
    [4, 0, 1, 2],  # term4
    [1, 2, 1, 0]   # term5
])

# Dirichlet smoothing parameter
mu = 6

# Calculate the total number of words in each document
doc_lengths = term_document_matrix.sum(axis=0)

# Calculate the total count of each term in the collection (sum over all documents)
term_frequencies = term_document_matrix.sum(axis=1)

# Total number of words in the collection
collection_length = term_frequencies.sum()

# Calculate the probability of each term in the collection
term_prob_collection = term_frequencies / collection_length

# Function to calculate the smoothed probability of a term in a document
def dirichlet_smoothed_probability(term_idx, doc_idx, term_document_matrix, mu, term_prob_collection):
    term_count = term_document_matrix[term_idx, doc_idx]
    doc_length = doc_lengths[doc_idx]
    prob_term_collection = term_prob_collection[term_idx]
    return (term_count + mu * prob_term_collection) / (doc_length + mu)

# Calculate the probability of term5 in the empirical language model of doc1
prob_term5_doc1 = dirichlet_smoothed_probability(4, 0, term_document_matrix, mu, term_prob_collection)

# Calculate the probability of term4 in the background language model (collection)
prob_term4_collection = term_prob_collection[3]

# Calculate the probability of term2 in the smoothed language model of doc3
prob_term2_doc3 = dirichlet_smoothed_probability(1, 2, term_document_matrix, mu, term_prob_collection)

# Calculate the smoothed probabilities for all terms in doc2 for finding the term with lowest probability
prob_terms_doc2 = [dirichlet_smoothed_probability(term_idx, 1, term_document_matrix, mu, term_prob_collection) for term_idx in range(term_document_matrix.shape[0])]
lowest_prob_term_doc2 = np.argmin(prob_terms_doc2) + 1 # adding 1 to match term numbering

# Function to calculate the score of a document for a given query
def document_score(query_terms, doc_idx, term_document_matrix, mu, term_prob_collection):
    score = 1
    for term in query_terms:
        term_idx = int(term[-1]) - 1  # Convert term to index (e.g., term1 to 0)
        score *= dirichlet_smoothed_probability(term_idx, doc_idx, term_document_matrix, mu, term_prob_collection)
    return score

# Calculate the scores for each document for the given query
query = ["term1", "term3", "term5"]
scores = [document_score(query, doc_idx, term_document_matrix, mu, term_prob_collection) for doc_idx in range(term_document_matrix.shape[1])]
top_scoring_doc = np.argmax(scores) + 1 # adding 1 to match document numbering

#print with 3 decimal places (prob_term5_doc1, prob_term4_collection, prob_term2_doc3, lowest_prob_term_doc2, top_scoring_doc)
print(f"Probability of term5 in doc1 is {prob_term5_doc1:.3f}")
print(f"Probability of term4 in the collection is {prob_term4_collection:.3f}")
print(f"Probability of term2 in doc3 is {prob_term2_doc3:.3f}")
print(f"Term with lowest probability in doc2 is term{lowest_prob_term_doc2}")
print(f"Document with highest score is doc{top_scoring_doc}")


Probability of term5 in doc1 is 0.149
Probability of term4 in the collection is 0.318
Probability of term2 in doc3 is 0.074
Term with lowest probability in doc2 is term3
Document with highest score is doc3


## Evaluation

In [22]:
import math

def dcg_at_value(ranked_documents, ground_truth, value):
    dcg = ground_truth.get(ranked_documents[0], 0)
    for i, doc_id in enumerate(ranked_documents[1:value], start=2):
        relevance_score = ground_truth.get(doc_id, 0)
        dcg += relevance_score / math.log2(i)  # +2 because we start counting ranks from 1 and log base 2
    return dcg

def ndcg_at_k(ranked_documents, ground_truth, k):
    dcg_max = dcg_at_value(sorted(ground_truth, key=ground_truth.get, reverse=True), ground_truth, k)
    if not dcg_max:
        return 0
    return dcg_at_value(ranked_documents, ground_truth, k) / dcg_max


ground_truth_scores = {1: 3, 
                       2: 2, 
                       3: 1, 
                       7: 3}  # Mapping from the document ID to its score

# Rankings from the image
system_a_ranking = [10, 7, 9, 8, 2, 1, 3, 4, 5, 6]
system_b_ranking = [3, 2, 1, 4, 5, 7, 8, 10, 9, 6]

# Calculate DCG@5 for both systems
dcg_at_5_system_a = dcg_at_value(system_a_ranking, ground_truth_scores, 5)
dcg_at_5_system_b = dcg_at_value(system_b_ranking, ground_truth_scores, 5)

print(f"DCG@5 for system A: {dcg_at_5_system_a:.3f}")
print(f"DCG@5 for system B: {dcg_at_5_system_b:.3f}")

# calculate ndcg@10 for both systems
ndcg_at_10_system_a = ndcg_at_k(system_a_ranking, ground_truth_scores, 10)
ndcg_at_10_system_b = ndcg_at_k(system_b_ranking, ground_truth_scores, 10)

print(f"NDCG@10 for system A: {ndcg_at_10_system_a:.3f}")
print(f"NDCG@10 for system B: {ndcg_at_10_system_b:.3f}")



DCG@5 for system A: 3.861
DCG@5 for system B: 4.893
NDCG@10 for system A: 0.693
NDCG@10 for system B: 0.780
