## Normalization

In [1]:
def min_max_normalize(arr):
    if not arr:  # Check if the array is empty
        return []

    min_val = min(arr)
    max_val = max(arr)
    
    if min_val == max_val:  # Check if all values in array are the same
        return [0] * len(arr)

    # Apply min-max normalization and round to 3 decimal places
    return [round((x - min_val) / (max_val - min_val), 3) for x in arr]


In [2]:
#example of use
arr = [2,12,7]
print(min_max_normalize(arr))

[0.0, 1.0, 0.5]


## Kmean clustering

In [3]:
import numpy as np

def euclidean_distance(point1, point2):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

def classify_points(points, centroids):
    """Classify points based on the closest centroid using K-means clustering.
    In case of equidistant points, choose the cluster with the lowest index.

    Args:
    points (list of lists): The points to classify.
    centroids (list of lists): The centroids to use for classification.

    Returns:
    list: A list of indices indicating the closest centroid for each point.
    """
    classifications = []
    for point in points:
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        min_distance = min(distances)
        # Find indices of all centroids that have the minimum distance
        min_indices = [i for i, d in enumerate(distances) if d == min_distance]
        # Choose the centroid with the lowest index among those
        classification = min(min_indices)
        classifications.append(classification)
    return classifications


In [4]:
points = [(3,1,0,4), (1,3,4.5,5), (6,3,2,0)]
centroids = [(3,5,4,4.5), (1.5,2.5,1,2),(2,3.5,1,0)]

# Classify the points
classifications = classify_points(points, centroids)

# Output the classification result
for point, classification in zip(points, classifications):
    print(f"Point {point} is classified to centroid {centroids[classification]}")


Point (3, 1, 0, 4) is classified to centroid (1.5, 2.5, 1, 2)
Point (1, 3, 4.5, 5) is classified to centroid (3, 5, 4, 4.5)
Point (6, 3, 2, 0) is classified to centroid (2, 3.5, 1, 0)


## Dirichlet smoothing

In [None]:
import numpy as np

# Term-document matrix as provided
term_document_matrix = np.array([
    [1, 1, 2, 1],  # term1
    [0, 2, 0, 1],  # term2
    [2, 0, 1, 0],  # term3
    [4, 0, 1, 2],  # term4
    [1, 2, 1, 0]   # term5
])

# Dirichlet smoothing parameter
mu = 6

# Calculate the total number of words in each document
doc_lengths = term_document_matrix.sum(axis=0)

# Calculate the total count of each term in the collection (sum over all documents)
term_frequencies = term_document_matrix.sum(axis=1)

# Total number of words in the collection
collection_length = term_frequencies.sum()

# Calculate the probability of each term in the collection
term_prob_collection = term_frequencies / collection_length

# Function to calculate the smoothed probability of a term in a document
def dirichlet_smoothed_probability(term_idx, doc_idx, term_document_matrix, mu, term_prob_collection):
    term_count = term_document_matrix[term_idx, doc_idx]
    doc_length = doc_lengths[doc_idx]
    prob_term_collection = term_prob_collection[term_idx]
    return (term_count + mu * prob_term_collection) / (doc_length + mu)

# Calculate the probability of term5 in the empirical language model of doc1
prob_term5_doc1 = dirichlet_smoothed_probability(4, 0, term_document_matrix, mu, term_prob_collection)

# Calculate the probability of term4 in the background language model (collection)
prob_term4_collection = term_prob_collection[3]

# Calculate the probability of term2 in the smoothed language model of doc3
prob_term2_doc3 = dirichlet_smoothed_probability(1, 2, term_document_matrix, mu, term_prob_collection)

# Calculate the smoothed probabilities for all terms in doc2 for finding the term with lowest probability
prob_terms_doc2 = [dirichlet_smoothed_probability(term_idx, 1, term_document_matrix, mu, term_prob_collection) for term_idx in range(term_document_matrix.shape[0])]
lowest_prob_term_doc2 = np.argmin(prob_terms_doc2) + 1 # adding 1 to match term numbering

# Function to calculate the score of a document for a given query
def document_score(query_terms, doc_idx, term_document_matrix, mu, term_prob_collection):
    score = 1
    for term in query_terms:
        term_idx = int(term[-1]) - 1  # Convert term to index (e.g., term1 to 0)
        score *= dirichlet_smoothed_probability(term_idx, doc_idx, term_document_matrix, mu, term_prob_collection)
    return score

# Calculate the scores for each document for the given query
query = ["term1", "term3", "term5"]
scores = [document_score(query, doc_idx, term_document_matrix, mu, term_prob_collection) for doc_idx in range(term_document_matrix.shape[1])]
top_scoring_doc = np.argmax(scores) + 1 # adding 1 to match document numbering

#print with 3 decimal places (prob_term5_doc1, prob_term4_collection, prob_term2_doc3, lowest_prob_term_doc2, top_scoring_doc)
print(f"Probability of term5 in doc1 is {prob_term5_doc1:.3f}")
print(f"Probability of term4 in the collection is {prob_term4_collection:.3f}")
print(f"Probability of term2 in doc3 is {prob_term2_doc3:.3f}")
print(f"Term with lowest probability in doc2 is term{lowest_prob_term_doc2}")
print(f"Document with highest score is doc{top_scoring_doc}")


Probability of term5 in doc1 is 0.149
Probability of term4 in the collection is 0.318
Probability of term2 in doc3 is 0.074
Term with lowest probability in doc2 is term3
Document with highest score is doc3


## Evaluation

In [8]:
import pandas as pd
from sklearn.metrics import confusion_matrix

def calculate_scores(csv_path):
    # Read the CSV file
    data = pd.read_csv(csv_path)
    y_true = data['actual']  # Replace 'actual' with the column name for actual labels
    y_pred = data['predicted']  # Replace 'predicted' with the column name for predicted labels

    # Calculate TP, FP, FN, and TN
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # Calculate precision, recall, F1, and F2 scores
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    f2_score = 5 * (precision * recall) / (4 * precision + recall) if (4 * precision + recall) != 0 else 0

    return precision, recall, f1_score, f2_score

# Usage
# precision, recall, f1_score, f2_score = calculate_scores('path_to_csv.csv')


In [9]:
calculate_scores("results.csv")

(0.5, 0.75, 0.6, 0.6818181818181818)

In [None]:
import math

def dcg_at_value(ranked_documents, ground_truth, value):
    dcg = ground_truth.get(ranked_documents[0], 0)
    for i, doc_id in enumerate(ranked_documents[1:value], start=2):
        relevance_score = ground_truth.get(doc_id, 0)
        dcg += relevance_score / math.log2(i)  # +2 because we start counting ranks from 1 and log base 2
    return dcg

def ndcg_at_k(ranked_documents, ground_truth, k):
    dcg_max = dcg_at_value(sorted(ground_truth, key=ground_truth.get, reverse=True), ground_truth, k)
    if not dcg_max:
        return 0
    return dcg_at_value(ranked_documents, ground_truth, k) / dcg_max


ground_truth_scores = {1: 3, 
                       2: 2, 
                       3: 1, 
                       7: 3}  # Mapping from the document ID to its score

# Rankings from the image
system_a_ranking = [10, 7, 9, 8, 2, 1, 3, 4, 5, 6]
system_b_ranking = [3, 2, 1, 4, 5, 7, 8, 10, 9, 6]

# Calculate DCG@5 for both systems
dcg_at_5_system_a = dcg_at_value(system_a_ranking, ground_truth_scores, 5)
dcg_at_5_system_b = dcg_at_value(system_b_ranking, ground_truth_scores, 5)

print(f"DCG@5 for system A: {dcg_at_5_system_a:.3f}")
print(f"DCG@5 for system B: {dcg_at_5_system_b:.3f}")

# calculate ndcg@10 for both systems
ndcg_at_10_system_a = ndcg_at_k(system_a_ranking, ground_truth_scores, 10)
ndcg_at_10_system_b = ndcg_at_k(system_b_ranking, ground_truth_scores, 10)

print(f"NDCG@10 for system A: {ndcg_at_10_system_a:.3f}")
print(f"NDCG@10 for system B: {ndcg_at_10_system_b:.3f}")



DCG@5 for system A: 3.861
DCG@5 for system B: 4.893
NDCG@10 for system A: 0.693
NDCG@10 for system B: 0.780


## Bayes

In [2]:
import pandas as pd
import numpy as np

# Function to calculate the probability P(t|c) given a term-document matrix and a class label
def calculate_probability(df, term, given_class):
    # Count the number of occurrences of the term in documents of the given class
    n_t_c = df[df['class'] == given_class][term].sum()
    
    # Count the total number of terms in documents of the given class
    total_terms = df[df['class'] == given_class].drop(columns='class').sum().sum()
    
    # Count the number of unique classes
    num_classes = df['class'].nunique()
    
    # Calculate the probability using the given formula
    probability = (n_t_c + 1) / (total_terms + num_classes)
    return probability

# Creating a dataframe from the data in the image
data = {
    't1': [2, 0, 3, 4, 1, 0],
    't2': [0, 0, 4, 0, 0, 1],
    't3': [1, 0, 0, 3, 0, 1],
    't4': [2, 0, 2, 1, 3, 0],
    't5': [0, 3, 0, 1, 1, 3],
    't6': [2, 2, 0, 1, 2, 4],
    't7': [4, 2, 2, 0, 0, 1],
    'class': ['C1', 'C3', 'C2', 'C3', 'C2', 'C1']
}

# Converting dictionary to DataFrame
df = pd.DataFrame(data)

# Example calculation for term 't1' and class 'C1'
calculate_probability(df, 't4', 'C2')


0.2857142857142857

In [5]:
import pandas as pd
from fractions import Fraction

# Define the document-term matrix and the classes
data = {
    't1': [2, 0, 3, 4, 1, 0],
    't2': [0, 0, 4, 0, 0, 1],
    't3': [1, 0, 0, 3, 0, 1],
    't4': [2, 0, 2, 1, 3, 0],
    't5': [0, 3, 0, 1, 1, 3],
    't6': [2, 2, 0, 1, 2, 4],
    't7': [4, 2, 2, 0, 0, 1],
    'class': ['C1', 'C3', 'C2', 'C3', 'C2', 'C1']
}
# Create a DataFrame
df = pd.DataFrame(data)

# Count the number of documents per class
class_counts = df['class'].value_counts()

# Number of classes
num_classes = len(class_counts)

# Prior class probability for C2
prior_C2 = Fraction(class_counts['C2'], sum(class_counts))

# Calculate the term frequency for each term in each class
term_frequencies = df.groupby('class').sum()

# Total number of terms in class C2
total_terms_C2 = term_frequencies.loc['C2'].sum()

# Smoothed probability of term "t4" belonging to C2 with Laplace smoothing
prob_t4_C2 = Fraction(term_frequencies.at['C2', 't4'] + 1, total_terms_C2 + num_classes)

# For a new document "t1", calculate the probability of belonging to C1
# Count of t1 in C1 + 1 / Total count in C1 + Number of terms
prob_t1_C1 = Fraction(term_frequencies.at['C1', 't1'] + 1, term_frequencies.loc['C1'].sum() + num_classes)

# For a new document "t1 t4 t5", calculate the probability of belonging to C3
# We'll calculate the individual probabilities and later multiply them
prob_t1_C3 = Fraction(term_frequencies.at['C3', 't1'] + 1, term_frequencies.loc['C3'].sum() + num_classes)
prob_t4_C3 = Fraction(term_frequencies.at['C3', 't4'] + 1, term_frequencies.loc['C3'].sum() + num_classes)
prob_t5_C3 = Fraction(term_frequencies.at['C3', 't5'] + 1, term_frequencies.loc['C3'].sum() + num_classes)
prob_t1t4t5_C3 = prob_t1_C3 * prob_t4_C3 * prob_t5_C3

# Calculate the class probability of document "t4 t5" for each class
# Initialize a dictionary to store probabilities for "t4 t5" for each class
class_probabilities = {}

for c in class_counts.index:
    prob_t4_c = Fraction(term_frequencies.at[c, 't4'] + 1, term_frequencies.loc[c].sum() + num_classes)
    prob_t5_c = Fraction(term_frequencies.at[c, 't5'] + 1, term_frequencies.loc[c].sum() + num_classes)
    # Multiply by the prior for the class
    class_probabilities[c] = prob_t4_c * prob_t5_c * Fraction(class_counts[c], sum(class_counts))

# The classification of "t4 t5" should be the class with the highest probability
classification_t4_t5 = max(class_probabilities, key=class_probabilities.get)

prior_C2, prob_t4_C2, prob_t1_C1, prob_t1t4t5_C3, classification_t4_t5, class_probabilities


(Fraction(1, 3),
 Fraction(2, 7),
 Fraction(1, 8),
 Fraction(1, 160),
 'C2',
 {'C1': Fraction(1, 144), 'C3': Fraction(1, 120), 'C2': Fraction(4, 441)})

## Entity retrival

## Stemmer

In [10]:
import nltk
from nltk.stem.porter import PorterStemmer

# Placeholder for Krovetz stemmer; this would require a more complex implementation or a specialized library.
def krovetz_stemmer(word):
    # Implement Krovetz stemming logic here or use a library that provides this stemmer.
    return word

# Simple Suffix-S Stemmer
def suffix_s_stemmer(word):
    if word.endswith('s'):
        return word[:-1]
    return word

def apply_stemming(word, stemmer_type):
    stemmer = None

    if stemmer_type == 'porter':
        stemmer = PorterStemmer()
        return stemmer.stem(word)
    elif stemmer_type == 'krovetz':
        return krovetz_stemmer(word)
    elif stemmer_type == 'suffix_s':
        return suffix_s_stemmer(word)
    else:
        raise ValueError("Invalid stemmer type. Choose 'porter', 'krovetz', or 'suffix_s'.")

# Example usage
word = "running"
for stemmer_type in ['porter', 'krovetz', 'suffix_s']:
    stemmed_word = apply_stemming(word, stemmer_type)
    print(f"{stemmer_type.capitalize()} Stemmer: {stemmed_word}")


Porter Stemmer: run
Krovetz Stemmer: running
Suffix_s Stemmer: running


In [11]:
string = "Two faders and two sons went fishing"

apply_stemming(string, 'porter')

'two faders and two sons went fish'

## indexing

In [12]:
import re
from collections import defaultdict

# Define the documents
docs = {
    1: "The old man and his two sons went fishing.",
    2: "Recreational fishing is an activity with important social implications.",
    3: "Introduction to social protection benefits for old age.",
    4: "Introduction to how lake trout fishing works."
}

# Define the stopwords
stopwords = set([
    "an", "and", "are", "for", "how", "in", "is", "not", "or", "the", "these", "this", "to", "with"
])

# Function to preprocess the documents
def preprocess(text):
    # Tokenize by word
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Stemming: remove trailing 's' character
    tokens = [re.sub(r's$', '', token) for token in tokens]
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords]
    return tokens

# Create the inverted index
inverted_index = defaultdict(lambda: defaultdict(int))

for doc_id, text in docs.items():
    tokens = preprocess(text)
    for token in tokens:
        inverted_index[token][doc_id] += 1

# Convert the inverted index into a posting list format
posting_lists = []
for token, doc_freqs in inverted_index.items():
    posting_list = f"{token} ->" + ''.join([f" d{doc_id}:{freq}" for doc_id, freq in doc_freqs.items()])
    posting_lists.append(posting_list)

posting_lists.sort()
posting_lists


['activity -> d2:1',
 'age -> d3:1',
 'benefit -> d3:1',
 'fishing -> d1:1 d2:1 d4:1',
 'hi -> d1:1',
 'i -> d2:1',
 'implication -> d2:1',
 'important -> d2:1',
 'introduction -> d3:1 d4:1',
 'lake -> d4:1',
 'man -> d1:1',
 'old -> d1:1 d3:1',
 'protection -> d3:1',
 'recreational -> d2:1',
 'social -> d2:1 d3:1',
 'son -> d1:1',
 'trout -> d4:1',
 'two -> d1:1',
 'went -> d1:1',
 'work -> d4:1']

## text similarity

In [15]:
#jaccard
# Here is the code from the image transcribed into executable Python code.

def text_similarity(doc_1: str, doc_2: str) -> float:
    set1, set2 = set(doc_1.split()), set(doc_2.split())
    return len(set1.intersection(set2)) / len(set1.union(set2))

# Example usage of the function with two hypothetical documents
doc1 = "this is some piece of text"
doc2 = "slightly different piece of text"

# Calculate the similarity between doc1 and doc2
similarity_score = text_similarity(doc1, doc2)
similarity_score


0.375

## bm25

In [26]:
import math

# Constants from the image
N = 1000  # Total number of documents
avgdl = 50  # Average document length
k1 = 1.25
b = 0.8

# Term frequency table from the image (here using only the available data)
term_frequencies = {
    'T1': {'Doc1': 3, 'Doc2': 4, 'Collection': 100},
    'T2': {'Doc1': 0, 'Doc2': 3, 'Collection': 50},
    'T3': {'Doc1': 2, 'Doc2': 3, 'Collection': 80},
    'T4': {'Doc1': 1, 'Doc2': 2, 'Collection': 93},
    'T5': {'Doc1': 10, 'Doc2': 1, 'Collection': 100},
    'T6': {'Doc1': 5, 'Doc2': 7, 'Collection': 25},
}

# Document lengths from the image
doc_lengths = {
    'Doc1': 21,
    'Doc2': 20
}

# Function to calculate BM25 for a single term in a document
def bm25(doc_id, term_id):
    # Extract the necessary information for the calculations
    ft_d = term_frequencies[term_id].get(doc_id, 0)  # frequency of term in document
    nt = term_frequencies[term_id]['Collection']  # number of documents containing the term
    doc_length = doc_lengths[doc_id]  # length of the document

    # Calculate idft using base-10 logarithm
    idft = math.log10(N / nt)

    # Calculate the BM25 score
    numerator = ft_d * (1 + k1)
    denominator = ft_d + k1 * (1 - b + b * (doc_length / avgdl))
    score = idft * (numerator / denominator)
    
    return score

# Example usage of the function
# Get the BM25 score for Term 1 in Document 1
bm25_score = bm25('Doc1', 'T5')
bm25_score


2.1087160262417997

In [23]:
multi = bm25('Doc1', 'T2')+bm25('Doc1', 'T2')+bm25('Doc1', 'T5')
print(multi)

2.1087160262417997


In [24]:
multi = bm25('Doc2', 'T2')+bm25('Doc2', 'T2')+bm25('Doc2', 'T5')
print(multi)

6.175665114722323
