In [1]:
import os
import math
import re
from collections import defaultdict, Counter


# Preprocessing: normalize, case folding, remove punctuation, and tokenize
def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased, every run of non-word characters is collapsed
    into a single space, and the result is split on whitespace.

    Note that ``\\W`` does not match the underscore, so ``snake_case`` stays a
    single token, while an apostrophe splits ``it's`` into ``it`` and ``s``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-case tokens (empty if the text has no word characters).
    """
    lowered = text.lower()
    return re.sub(r"\W+", " ", lowered).split()


# Build the inverted index with document frequencies and term frequencies
def build_index(corpus_dir):
    """Build a log-tf inverted index over every file in ``corpus_dir``.

    Documents are numbered from 1 in sorted file-name order, so the same
    corpus always yields the same docIDs. Entries of ``corpus_dir`` that are
    not regular files (for example sub-directories) are skipped.

    Args:
        corpus_dir: Path of the directory holding one UTF-8 text file per
            document.

    Returns:
        A tuple ``(index, doc_lengths, N)`` where

        * ``index`` maps each term to a list of ``(docID, log_tf)`` postings,
          with ``log_tf = 1 + log10(term frequency)``;
        * ``doc_lengths`` maps each docID to the Euclidean norm of that
          document's log-tf vector (0.0 for a document without tokens);
        * ``N`` is the number of documents indexed.

    Raises:
        OSError: If ``corpus_dir`` or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    index = defaultdict(list)  # term -> postings (docID, log_tf)
    doc_lengths = {}  # docID -> vector norm used for cosine normalization

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # docID assignment reproducible across runs and platforms.
    filenames = sorted(
        name
        for name in os.listdir(corpus_dir)
        if os.path.isfile(os.path.join(corpus_dir, name))
    )

    for docID, filename in enumerate(filenames, 1):
        with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as file:
            tokens = preprocess(file.read())

        squared_norm = 0.0
        for term, freq in Counter(tokens).items():
            log_tf = 1 + math.log10(freq)  # sublinear tf scaling
            index[term].append((docID, log_tf))
            squared_norm += log_tf**2

        doc_lengths[docID] = math.sqrt(squared_norm)

    return index, doc_lengths, len(filenames)


# Calculate the tf-idf scores for the query
def compute_query_tfidf(query, index, N):
    """Weight the terms of a query with log-tf * idf.

    Only query terms present in ``index`` receive a weight; the document
    frequency of a term is the length of its postings list.

    Args:
        query: Raw query string; it is tokenized with ``preprocess``.
        index: Mapping of term -> postings list.
        N: Total number of documents in the corpus.

    Returns:
        A tuple ``(weights, norm)``: ``weights`` maps each known query term to
        ``(1 + log10(tf)) * log10(N / df)`` and ``norm`` is the Euclidean norm
        of those weights (0.0 when no term is known or all weights are zero).
    """
    weights = {}
    squared_norm = 0

    for term, count in Counter(preprocess(query)).items():
        if term not in index:
            continue  # terms unseen in the corpus contribute nothing

        doc_freq = len(index[term])
        idf = math.log10(N / doc_freq)  # inverse document frequency
        log_tf = 1 + math.log10(count)  # sublinear tf scaling
        weight = log_tf * idf

        weights[term] = weight
        squared_norm += weight**2

    return weights, math.sqrt(squared_norm)


# Compute cosine similarity between query and documents
def cosine_similarity(query_tfidf, query_length, index, doc_lengths):
    """Rank documents by cosine similarity to a weighted query.

    Args:
        query_tfidf: Mapping of query term -> query weight.
        query_length: Euclidean norm of the query weights.
        index: Mapping of term -> list of ``(docID, doc_weight)`` postings.
        doc_lengths: Mapping of docID -> Euclidean norm of the document vector.

    Returns:
        A list of ``(docID, score)`` pairs sorted by descending score, ties
        broken by ascending docID. Only documents sharing at least one term
        with the query are listed. The list is empty when the query norm is
        zero, because the cosine is undefined for a zero vector (this happens
        when every matched query term occurs in all documents, so idf is 0).
    """
    # A zero-norm query would otherwise divide by zero during normalization.
    if not query_length:
        return []

    doc_scores = defaultdict(float)

    # Accumulate the dot product term by term. ``get`` avoids inserting
    # empty postings lists when ``index`` is a defaultdict.
    for term, query_weight in query_tfidf.items():
        for docID, doc_weight in index.get(term, ()):
            doc_scores[docID] += query_weight * doc_weight

    # Normalize each dot product by the product of the two vector norms.
    for docID in doc_scores:
        doc_scores[docID] /= query_length * doc_lengths[docID]

    return sorted(doc_scores.items(), key=lambda item: (-item[1], item[0]))


# Search function to process queries and return top 10 relevant documents
def search(query, index, doc_lengths, N, top_k=10):
    """Return the best-matching documents for a free-text query.

    The query is weighted with ``compute_query_tfidf`` and documents are
    ranked with ``cosine_similarity``.

    Args:
        query: Raw query string.
        index: Mapping of term -> postings list.
        doc_lengths: Mapping of docID -> document vector norm.
        N: Total number of documents in the corpus.
        top_k: Maximum number of results to return (default 10). ``None``
            returns the full ranking.

    Returns:
        A list of at most ``top_k`` ``(docID, score)`` pairs, best first.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative slice bound would silently drop results from the tail.
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    query_tfidf, query_length = compute_query_tfidf(query, index, N)
    ranked_docs = cosine_similarity(query_tfidf, query_length, index, doc_lengths)
    return ranked_docs[:top_k]


# Load and index the corpus. This runs as soon as the cell/module executes
# and reads the files found in corpus_dir.
corpus_dir = "corpus"  # change this to your corpus directory
index, doc_lengths, N = build_index(corpus_dir)

# Test queries: free-text passages that are ranked against the indexed corpus
queries = [
    "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
    "Warwickshire, came from an ancient family and was the heiress to some land",
]

# Run every query and print its ranked hits; queries are numbered from 1
for i, query in enumerate(queries, 1):
    print(f"Query {i}: {query}")
    results = search(query, index, doc_lengths, N)
    for docID, score in results:
        # Scores are printed with six decimal places
        print(f"DocID: {docID}, Score: {score:.6f}")
    print("-" * 50)  # separator line between queries

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
DocID: 41, Score: 0.185878
DocID: 34, Score: 0.101136
DocID: 17, Score: 0.060491
DocID: 14, Score: 0.059895
DocID: 40, Score: 0.052352
DocID: 26, Score: 0.048949
DocID: 30, Score: 0.043797
DocID: 39, Score: 0.034275
DocID: 35, Score: 0.034162
DocID: 36, Score: 0.033587
--------------------------------------------------
Query 2: Warwickshire, came from an ancient family and was the heiress to some land
DocID: 29, Score: 0.107435
DocID: 16, Score: 0.026933
DocID: 1, Score: 0.021333
DocID: 20, Score: 0.020221
DocID: 13, Score: 0.016643
DocID: 41, Score: 0.015930
DocID: 6, Score: 0.015060
DocID: 27, Score: 0.013712
DocID: 30, Score: 0.012880
DocID: 36, Score: 0.011273
--------------------------------------------------
