In [6]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK data packages this script relies on
for resource in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(resource)

# Shared text-normalization helpers used by preprocess()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# Preprocessing function: lowercase, remove punctuation, remove stopwords, apply stemming and lemmatization
def preprocess(text):
    """Turn raw text into a list of normalized index terms.

    The text is lowercased, every run of non-word characters is replaced
    by a single space, and the result is split on whitespace. Tokens found
    in ``stop_words`` are dropped; each remaining token is stemmed and the
    stem is then lemmatized.

    Args:
        text: The raw string to normalize.

    Returns:
        A list of processed tokens, in their original order (duplicates kept).
    """
    normalized = re.sub(r"\W+", " ", text.lower())

    terms = []
    for word in normalized.split():
        if word in stop_words:
            continue  # stopwords are filtered before stemming
        terms.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return terms


# Build the inverted index and document lengths
def build_index(corpus_dir):
    """Build an inverted index over the ``.txt`` files in *corpus_dir*.

    Each document is tokenized with ``preprocess`` and every distinct term
    is given a log-scaled term-frequency weight ``1 + log10(tf)``.

    Args:
        corpus_dir: Path of the directory holding the corpus files.

    Returns:
        A 4-tuple ``(index, doc_lengths, N, doc_ids)`` where

        * ``index`` maps term -> list of ``(docID, log_tf)`` postings,
        * ``doc_lengths`` maps docID -> Euclidean norm of that document's
          log-tf weights (used for cosine normalization),
        * ``N`` is the number of indexed documents,
        * ``doc_ids`` maps docID -> filename.
    """
    index = defaultdict(list)  # term -> postings (docID, log-weighted tf)
    doc_lengths = {}  # docID -> document vector length (for normalization)
    doc_ids = {}  # docID -> filename

    # os.listdir() returns entries in arbitrary order; sorting makes docIDs
    # (and therefore the docID tie-break used when ranking) reproducible.
    # Filtering first keeps docIDs contiguous: 1..N.
    filenames = sorted(
        name for name in os.listdir(corpus_dir) if name.endswith(".txt")
    )

    for docID, filename in enumerate(filenames, 1):
        doc_ids[docID] = filename
        path = os.path.join(corpus_dir, filename)
        with open(path, "r", encoding="utf-8") as file:
            content = file.read()

        # The file is already closed here; only the text is needed from now on.
        term_freqs = Counter(preprocess(content))
        squared_length = 0.0
        for term, freq in term_freqs.items():
            log_tf = 1 + math.log10(freq)  # log-scaled tf for documents
            index[term].append((docID, log_tf))
            squared_length += log_tf**2

        doc_lengths[docID] = math.sqrt(squared_length)

    return index, doc_lengths, len(doc_ids), doc_ids


# Calculate the tf-idf for the query
def compute_query_tfidf(query, index, N):
    """Compute tf-idf weights for the terms of a query.

    The query is tokenized with ``preprocess``. Terms that do not occur in
    ``index`` are ignored. For every other term the weight is
    ``(1 + log10(tf)) * log10(N / df)``, where ``df`` is the number of
    postings stored for that term.

    Args:
        query: The raw query string.
        index: Mapping of term -> postings list.
        N: Total number of documents in the collection.

    Returns:
        A tuple ``(query_tfidf, query_length)``: the term -> weight mapping
        and the Euclidean norm of those weights.
    """
    term_counts = Counter(preprocess(query))
    weights = {}
    squared_norm = 0

    for term, count in term_counts.items():
        if term not in index:
            continue  # terms unseen in the corpus contribute nothing
        doc_freq = len(index[term])
        weight = (1 + math.log10(count)) * math.log10(N / doc_freq)
        weights[term] = weight
        squared_norm += weight**2

    return weights, math.sqrt(squared_norm)


# Compute cosine similarity between query and documents
def cosine_similarity(query_tfidf, query_length, index, doc_lengths):
    """Rank documents by cosine similarity to a weighted query.

    Args:
        query_tfidf: Mapping of query term -> query weight.
        query_length: Euclidean norm of the query weights.
        index: Mapping of term -> list of ``(docID, doc_weight)`` postings.
        doc_lengths: Mapping of docID -> Euclidean norm of the document.

    Returns:
        A list of ``(docID, score)`` tuples sorted by descending score, with
        ties broken by ascending docID. Only documents sharing at least one
        term with the query are listed. The list is empty when the query
        norm is zero.
    """
    # A zero-norm query (no known terms, or every term has idf == 0) has no
    # defined cosine; return no matches instead of dividing by zero.
    if not query_length:
        return []

    doc_scores = defaultdict(float)
    for term, query_weight in query_tfidf.items():
        # .get() avoids inserting empty postings lists into a defaultdict index
        for docID, doc_weight in index.get(term, ()):
            doc_scores[docID] += query_weight * doc_weight

    ranked = []
    for docID, dot_product in doc_scores.items():
        norm = query_length * doc_lengths[docID]
        # Guard against a degenerate zero-length document vector as well.
        ranked.append((docID, dot_product / norm if norm else 0.0))

    ranked.sort(key=lambda item: (-item[1], item[0]))  # score desc, docID asc
    return ranked


# Search function to process queries and return top 10 relevant documents
def search(query, index, doc_lengths, N, doc_ids, top_k=10):
    """Return the best-matching documents for a free-text query.

    Args:
        query: The raw query string.
        index: Mapping of term -> postings list.
        doc_lengths: Mapping of docID -> document vector length.
        N: Total number of documents in the collection.
        doc_ids: Mapping of docID -> filename.
        top_k: Maximum number of results to return (default 10). Pass
            ``None`` to return every matching document.

    Returns:
        A list of ``(filename, score)`` tuples, best match first. The list is
        empty when the query has no usable weight.
    """
    query_tfidf, query_length = compute_query_tfidf(query, index, N)

    # A query whose terms are all unknown, or all have idf == 0, has a zero
    # norm; scoring it would divide by zero, so report no matches instead.
    if not query_length:
        return []

    ranked_docs = cosine_similarity(query_tfidf, query_length, index, doc_lengths)

    # Map docIDs back to filenames and keep only the requested number of hits
    return [(doc_ids[docID], score) for docID, score in ranked_docs[:top_k]]


# Index every document in the corpus directory once, up front
corpus_dir = "corpus"  # change this to your corpus directory
index, doc_lengths, N, doc_ids = build_index(corpus_dir)

# Sample queries used to exercise the ranker
queries = [
    "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
    "Warwickshire, came from an ancient family and was the heiress to some land",
]

separator = "-" * 50  # printed after each query's results

for i, query in enumerate(queries, start=1):
    print(f"Query {i}: {query}")
    results = search(query, index, doc_lengths, N, doc_ids)
    if not results:
        print("No documents match the query.")
    else:
        print("\nTop results:")
        for rank, (filename, score) in enumerate(results, start=1):
            print(f"{rank}. {filename} (Score: {score:.6f})")
    print(separator)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Top results:
1. zomato.txt (Score: 0.207095)
2. swiggy.txt (Score: 0.124229)
3. instagram.txt (Score: 0.058227)
4. messenger.txt (Score: 0.057373)
5. youtube.txt (Score: 0.056774)
6. Discord.txt (Score: 0.051449)
7. bing.txt (Score: 0.049875)
8. paypal.txt (Score: 0.046539)
9. reddit.txt (Score: 0.042547)
10. flipkart.txt (Score: 0.040045)
--------------------------------------------------
Query 2: Warwickshire, came from an ancient family and was the heiress to some land

Top results:
1. shakespeare.txt (Score: 0.117581)
2. levis.txt (Score: 0.023595)
3. Adobe.txt (Score: 0.022267)
4. google.txt (Score: 0.020327)
5. nike.txt (Score: 0.018433)
6. zomato.txt (Score: 0.017084)
7. huawei.txt (Score: 0.013440)
8. skype.txt (Score: 0.011370)
9. blackberry.txt (Score: 0.010775)
10. Dell.txt (Score: 0.010586)
--------------------------------------------------
