In [1]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK data packages needed by the tools created below
for _nltk_package in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(_nltk_package)

# Module-level text-normalization helpers shared by preprocess()
stop_words = set(stopwords.words("english"))  # set for fast membership tests
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Text normalization: case-fold, strip punctuation, drop stop words, then stem and lemmatize
def preprocess(text):
    """Turn raw text into a list of normalized terms.

    The text is lower-cased, every run of non-word characters is replaced
    by a single space, and the result is split on whitespace.  Tokens found
    in the module-level ``stop_words`` set are discarded; each remaining
    token is passed through ``stemmer.stem`` and then
    ``lemmatizer.lemmatize``.

    Returns the processed tokens in their original order, duplicates kept.
    """
    words = re.sub(r"\W+", " ", text.lower()).split()

    terms = []
    for word in words:
        # Stop words are filtered on the raw (unstemmed) token
        if word in stop_words:
            continue
        terms.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return terms


def build_index(corpus_dir):
    """Build an inverted index over the ``.txt`` files in *corpus_dir*.

    Each file is read as UTF-8, tokenized with ``preprocess`` and weighted
    with logarithmic term frequency, ``1 + log10(tf)`` (no idf on the
    document side).  The stored posting weights are NOT length-normalized;
    the Euclidean length of each document's weight vector is returned
    separately so callers can normalize at scoring time.

    Args:
        corpus_dir: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(index, doc_lengths, N, doc_ids)``:
        index       -- defaultdict: term -> list of ``(docID, tf_weight)``
        doc_lengths -- dict: docID -> Euclidean length of the weight vector
        N           -- number of ``.txt`` documents indexed
        doc_ids     -- dict: docID -> filename
    """
    index = defaultdict(list)  # term -> postings [(docID, tf_weight), ...]
    doc_lengths = {}  # docID -> document vector length (for normalization)
    doc_ids = {}  # docID -> filename

    # os.listdir() returns entries in arbitrary order.  Sorting makes docID
    # assignment reproducible, and filtering first keeps the docIDs
    # contiguous (1..N) even when the directory holds non-.txt entries.
    text_files = sorted(
        name for name in os.listdir(corpus_dir) if name.endswith(".txt")
    )

    for docID, filename in enumerate(text_files, 1):
        doc_ids[docID] = filename
        with open(
            os.path.join(corpus_dir, filename), "r", encoding="utf-8"
        ) as file:
            content = file.read()

        term_freqs = Counter(preprocess(content))

        # lnc document weighting: 1 + log10(tf)
        tf_weights = {
            term: 1 + math.log10(freq) for term, freq in term_freqs.items()
        }

        # Euclidean length: square root of the sum of squared weights
        doc_lengths[docID] = math.sqrt(
            sum(weight**2 for weight in tf_weights.values())
        )

        # Store the raw tf_weight in the index (no normalization here)
        for term, tf_weight in tf_weights.items():
            index[term].append((docID, tf_weight))

    N = len(text_files)  # total number of documents
    return index, doc_lengths, N, doc_ids

# Calculate the tf-idf for the query using ltc for queries
def compute_query_tfidf(query, index, N):
    """Compute cosine-normalized tf-idf weights (ltc) for *query*.

    The query is tokenized with ``preprocess``.  Terms that have no
    postings in *index* are ignored.  Every other term gets the weight
    ``(1 + log10(tf)) * log10(N / df)``, where ``df`` is the length of the
    term's postings list, and the weights are then divided by their
    Euclidean length.

    Args:
        query: Raw query string.
        index: Mapping term -> postings list, as built by ``build_index``.
        N: Total number of documents in the collection.

    Returns:
        A tuple ``(query_tfidf, query_length)``: the normalized weights
        (dict: term -> weight) and the Euclidean length of the weight
        vector before normalization.  When that length is 0 (no term
        matched, or every matched term occurs in all documents so its idf
        is 0) the weights are returned without normalization instead of
        raising ``ZeroDivisionError``.
    """
    query_term_freqs = Counter(preprocess(query))
    query_tfidf = {}

    for term, freq in query_term_freqs.items():
        # .get() never inserts into a defaultdict; an absent or empty
        # postings list means df == 0, so the term carries no weight.
        postings = index.get(term)
        if not postings:
            continue
        df = len(postings)  # document frequency
        idf = math.log10(N / df)  # idf = log10(N/df)
        tf_weight = 1 + math.log10(freq)  # ltc: 1 + log10(tf) for query
        query_tfidf[term] = tf_weight * idf

    # Euclidean length: square root of the sum of squared tf-idf weights
    query_length = math.sqrt(sum(weight**2 for weight in query_tfidf.values()))

    # Normalize the query's tf-idf weights; skip when the vector is all
    # zeros, since dividing by a zero length would raise.
    if query_length > 0:
        for term in query_tfidf:
            query_tfidf[term] /= query_length

    return query_tfidf, query_length


def cosine_similarity(query_tfidf, query_length, index, doc_lengths):
    """Score and rank documents against a weighted query.

    For every query term present in *index*, the product of the query
    weight and each posting's document weight is accumulated per docID.
    Each accumulated sum is then divided by that document's length from
    *doc_lengths*.

    ``query_length`` is accepted but not used in the computation.

    Returns a list of ``(docID, score)`` tuples ordered by descending
    score, with ties broken by ascending docID.
    """
    accumulators = defaultdict(float)

    for term in query_tfidf:
        if term not in index:
            continue
        query_weight = query_tfidf[term]
        for docID, doc_weight in index[term]:
            accumulators[docID] += query_weight * doc_weight

    # Normalize each dot product by the document's vector length
    ranking = [
        (docID, total / doc_lengths[docID])
        for docID, total in accumulators.items()
    ]
    ranking.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranking


# Search function to process queries and return the top-ranked documents
def search(query, index, doc_lengths, N, doc_ids, top_k=10):
    """Run *query* against the index and return the best-scoring files.

    The query is weighted with ``compute_query_tfidf`` and the documents
    are ranked with ``cosine_similarity``.

    Args:
        query: Raw query string.
        index: Mapping term -> postings list.
        doc_lengths: Mapping docID -> document vector length.
        N: Total number of documents in the collection.
        doc_ids: Mapping docID -> filename.
        top_k: Maximum number of results to return.  Defaults to 10, the
            limit that was previously hard-coded; ``None`` returns every
            scored document.

    Returns:
        A list of ``(filename, score)`` tuples in ranking order; empty
        when no document shares a term with the query.
    """
    query_tfidf, query_length = compute_query_tfidf(query, index, N)
    ranked_docs = cosine_similarity(query_tfidf, query_length, index, doc_lengths)

    # Map docIDs back to filenames and keep only the first top_k results
    return [(doc_ids[docID], score) for docID, score in ranked_docs[:top_k]]


# Corpus location and the sample queries to run against it
corpus_dir = "corpus"  # Directory containing your corpus of text files
queries = [
    "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
    "Warwickshire, came from an ancient family and was the heiress to some land",
    "Their Design School, filled with free video courses on a wide range of topics, is a good place to start. There's a ",
]

# Build the index once; every query below reuses it
index, doc_lengths, N, doc_ids = build_index(corpus_dir)

# Run each sample query and print its ranked results
for i, query in enumerate(queries, start=1):
    print(f"Query {i}: {query}")
    results = search(query, index, doc_lengths, N, doc_ids)
    if not results:
        print("No documents match the query.")
    else:
        print("\nTop results:")
        for rank, (filename, score) in enumerate(results, start=1):
            print(f"{rank}. {filename} (Score: {score:.6f})")
    print("-" * 50)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Top results:
1. zomato.txt (Score: 0.214723)
2. swiggy.txt (Score: 0.131135)
3. instagram.txt (Score: 0.060525)
4. messenger.txt (Score: 0.059168)
5. youtube.txt (Score: 0.058451)
6. Discord.txt (Score: 0.053398)
7. bing.txt (Score: 0.051780)
8. paypal.txt (Score: 0.047086)
9. reddit.txt (Score: 0.044108)
10. flipkart.txt (Score: 0.040728)
--------------------------------------------------
Query 2: Warwickshire, came from an ancient family and was the heiress to some land

Top results:
1. shakespeare.txt (Score: 0.119976)
2. levis.txt (Score: 0.024142)
3. Adobe.txt (Score: 0.022651)
4. google.txt (Score: 0.020737)
5. nike.txt (Score: 0.019211)
6. zomato.txt (Score: 0.017713)
7. huawei.txt (Score: 0.013724)
8. skype.txt (Score: 0.011723)
9. blackberry.txt (Score: 0.010926)
10. Dell.txt (Score: 0.010766)
--------------------------------------------------
Query 3: Thei

In [2]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK data packages needed by the tools created below
for _nltk_package in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(_nltk_package)

# Module-level text-normalization helpers shared by preprocess()
stop_words = set(stopwords.words("english"))  # set for fast membership tests
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Text normalization: case-fold, strip punctuation, drop stop words, then stem and lemmatize
def preprocess(text):
    """Turn raw text into a list of normalized terms.

    The text is lower-cased, every run of non-word characters is replaced
    by a single space, and the result is split on whitespace.  Tokens found
    in the module-level ``stop_words`` set are discarded; each remaining
    token is passed through ``stemmer.stem`` and then
    ``lemmatizer.lemmatize``.

    Returns the processed tokens in their original order, duplicates kept.
    """
    words = re.sub(r"\W+", " ", text.lower()).split()

    terms = []
    for word in words:
        # Stop words are filtered on the raw (unstemmed) token
        if word in stop_words:
            continue
        terms.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return terms

def build_index(corpus_dir):
    """Build an inverted index over the ``.txt`` files in *corpus_dir*.

    Each file is read as UTF-8, tokenized with ``preprocess`` and weighted
    with logarithmic term frequency, ``1 + log10(tf)`` (no idf on the
    document side).  The stored posting weights are NOT length-normalized;
    the Euclidean length of each document's weight vector is returned
    separately so callers can normalize at scoring time.

    Args:
        corpus_dir: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(index, doc_lengths, N, doc_ids)``:
        index       -- defaultdict: term -> list of ``(docID, tf_weight)``
        doc_lengths -- dict: docID -> Euclidean length of the weight vector
        N           -- number of ``.txt`` documents indexed
        doc_ids     -- dict: docID -> filename
    """
    index = defaultdict(list)  # term -> postings [(docID, tf_weight), ...]
    doc_lengths = {}  # docID -> document vector length (for normalization)
    doc_ids = {}  # docID -> filename

    # os.listdir() returns entries in arbitrary order.  Sorting makes docID
    # assignment reproducible, and filtering first keeps the docIDs
    # contiguous (1..N) even when the directory holds non-.txt entries.
    text_files = sorted(
        name for name in os.listdir(corpus_dir) if name.endswith(".txt")
    )

    for docID, filename in enumerate(text_files, 1):
        doc_ids[docID] = filename
        with open(
            os.path.join(corpus_dir, filename), "r", encoding="utf-8"
        ) as file:
            content = file.read()

        term_freqs = Counter(preprocess(content))

        # lnc document weighting: 1 + log10(tf)
        tf_weights = {
            term: 1 + math.log10(freq) for term, freq in term_freqs.items()
        }

        # Euclidean length: square root of the sum of squared weights
        doc_lengths[docID] = math.sqrt(
            sum(weight**2 for weight in tf_weights.values())
        )

        # Store the raw tf_weight in the index (no normalization here)
        for term, tf_weight in tf_weights.items():
            index[term].append((docID, tf_weight))

    N = len(text_files)  # total number of documents
    return index, doc_lengths, N, doc_ids

# Calculate the tf-idf for the query (log tf * idf, cosine-normalized, i.e. ltc)
def compute_query_tfidf(query, index, N):
    """Compute cosine-normalized tf-idf weights for *query*.

    The query is tokenized with ``preprocess``.  Terms that have no
    postings in *index* are ignored.  Every other term gets the weight
    ``(1 + log10(tf)) * log10(N / df)``, where ``df`` is the length of the
    term's postings list, and the weights are then divided by their
    Euclidean length.

    NOTE: because of that final division this is the "ltc" scheme; an
    "ltn" scheme would leave the weights unnormalized.

    Args:
        query: Raw query string.
        index: Mapping term -> postings list, as built by ``build_index``.
        N: Total number of documents in the collection.

    Returns:
        A tuple ``(query_tfidf, query_length)``: the normalized weights
        (dict: term -> weight) and the Euclidean length of the weight
        vector before normalization.  When that length is 0 (no term
        matched, or every matched term occurs in all documents so its idf
        is 0) the weights are returned without normalization instead of
        raising ``ZeroDivisionError``.
    """
    query_term_freqs = Counter(preprocess(query))
    query_tfidf = {}

    for term, freq in query_term_freqs.items():
        # .get() never inserts into a defaultdict; an absent or empty
        # postings list means df == 0, so the term carries no weight.
        postings = index.get(term)
        if not postings:
            continue
        df = len(postings)  # document frequency
        idf = math.log10(N / df)  # idf = log10(N/df)
        tf_weight = 1 + math.log10(freq)  # logarithmic tf for the query
        query_tfidf[term] = tf_weight * idf

    # Euclidean length: square root of the sum of squared tf-idf weights
    query_length = math.sqrt(sum(weight**2 for weight in query_tfidf.values()))

    # Normalize the query's tf-idf weights; skip when the vector is all
    # zeros, since dividing by a zero length would raise.
    if query_length > 0:
        for term in query_tfidf:
            query_tfidf[term] /= query_length

    return query_tfidf, query_length

def cosine_similarity(query_tfidf, query_length, index, doc_lengths):
    """Score and rank documents against a weighted query.

    For every query term present in *index*, the product of the query
    weight and each posting's document weight is accumulated per docID.
    Each accumulated sum is then divided by that document's length from
    *doc_lengths*.

    ``query_length`` is accepted but not used in the computation.

    Returns a list of ``(docID, score)`` tuples ordered by descending
    score, with ties broken by ascending docID.
    """
    accumulators = defaultdict(float)

    for term in query_tfidf:
        if term not in index:
            continue
        query_weight = query_tfidf[term]
        for docID, doc_weight in index[term]:
            accumulators[docID] += query_weight * doc_weight

    # Normalize each dot product by the document's vector length
    ranking = [
        (docID, total / doc_lengths[docID])
        for docID, total in accumulators.items()
    ]
    ranking.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranking

# Search function to process queries and return the top-ranked documents
def search(query, index, doc_lengths, N, doc_ids, top_k=10):
    """Run *query* against the index and return the best-scoring files.

    The query is weighted with ``compute_query_tfidf`` and the documents
    are ranked with ``cosine_similarity``.

    Args:
        query: Raw query string.
        index: Mapping term -> postings list.
        doc_lengths: Mapping docID -> document vector length.
        N: Total number of documents in the collection.
        doc_ids: Mapping docID -> filename.
        top_k: Maximum number of results to return.  Defaults to 10, the
            limit that was previously hard-coded; ``None`` returns every
            scored document.

    Returns:
        A list of ``(filename, score)`` tuples in ranking order; empty
        when no document shares a term with the query.
    """
    query_tfidf, query_length = compute_query_tfidf(query, index, N)
    ranked_docs = cosine_similarity(query_tfidf, query_length, index, doc_lengths)

    # Map docIDs back to filenames and keep only the first top_k results
    return [(doc_ids[docID], score) for docID, score in ranked_docs[:top_k]]

# Corpus location and the sample queries to run against it
corpus_dir = "corpus"  # Directory containing your corpus of text files
queries = [
    "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
    "Warwickshire, came from an ancient family and was the heiress to some land",
    "Their Design School, filled with free video courses on a wide range of topics, is a good place to start. There's a ",
]

# Build the index once; every query below reuses it
index, doc_lengths, N, doc_ids = build_index(corpus_dir)

# Run each sample query and print its ranked results
for i, query in enumerate(queries, start=1):
    print(f"Query {i}: {query}")
    results = search(query, index, doc_lengths, N, doc_ids)
    if not results:
        print("No documents match the query.")
    else:
        print("\nTop results:")
        for rank, (filename, score) in enumerate(results, start=1):
            print(f"{rank}. {filename} (Score: {score:.6f})")
    print("-" * 50)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Top results:
1. zomato.txt (Score: 0.214723)
2. swiggy.txt (Score: 0.131135)
3. instagram.txt (Score: 0.060525)
4. messenger.txt (Score: 0.059168)
5. youtube.txt (Score: 0.058451)
6. Discord.txt (Score: 0.053398)
7. bing.txt (Score: 0.051780)
8. paypal.txt (Score: 0.047086)
9. reddit.txt (Score: 0.044108)
10. flipkart.txt (Score: 0.040728)
--------------------------------------------------
Query 2: Warwickshire, came from an ancient family and was the heiress to some land

Top results:
1. shakespeare.txt (Score: 0.119976)
2. levis.txt (Score: 0.024142)
3. Adobe.txt (Score: 0.022651)
4. google.txt (Score: 0.020737)
5. nike.txt (Score: 0.019211)
6. zomato.txt (Score: 0.017713)
7. huawei.txt (Score: 0.013724)
8. skype.txt (Score: 0.011723)
9. blackberry.txt (Score: 0.010926)
10. Dell.txt (Score: 0.010766)
--------------------------------------------------
Query 3: Thei