In [1]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### lnc.ltc

In [2]:
# Initialize stemmer, lemmatizer, and stop words
class Preprocessor:
    """Text normaliser holding a stemmer, a lemmatizer and a stop-word set."""

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))

    def preprocess(self, text):
        """
        Convert raw text into a list of normalised tokens.

        Steps, in order: lowercase the text, replace every run of non-word
        characters with a single space, split on whitespace, discard tokens
        found in ``self.stop_words``, then stem each remaining token and
        lemmatize the resulting stem.
        """
        cleaned = re.sub(r"\W+", " ", text.lower())
        kept = []
        for word in cleaned.split():
            # Stop words are matched against the raw (un-stemmed) token.
            if word in self.stop_words:
                continue
            stem = self.stemmer.stem(word)
            kept.append(self.lemmatizer.lemmatize(stem))
        return kept

In [3]:
class InvertedIndex:
    """
    In-memory inverted index over the ``.txt`` files of a corpus directory.

    Each posting stores the logarithmic term-frequency weight
    ``1 + log10(tf)`` of a term in a document; the Euclidean norm of each
    document's weight vector is kept separately so callers can apply cosine
    normalisation at query time.
    """

    def __init__(self, corpus_dir, preprocessor):
        self.index = defaultdict(list)  # term -> postings [(docID, tf_weight), ...]
        self.doc_lengths = {}  # docID -> Euclidean norm of the document's tf weights
        self.N = 0  # total number of indexed documents
        self.doc_ids = {}  # docID -> filename
        self.preprocessor = preprocessor
        self.build_index(corpus_dir)

    def build_index(self, corpus_dir):
        """
        Builds an inverted index for the corpus directory.

        Only files whose name ends with ``.txt`` are indexed. They are
        processed in sorted filename order and numbered 1..N, so docIDs are
        contiguous and identical from run to run.

        Args:
            corpus_dir: Path of the directory containing the text files.

        Raises:
            OSError: If the directory or one of its files cannot be read.
        """
        # os.listdir() returns entries in arbitrary order. Filtering first
        # keeps docIDs contiguous; sorting makes them (and therefore the
        # docID tie-break used when ranking) reproducible.
        filenames = sorted(
            name for name in os.listdir(corpus_dir) if name.endswith(".txt")
        )

        for docID, filename in enumerate(filenames, 1):
            path = os.path.join(corpus_dir, filename)
            with open(path, "r", encoding="utf-8") as file:
                content = file.read()

            term_freqs = Counter(self.preprocessor.preprocess(content))

            # lnc document weighting: 1 + log10(tf), no idf.
            tf_weights = {
                term: 1 + math.log10(freq) for term, freq in term_freqs.items()
            }

            # Vector length used later for cosine normalisation.
            doc_length = math.sqrt(sum(weight**2 for weight in tf_weights.values()))

            # Postings keep the un-normalised tf weight.
            for term, tf_weight in tf_weights.items():
                self.index[term].append((docID, tf_weight))

            self.N += 1
            self.doc_ids[docID] = filename
            self.doc_lengths[docID] = doc_length

In [4]:
class QueryProcessor:
    """
    Turns a free-text query into a weighted term vector and scores it
    against the postings of an inverted index.
    """

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def compute_query_tfidf(self, query, index, N, scheme="ltc"):
        """
        Calculates the tf-idf weights of the query terms.

        The weight of a term is ``(1 + log10(tf)) * log10(N / df)``. Terms
        that are absent from ``index`` (or whose postings list is empty) are
        left out of the result.

        Args:
            query: Raw query string.
            index: Mapping term -> postings list; ``len(postings)`` is used
                as the document frequency.
            N: Total number of documents in the corpus.
            scheme: ``"ltc"`` divides the weights by the vector's Euclidean
                norm; any other value (e.g. ``"ltn"``) leaves them
                un-normalised.

        Returns:
            dict mapping term -> weight.
        """
        query_term_freqs = Counter(self.preprocessor.preprocess(query))
        query_tfidf = {}

        for term, freq in query_term_freqs.items():
            if term not in index:
                continue
            df = len(index[term])  # document frequency
            if df == 0:
                # An empty postings list carries no information and would
                # make N / df divide by zero.
                continue
            idf = math.log10(N / df)
            query_tfidf[term] = (1 + math.log10(freq)) * idf

        if scheme == "ltc":
            query_length = math.sqrt(
                sum(weight**2 for weight in query_tfidf.values())
            )
            # The norm is zero when every matched term occurs in all N
            # documents (idf == 0); the all-zero vector is returned as is
            # instead of raising ZeroDivisionError.
            if query_length > 0:
                for term in query_tfidf:
                    query_tfidf[term] /= query_length

        return query_tfidf

    def cosine_similarity(self, query_tfidf, index, doc_lengths):
        """
        Scores every document that shares at least one term with the query.

        A document's score is the dot product of the query weights with the
        document's posting weights, divided by ``doc_lengths[docID]``. The
        query weights are used as given; no query normalisation happens here.

        Args:
            query_tfidf: Mapping term -> query weight.
            index: Mapping term -> list of ``(docID, doc_weight)`` postings.
            doc_lengths: Mapping docID -> document vector length.

        Returns:
            List of ``(docID, score)`` pairs sorted by descending score, ties
            broken by ascending docID.
        """
        doc_scores = defaultdict(float)

        # Accumulate the dot product term by term.
        for term, query_weight in query_tfidf.items():
            if term in index:
                for docID, doc_weight in index[term]:
                    doc_scores[docID] += query_weight * doc_weight

        # Apply document-side cosine normalisation.
        for docID in doc_scores:
            doc_scores[docID] /= doc_lengths[docID]

        return sorted(doc_scores.items(), key=lambda x: (-x[1], x[0]))

In [5]:
class SearchEngine:
    """
    The main class for the search engine. It manages the indexing and querying.
    """

    def __init__(self, corpus_dir):
        # One preprocessor instance is shared by indexing and querying so
        # that documents and queries are tokenised identically.
        self.preprocessor = Preprocessor()
        self.index = InvertedIndex(corpus_dir, self.preprocessor)
        self.query_processor = QueryProcessor(self.preprocessor)

    def search(self, query, scheme="ltc", top_k=10):
        """
        Search for the given query in the corpus and return the best matches.

        Args:
            query: Raw query string.
            scheme: Passed through to the query weighting step; controls
                whether 'ltc' or 'ltn' is used for the query tf-idf.
            top_k: Maximum number of results to return (default 10, the
                previously hard-coded cutoff). ``None`` returns every
                matching document.

        Returns:
            List of ``(filename, score)`` pairs in ranked order.
        """
        query_tfidf = self.query_processor.compute_query_tfidf(
            query, self.index.index, self.index.N, scheme=scheme
        )
        ranked_docs = self.query_processor.cosine_similarity(
            query_tfidf, self.index.index, self.index.doc_lengths
        )

        # Map docIDs back to filenames, keeping only the leading top_k hits.
        return [
            (self.index.doc_ids[docID], score)
            for docID, score in ranked_docs[:top_k]
        ]

In [6]:
# Usage Example
if __name__ == "__main__":
    # Folder that holds the corpus text files.
    corpus_dir = "corpus"
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        "Their Design School, filled with free video courses on a wide range of topics, is a good place to start.",
    ]

    search_engine = SearchEngine(corpus_dir)

    # One entry per query weighting scheme:
    # (scheme name, heading printed before the listing, rule printed after it).
    scheme_runs = (
        ("ltc", "\nUsing lnc.ltc scheme:", "-" * 50),
        ("ltn", "\nUsing lnc.ltn scheme:", "=" * 50),
    )

    for i, query in enumerate(queries, 1):
        print(f"Query {i}: {query}")

        for scheme, heading, rule in scheme_runs:
            print(heading)
            results = search_engine.search(query, scheme=scheme)
            if not results:
                print("No documents match the query.")
            else:
                print("Top results:")
                for rank, (filename, score) in enumerate(results, 1):
                    print(f"{rank}. {filename} (Score: {score:.6f})")
            print(rule)

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Using lnc.ltc scheme:
Top results:
1. zomato.txt (Score: 0.214723)
2. swiggy.txt (Score: 0.131135)
3. instagram.txt (Score: 0.060525)
4. messenger.txt (Score: 0.059168)
5. youtube.txt (Score: 0.058451)
6. Discord.txt (Score: 0.053398)
7. bing.txt (Score: 0.051780)
8. paypal.txt (Score: 0.047086)
9. reddit.txt (Score: 0.044108)
10. flipkart.txt (Score: 0.040728)
--------------------------------------------------

Using lnc.ltn scheme:
Top results:
1. zomato.txt (Score: 0.548850)
2. swiggy.txt (Score: 0.335191)
3. instagram.txt (Score: 0.154706)
4. messenger.txt (Score: 0.151239)
5. youtube.txt (Score: 0.149406)
6. Discord.txt (Score: 0.136489)
7. bing.txt (Score: 0.132353)
8. paypal.txt (Score: 0.120355)
9. reddit.txt (Score: 0.112743)
10. flipkart.txt (Score: 0.104105)
Query 2: Warwickshire, came from an ancient family and was the heiress to some land

Using lnc.ltc

In [7]:
from tabulate import tabulate  # Import the tabulate library

# Reuses the classes defined in the earlier cells; only the result display changes (tabulate grid tables).

if __name__ == "__main__":
    # Folder that holds the corpus text files.
    corpus_dir = "corpus"
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        "Their Design School, filled with free video courses on a wide range of topics, is a good place to start.",
    ]

    search_engine = SearchEngine(corpus_dir)

    table_headers = ["Rank", "Filename", "Score"]

    # One entry per query weighting scheme:
    # (scheme name, heading printed before the table, rule printed after it).
    scheme_runs = (
        ("ltc", "\nUsing lnc.ltc scheme:", "-" * 50),
        ("ltn", "\nUsing lnc.ltn scheme:", "=" * 50),
    )

    for i, query in enumerate(queries, 1):
        print(f"Query {i}: {query}")

        for scheme, heading, rule in scheme_runs:
            print(heading)
            results = search_engine.search(query, scheme=scheme)
            if not results:
                print("No documents match the query.")
            else:
                # Scores are pre-formatted to six decimals before tabulating.
                rows = []
                for rank, (filename, score) in enumerate(results, 1):
                    rows.append([rank, filename, f"{score:.6f}"])
                print(tabulate(rows, headers=table_headers, tablefmt="grid"))
            print(rule)

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Using lnc.ltc scheme:
+--------+---------------+----------+
|   Rank | Filename      |    Score |
|      1 | zomato.txt    | 0.214723 |
+--------+---------------+----------+
|      2 | swiggy.txt    | 0.131135 |
+--------+---------------+----------+
|      3 | instagram.txt | 0.060525 |
+--------+---------------+----------+
|      4 | messenger.txt | 0.059168 |
+--------+---------------+----------+
|      5 | youtube.txt   | 0.058451 |
+--------+---------------+----------+
|      6 | Discord.txt   | 0.053398 |
+--------+---------------+----------+
|      7 | bing.txt      | 0.05178  |
+--------+---------------+----------+
|      8 | paypal.txt    | 0.047086 |
+--------+---------------+----------+
|      9 | reddit.txt    | 0.044108 |
+--------+---------------+----------+
|     10 | flipkart.txt  | 0.040728 |
+--------+---------------+----------+
--------------------

In [8]:
from tabulate import tabulate  # Import the tabulate library

# Reuses the classes defined in the earlier cells; this cell prints one side-by-side ltc/ltn table per query.

if __name__ == "__main__":
    # Folder that holds the corpus text files.
    corpus_dir = "corpus"
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        "Their Design School, filled with free video courses on a wide range of topics, is a good place to start.",
    ]

    search_engine = SearchEngine(corpus_dir)

    comparison_headers = [
        "Rank",
        "LTC Filename",
        "LTC Score",
        "LTN Filename",
        "LTN Score",
    ]

    for i, query in enumerate(queries, 1):
        print(f"Query {i}: {query}")

        # Run the same query under both query weighting schemes.
        results_ltc = search_engine.search(query, scheme="ltc")
        results_ltn = search_engine.search(query, scheme="ltn")

        if not (results_ltc or results_ltn):
            print("No documents match the query.")
        else:
            # One row per rank; the shorter result list is padded with "-".
            row_count = max(len(results_ltc), len(results_ltn))
            comparison_table = []
            for position in range(row_count):
                row = [position + 1]
                for results in (results_ltc, results_ltn):
                    if position < len(results):
                        filename, score = results[position]
                        row.extend([filename, f"{score:.6f}"])
                    else:
                        row.extend(["-", "-"])
                comparison_table.append(row)

            print(
                tabulate(
                    comparison_table, headers=comparison_headers, tablefmt="grid"
                )
            )

        print("=" * 50)

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
+--------+----------------+-------------+----------------+-------------+
|   Rank | LTC Filename   |   LTC Score | LTN Filename   |   LTN Score |
|      1 | zomato.txt     |    0.214723 | zomato.txt     |    0.54885  |
+--------+----------------+-------------+----------------+-------------+
|      2 | swiggy.txt     |    0.131135 | swiggy.txt     |    0.335191 |
+--------+----------------+-------------+----------------+-------------+
|      3 | instagram.txt  |    0.060525 | instagram.txt  |    0.154706 |
+--------+----------------+-------------+----------------+-------------+
|      4 | messenger.txt  |    0.059168 | messenger.txt  |    0.151239 |
+--------+----------------+-------------+----------------+-------------+
|      5 | youtube.txt    |    0.058451 | youtube.txt    |    0.149406 |
+--------+----------------+-------------+----------------+-------------+
|  

In [9]:
import pandas as pd  # Import pandas

if __name__ == "__main__":
    # Folder that holds the corpus text files.
    corpus_dir = "corpus"
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        # "Their Design School, filled with free video courses on a wide range of topics, is a good place to start.",
    ]

    search_engine = SearchEngine(corpus_dir)

    # Collects one comparison DataFrame per query that produced results.
    output = []

    for i, query in enumerate(queries, 1):
        print(f"Query {i}: {query}")

        # Run the same query under both query weighting schemes.
        results_ltc = search_engine.search(query, scheme="ltc")
        results_ltn = search_engine.search(query, scheme="ltn")

        if not (results_ltc or results_ltn):
            print("No documents match the query.")
        else:
            max_len = max(len(results_ltc), len(results_ltn))

            # Build the filename/score columns for each scheme, padding the
            # shorter result list with "-".
            ltc_names, ltc_scores = [], []
            ltn_names, ltn_scores = [], []
            for names, scores, results in (
                (ltc_names, ltc_scores, results_ltc),
                (ltn_names, ltn_scores, results_ltn),
            ):
                for position in range(max_len):
                    if position < len(results):
                        filename, score = results[position]
                        names.append(filename)
                        scores.append(f"{score:.6f}")
                    else:
                        names.append("-")
                        scores.append("-")

            # The "!" and "|" columns only draw a vertical rule in the output.
            comparison_data = {
                "Rank": list(range(1, max_len + 1)),
                "!": ["║"] * max_len,
                "lnc.ltc": ltc_names,
                "lnc.ltc score": ltc_scores,
                "|": ["║"] * max_len,
                "lnc.ltn": ltn_names,
                "lnc.ltn score": ltn_scores,
            }

            df = pd.DataFrame(comparison_data)
            output.append(df)

            # Plain-text rendering without the index column.
            print(df.to_string(index=False))

        print("=" * 70)
        print()

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
 Rank !       lnc.ltc lnc.ltc score |       lnc.ltn lnc.ltn score
    1 ║    zomato.txt      0.214723 ║    zomato.txt      0.548850
    2 ║    swiggy.txt      0.131135 ║    swiggy.txt      0.335191
    3 ║ instagram.txt      0.060525 ║ instagram.txt      0.154706
    4 ║ messenger.txt      0.059168 ║ messenger.txt      0.151239
    5 ║   youtube.txt      0.058451 ║   youtube.txt      0.149406
    6 ║   Discord.txt      0.053398 ║   Discord.txt      0.136489
    7 ║      bing.txt      0.051780 ║      bing.txt      0.132353
    8 ║    paypal.txt      0.047086 ║    paypal.txt      0.120355
    9 ║    reddit.txt      0.044108 ║    reddit.txt      0.112743
   10 ║  flipkart.txt      0.040728 ║  flipkart.txt      0.104105

Query 2: Warwickshire, came from an ancient family and was the heiress to some land
 Rank !         lnc.ltc lnc.ltc score |         lnc.ltn lnc.ltn scor

In [10]:
# Show the first comparison DataFrame appended to `output` by the pandas cell above.
output[0]

Unnamed: 0,Rank,!,lnc.ltc,lnc.ltc score,|,lnc.ltn,lnc.ltn score
0,1,║,zomato.txt,0.214723,║,zomato.txt,0.54885
1,2,║,swiggy.txt,0.131135,║,swiggy.txt,0.335191
2,3,║,instagram.txt,0.060525,║,instagram.txt,0.154706
3,4,║,messenger.txt,0.059168,║,messenger.txt,0.151239
4,5,║,youtube.txt,0.058451,║,youtube.txt,0.149406
5,6,║,Discord.txt,0.053398,║,Discord.txt,0.136489
6,7,║,bing.txt,0.05178,║,bing.txt,0.132353
7,8,║,paypal.txt,0.047086,║,paypal.txt,0.120355
8,9,║,reddit.txt,0.044108,║,reddit.txt,0.112743
9,10,║,flipkart.txt,0.040728,║,flipkart.txt,0.104105


In [11]:
# Show the second comparison DataFrame appended to `output` by the pandas cell above.
output[1]

Unnamed: 0,Rank,!,lnc.ltc,lnc.ltc score,|,lnc.ltn,lnc.ltn score
0,1,║,shakespeare.txt,0.119976,║,shakespeare.txt,0.379023
1,2,║,levis.txt,0.024142,║,levis.txt,0.07627
2,3,║,Adobe.txt,0.022651,║,Adobe.txt,0.071557
3,4,║,google.txt,0.020737,║,google.txt,0.065513
4,5,║,nike.txt,0.019211,║,nike.txt,0.060691
5,6,║,zomato.txt,0.017713,║,zomato.txt,0.055958
6,7,║,huawei.txt,0.013724,║,huawei.txt,0.043357
7,8,║,skype.txt,0.011723,║,skype.txt,0.037034
8,9,║,blackberry.txt,0.010926,║,blackberry.txt,0.034518
9,10,║,Dell.txt,0.010766,║,Dell.txt,0.034013


#### Final output

In [12]:
from prettytable import PrettyTable  # Import PrettyTable

if __name__ == "__main__":
    # Folder that holds the corpus text files.
    corpus_dir = "corpus"
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        # "Their Design School, filled with free video courses on a wide range of topics, is a good place to start.",
    ]

    search_engine = SearchEngine(corpus_dir)

    column_names = [
        "Rank",
        "lnc.ltc",
        "lnc.ltc Score",
        "lnc.ltn",
        "lnc.ltn Score",
    ]

    for i, query in enumerate(queries, 1):
        print(f"Query {i}: {query}")

        # Run the same query under both query weighting schemes.
        results_ltc = search_engine.search(query, scheme="ltc")
        results_ltn = search_engine.search(query, scheme="ltn")

        if not (results_ltc or results_ltn):
            print("No documents match the query.")
        else:
            max_len = max(len(results_ltc), len(results_ltn))

            table = PrettyTable()
            table.field_names = column_names

            # One row per rank; the shorter result list is padded with "-".
            for position in range(max_len):
                row = [position + 1]
                for results in (results_ltc, results_ltn):
                    if position < len(results):
                        filename, score = results[position]
                        row.extend([filename, f"{score:.6f}"])
                    else:
                        row.extend(["-", "-"])
                table.add_row(row)

            print(table)

        # print("=" * 70)
        print()

Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
+------+---------------+---------------+---------------+---------------+
| Rank |    lnc.ltc    | lnc.ltc Score |    lnc.ltn    | lnc.ltn Score |
+------+---------------+---------------+---------------+---------------+
|  1   |   zomato.txt  |    0.214723   |   zomato.txt  |    0.548850   |
|  2   |   swiggy.txt  |    0.131135   |   swiggy.txt  |    0.335191   |
|  3   | instagram.txt |    0.060525   | instagram.txt |    0.154706   |
|  4   | messenger.txt |    0.059168   | messenger.txt |    0.151239   |
|  5   |  youtube.txt  |    0.058451   |  youtube.txt  |    0.149406   |
|  6   |  Discord.txt  |    0.053398   |  Discord.txt  |    0.136489   |
|  7   |    bing.txt   |    0.051780   |    bing.txt   |    0.132353   |
|  8   |   paypal.txt  |    0.047086   |   paypal.txt  |    0.120355   |
|  9   |   reddit.txt  |    0.044108   |   reddit.txt  |    0.112743   |
|  