Regular Indexing: build the index and look up a single term

In [42]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [43]:
import pandas as pd
import pyterrier as pt
import os

def Regular_indexing(
    input_file=r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv",
    index_path="./CranfieldTitleIndex",
):
    """Build the Terrier index for the preprocessed collection if it is missing.

    Reads the CSV at ``input_file``, validates the ``Processed_Text`` and
    ``Doc_NO`` columns, and indexes ``Processed_Text`` keyed by ``docno``
    (``Doc_NO`` cast to string) with ``pt.DFIndexer``. Indexing is skipped
    when the index directory already exists and is not empty.

    Args:
        input_file: Path of the CSV to index. Defaults to the path the
            notebook was originally written against.
        index_path: Directory of the index; resolved to an absolute path.

    Raises:
        AssertionError: If ``Processed_Text`` or ``Doc_NO`` contains nulls,
            or if the docno values are not unique.
    """
    # NOTE(review): the recorded cell output shows a warning pointing at this
    # line; pt.started()/pt.init() are presumably deprecated in the installed
    # PyTerrier version -- confirm before migrating to the newer API.
    if not pt.started():
        pt.init()
        print("Java Virtual Machine started!")

    df = pd.read_csv(input_file)
    # Keep the raw identifiers: casting to str can turn a missing value into
    # the literal string "nan", which a null check on the cast column would
    # no longer detect.
    raw_doc_numbers = df["Doc_NO"]
    df["docno"] = raw_doc_numbers.astype(str)

    # Validation checks
    assert df["Processed_Text"].notnull().all(), "Processed_Text has null values!"
    assert raw_doc_numbers.notnull().all(), "docno has null values!"
    assert df["docno"].is_unique, "docno values are not unique!"

    index_path = os.path.abspath(index_path)
    # A missing or empty directory both mean there is no usable index yet.
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("\nIndexing documents...")
        indexer = pt.DFIndexer(index_path, overwrite=True)
        index_ref = indexer.index(df["Processed_Text"], df["docno"])
        print("Index created at:", index_ref.toString())
    else:
        print("Index already exists at:", index_path)


In [44]:
from nltk.stem import SnowballStemmer

def search_term_regular_indexing(term, index_path="./CranfieldTitleIndex"):
    """Look up a single term in the index and print the documents containing it.

    The term is stripped of surrounding whitespace, lower-cased and stemmed
    with NLTK's English Snowball stemmer before the lexicon lookup. For every
    posting of the stemmed term the internal doc id, the ``docno`` stored in
    the meta index and the document length are printed.

    Args:
        term: The word to search for.
        index_path: Directory of the index; resolved to an absolute path.

    Returns:
        None. All results and diagnostics are printed. Errors raised during
        the lookup are caught and reported as ``Search failed: ...``.
    """
    index_path = os.path.abspath(index_path)
    # A missing or empty directory both mean no index has been built.
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("Index not found. Run Regular_indexing() first.")
        return

    index = pt.IndexFactory.of(index_path)
    stemmer = SnowballStemmer("english")
    # Interactive input often carries stray spaces; stemming them along with
    # the word would produce a token that can never match the lexicon.
    term = term.strip().lower()
    stemmed_term = stemmer.stem(term)

    print(f"\nSearching for: '{term}' (stemmed: '{stemmed_term}')")

    try:
        lexicon = index.getLexicon()
        if stemmed_term not in lexicon:
            print(f"Term '{stemmed_term}' not found in the index.")
            return

        pointer = lexicon[stemmed_term]
        print(f"Found term '{stemmed_term}' with stats: {pointer.toString()}")

        postings = index.getInvertedIndex().getPostings(pointer)
        meta = index.getMetaIndex()

        print("Documents containing the term:")
        for posting in postings:
            doc_id = posting.getId()
            # Map the internal doc id back to the docno stored in the meta index.
            docno = meta.getItem("docno", doc_id)
            doc_length = posting.getDocumentLength()
            print(f"- Doc ID: {doc_id} (docno: {docno}), Length: {doc_length}")
    except Exception as e:
        # Best-effort interactive helper: report the failure instead of raising.
        print("Search failed:", e)


Testing

In [45]:
def main():
    """Interactive driver: ask for a search method, then run it.

    Option 1 builds the regular index if needed, prompts for a query and
    looks it up. Non-numeric input and unknown option numbers are reported
    with a printed message instead of raising or silently doing nothing.
    """
    print("=== Step 1: Choose the Searching method ===")
    print("1. Regular Indexing + Search")
    raw_option = input("Enter the number of the search: ")
    try:
        option = int(raw_option)
    except ValueError:
        # Typing anything other than a number is an expected user mistake.
        print(f"Invalid option {raw_option!r}: please enter a number.")
        return

    if option == 1:
        Regular_indexing()
        query = input("Enter your query: ")
        search_term_regular_indexing(query)
    else:
        print(f"Unknown option {option}. Available options: 1.")


In [46]:
# Run the interactive driver exactly once when this cell/script is executed.
# (The previous condition called main() while evaluating the comparison and
# then compared the function object with its None result, so the body below
# it could never run.)
if __name__ == "__main__":
    main()

=== Step 1: Choose the Searching method ===
1. Regular Indexing + Search


  if not pt.started():


Index already exists at: d:\DownLoad\projects\Search Engine\Olivia_Searchengine\preprocessing\CranfieldTitleIndex

Searching for: 'information' (stemmed: 'inform')
Found term 'inform' with stats: term700 Nt=1 TF=1 maxTF=1 @{0 5628 7}
Documents containing the term:
- Doc ID: 439 (docno: 440), Length: 8
