For Regular Indexing 

In [98]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [99]:
# Absolute, machine-specific location of the preprocessed CSV that the cells
# below load; this path must be adjusted when the notebook runs elsewhere.
input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"

In [100]:
# Load the preprocessed CSV into a DataFrame for interactive inspection.
df=pd.read_csv(input_file)

In [101]:
# Preview the first rows to confirm which columns the file provides.
df.head()

Unnamed: 0,Doc_NO,Title,Bib,Text,Processed_Text
0,1,experimental investigation of the aerodynamics...,"j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experiment investig of the aerodynam of a wing...
1,2,simple shear flow past a flat plate in an inco...,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simpl shear flow past a flat plate in an incom...
2,3,the boundary layer in simple shear flow past a...,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundari layer in simpl shear flow past a ...
3,4,approximate solutions of the incompressible la...,"j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,approxim solut of the incompress laminar bound...
4,5,one-dimensional transient heat conduction into...,"j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,onedimension transient heat conduct into a dou...


In [102]:
import pandas as pd
import pyterrier as pt
import os

def Regular_indexing():
    """Build the Terrier index over ``Processed_Text`` unless it already exists.

    Starts PyTerrier if it is not running, loads the preprocessed CSV,
    validates the columns used for indexing and writes the index to
    ``./CranfieldTitleIndex`` (resolved against the current working
    directory). An existing, non-empty index directory is left untouched.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        AssertionError: If ``Processed_Text`` or ``Doc_NO`` contains null
            values, or if the document numbers are not unique.
    """
    if not pt.started():
        pt.init()
        print("Java Virtual Machine started!")

    input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"
    df = pd.read_csv(input_file)

    # Validation checks
    assert df["Processed_Text"].notnull().all(), "Processed_Text has null values!"
    # Check the raw column *before* casting: astype(str) turns NaN into the
    # literal string "nan", so a null check on the converted column could
    # never fail.
    assert df["Doc_NO"].notnull().all(), "docno has null values!"
    df["docno"] = df["Doc_NO"].astype(str)
    assert df["docno"].is_unique, "docno values are not unique!"

    index_path = os.path.abspath("./CranfieldTitleIndex")
    # A missing or empty directory both mean "no index yet".
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("\nIndexing documents...")
        indexer = pt.DFIndexer(index_path, overwrite=True)
        index_ref = indexer.index(df["Processed_Text"], df["docno"])
        print("Index created at:", index_ref.toString())
    else:
        print("Index already exists at:", index_path)


In [103]:
from nltk.stem import SnowballStemmer

def search_term_regular_indexing(term):
    """Print every document in the Terrier index that contains ``term``.

    The term is stripped, lower-cased and stemmed with NLTK's English
    Snowball stemmer before it is looked up in the index lexicon. For each
    posting the internal document id, the stored ``docno`` and the document
    length are printed.

    Args:
        term: The word to look up. Empty, whitespace-only or ``None`` input
            is reported and ignored.

    Returns:
        None. All results and errors are reported with ``print``.
    """
    # Normalise first: text typed at an input() prompt often carries stray
    # whitespace, which would otherwise become part of the looked-up token.
    term = (term or "").strip().lower()
    if not term:
        print("Empty search term. Please enter a word to search for.")
        return

    index_path = os.path.abspath("./CranfieldTitleIndex")
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("Index not found. Run Regular_indexing() first.")
        return

    # Same guard Regular_indexing() uses, so a search on a fresh kernel with
    # an index already on disk does not depend on another cell having
    # started PyTerrier first.
    if not pt.started():
        pt.init()

    index = pt.IndexFactory.of(index_path)
    stemmer = SnowballStemmer("english")
    stemmed_term = stemmer.stem(term)

    print(f"\nSearching for: '{term}' (stemmed: '{stemmed_term}')")

    try:
        lexicon = index.getLexicon()
        if stemmed_term not in lexicon:
            print(f"Term '{stemmed_term}' not found in the index.")
            return

        pointer = lexicon[stemmed_term]
        print(f"Found term '{stemmed_term}' with stats: {pointer.toString()}")

        postings = index.getInvertedIndex().getPostings(pointer)
        meta = index.getMetaIndex()

        print("Documents containing the term:")
        for posting in postings:
            doc_id = posting.getId()
            docno = meta.getItem("docno", doc_id)
            doc_length = posting.getDocumentLength()
            print(f"- Doc ID: {doc_id} (docno: {docno}), Length: {doc_length}")
    except Exception as e:
        # Best-effort interactive helper: report the failure instead of
        # aborting the notebook cell.
        print("Search failed:", e)


Adding Boolean Retrieval

In [104]:
def boolean_query(query_terms, operator, df=None):
    """Return the ``Doc_NO`` values whose title satisfies a Boolean query.

    A binary term-document matrix is built from the ``Title`` column with
    ``CountVectorizer`` (English stop words removed), and each document is
    scored by how many distinct query terms its title contains.

    Args:
        query_terms: Iterable of words, or a single whitespace-separated
            string. Matching is case-insensitive and repeated words count
            once.
        operator: ``"AND"`` (title contains every term), ``"OR"`` (at least
            one term) or ``"NOT"`` (none of the terms); case-insensitive.
        df: Optional DataFrame with ``Title`` and ``Doc_NO`` columns. When
            omitted, the preprocessed CSV is read from disk.

    Returns:
        list: Matching ``Doc_NO`` values in DataFrame order.

    Raises:
        ValueError: If ``operator`` is not ``AND``, ``OR`` or ``NOT``.
    """
    # Validate before doing any expensive work (CSV read + vectorisation).
    operator = operator.upper()
    if operator not in ("AND", "OR", "NOT"):
        raise ValueError("Operator must be 'AND', 'OR', or 'NOT'")

    # A bare string would otherwise be iterated character by character,
    # producing single-letter "terms" that never match the vocabulary.
    if isinstance(query_terms, str):
        query_terms = query_terms.split()
    # Lower-case and de-duplicate: each vocabulary term contributes at most 1
    # to a document's score, so a repeated query word must not raise the
    # threshold that AND compares against.
    query_terms = {term.lower() for term in query_terms}

    if df is None:
        df = pd.read_csv(r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv")

    # Use CountVectorizer to get a binary term-document matrix; missing
    # titles are treated as empty documents instead of crashing the tokenizer.
    vectorizer = CountVectorizer(binary=True, stop_words="english")
    X = vectorizer.fit_transform(df['Title'].fillna(""))
    terms = vectorizer.get_feature_names_out()
    td_matrix = X.toarray()

    # 1 for every vocabulary entry that is a query term, 0 otherwise.
    query_vector = [1 if term in query_terms else 0 for term in terms]

    # Per-document count of distinct query terms present in the title.
    result = td_matrix @ query_vector

    if operator == "AND":
        mask = result == len(query_terms)
    elif operator == "OR":
        mask = result > 0
    else:  # "NOT"
        mask = result == 0
    return df.loc[mask, 'Doc_NO'].tolist()

In [105]:
# Demo call: list the Doc_NO values whose title contains both query words.
print("AND:", boolean_query( ["experimental", "investigation"], "AND"))

AND: [1, 84, 189, 372, 423, 497, 569, 662, 766, 816, 836, 858, 1062, 1074, 1075, 1098, 1156, 1159, 1364]


Testing

In [106]:
def main():
    """Interactive driver: ask for a search method, then run one query.

    Option 1 builds the index if needed and looks up a single term in it.
    Option 2 runs a Boolean query over the words typed by the user. Invalid
    menu choices and invalid Boolean operators are reported with ``print``
    instead of raising.
    """
    print("=== Step 1: Choose the Searching method ===")
    print("1. Regular Indexing + Search")
    print("2. Boolean Retreivel")
    try:
        option = int(input("Enter the number of the search: "))
    except ValueError:
        print("Invalid choice. Please enter 1 or 2.")
        return

    if option == 1:
        Regular_indexing()
        query = input("Enter your query: ")
        search_term_regular_indexing(query)
    elif option == 2:
        query = input("Enter your the words: ")
        operator = input("Enter the Operator {AND,OR,NOT} : ")
        try:
            # boolean_query expects a list of words; passing the raw string
            # makes it iterate over single characters and match nothing.
            print(boolean_query(query.split(), operator.strip().upper()))
        except ValueError as err:
            print("Error:", err)
    else:
        print("Invalid choice. Please enter 1 or 2.")




In [107]:
# Run the interactive driver when this code is executed directly (a notebook
# cell or a script). The standard guard replaces a comparison that called
# main() inside the condition itself and left the guarded body unreachable.
if __name__ == "__main__":
    main()

=== Step 1: Choose the Searching method ===
1. Regular Indexing + Search
2. Boolean Retreivel
[]
