For Regular Indexing 

In [142]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [143]:
# Absolute location of the preprocessed collection CSV that the next cell loads.
# NOTE(review): this is a machine-specific Windows path, and the same literal is
# repeated inside Regular_indexing() and boolean_query(); keep all three in sync.
input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"

In [144]:
# Load the preprocessed collection into the notebook-level DataFrame `df`;
# later cells pass this frame to retrieve_documents(), boolean_search() and search().
df=pd.read_csv(input_file)

In [145]:
# Preview the first rows to confirm the columns used later (Doc_NO, Title, Processed_Text) are present.
df.head()

Unnamed: 0,Doc_NO,Title,Bib,Text,Processed_Text
0,1,experimental investigation of the aerodynamics...,"j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experiment investig of the aerodynam of a wing...
1,2,simple shear flow past a flat plate in an inco...,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simpl shear flow past a flat plate in an incom...
2,3,the boundary layer in simple shear flow past a...,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundari layer in simpl shear flow past a ...
3,4,approximate solutions of the incompressible la...,"j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,approxim solut of the incompress laminar bound...
4,5,one-dimensional transient heat conduction into...,"j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,onedimension transient heat conduct into a dou...


In [146]:
import pandas as pd
import pyterrier as pt
import os

def Regular_indexing(input_file=None, index_path=None):
    """Build the on-disk Terrier index over ``Processed_Text`` if it is missing.

    Starts PyTerrier when it is not already running, loads the preprocessed
    CSV, validates it, and indexes the ``Processed_Text`` column keyed by the
    string form of ``Doc_NO``. An existing, non-empty index directory is left
    untouched.

    Args:
        input_file: Path to the preprocessed CSV. Defaults to the notebook's
            original hard-coded location.
        index_path: Directory for the index. Defaults to
            ``./CranfieldTitleIndex`` (resolved to an absolute path).

    Returns:
        The absolute path of the index directory.

    Raises:
        AssertionError: If ``Processed_Text`` or ``docno`` contain nulls, or if
            ``docno`` values are not unique.

    Note:
        An index built before the term pipeline was disabled still contains
        doubly-stemmed terms; delete the index directory to force a rebuild.
    """
    if not pt.started():
        pt.init()
        print("Java Virtual Machine started!")

    if input_file is None:
        input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"
    df = pd.read_csv(input_file)
    df["docno"] = df["Doc_NO"].astype(str)

    # Validation checks
    assert df["Processed_Text"].notnull().all(), "Processed_Text has null values!"
    assert df["docno"].notnull().all(), "docno has null values!"
    assert df["docno"].is_unique, "docno values are not unique!"

    if index_path is None:
        index_path = "./CranfieldTitleIndex"
    index_path = os.path.abspath(index_path)

    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("\nIndexing documents...")
        indexer = pt.DFIndexer(index_path, overwrite=True)
        # Processed_Text is already stemmed. Terrier's default pipeline
        # (Stopwords + PorterStemmer) would stem it a second time, e.g.
        # "experiment" -> "experi", so singly-stemmed query tokens would never
        # match the lexicon. Index the tokens exactly as they appear instead.
        indexer.setProperty("termpipelines", "")
        index_ref = indexer.index(df["Processed_Text"], df["docno"])
        print("Index created at:", index_ref.toString())
    else:
        print("Index already exists at:", index_path)

    return index_path


In [147]:
from nltk.stem import SnowballStemmer

def search_term_regular_indexing(query_term):
    """Print the posting list of every query term found in the index.

    Each term is lower-cased and stemmed with the English Snowball stemmer
    before it is looked up in the lexicon of ``./CranfieldTitleIndex``.

    Args:
        query_term: An iterable of terms, or a single term given as a string.

    Returns:
        None. All results are printed.
    """
    index_path = os.path.abspath("./CranfieldTitleIndex")
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("Index not found. Run Regular_indexing() first.")
        return

    # A bare string would otherwise be iterated one character at a time.
    if isinstance(query_term, str):
        query_term = [query_term]

    index = pt.IndexFactory.of(index_path)
    stemmer = SnowballStemmer("english")

    # The index structures do not change between terms, so fetch them once.
    try:
        lexicon = index.getLexicon()
        inverted_index = index.getInvertedIndex()
        meta = index.getMetaIndex()
    except Exception as e:
        print("Search failed:", e)
        return

    for term in query_term:
        term = term.lower()
        stemmed_term = stemmer.stem(term)

        print(f"\nSearching for: '{term}' (stemmed: '{stemmed_term}')")

        try:
            if stemmed_term not in lexicon:
                print(f"Term '{stemmed_term}' not found in the index.")
                # Move on to the next term; returning here would silently
                # skip every term that follows a missing one.
                continue

            pointer = lexicon[stemmed_term]
            print(f"Found term '{stemmed_term}' with stats: {pointer.toString()}")

            print("Documents containing the term:")
            for posting in inverted_index.getPostings(pointer):
                doc_id = posting.getId()
                docno = meta.getItem("docno", doc_id)
                doc_length = posting.getDocumentLength()
                print(f"- Doc ID: {doc_id} (docno: {docno}), Length: {doc_length}")
        except Exception as e:
            # Best-effort lookup: report the failure and continue with the next term.
            print("Search failed:", e)


Adding Boolean Retrieval

In [148]:
def boolean_query(query_terms, operator, df=None):
    """Run a boolean AND / OR / NOT query over the document titles.

    A binary term-document matrix is built from the ``Title`` column with
    ``CountVectorizer`` (English stop words removed) and multiplied by a 0/1
    query vector, giving for each document the number of distinct query terms
    its title contains.

    Args:
        query_terms: Iterable of query words. Matching is case-insensitive and
            repeated words are counted once.
        operator: ``"AND"`` (title contains every term), ``"OR"`` (at least one
            term) or ``"NOT"`` (none of the terms); case-insensitive.
        df: Optional DataFrame with ``Title`` and ``Doc_NO`` columns. When
            omitted, the preprocessed CSV is read from its default location.

    Returns:
        List of ``Doc_NO`` values of the matching documents.

    Raises:
        ValueError: If ``operator`` is not one of AND, OR, NOT.

    Note:
        Stop words are not part of the vocabulary, so an AND query that
        includes one matches nothing.
    """
    # Validate first so a bad operator fails before any expensive work.
    op = operator.upper()
    if op not in ("AND", "OR", "NOT"):
        raise ValueError("Operator must be 'AND', 'OR', or 'NOT'")

    if df is None:
        df = pd.read_csv(r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv")

    # Use a set: a repeated term contributes at most 1 to the per-document count,
    # so comparing against the raw list length would make AND unsatisfiable.
    wanted = {term.lower() for term in query_terms}

    # Use CountVectorizer to get binary term-document matrix
    vectorizer = CountVectorizer(binary=True, stop_words="english")
    X = vectorizer.fit_transform(df['Title'].fillna(""))  # a missing title behaves as empty text
    terms = vectorizer.get_feature_names_out()
    td_matrix = X.toarray()

    # Create a query vector aligned with the vocabulary order
    query_vector = [1 if term in wanted else 0 for term in terms]

    # Per document: how many distinct query terms occur in its title
    hits = td_matrix @ query_vector

    if op == "AND":
        mask = hits == len(wanted)
    elif op == "OR":
        mask = hits > 0
    else:  # NOT
        mask = hits == 0
    return df.loc[mask, 'Doc_NO'].tolist()

In [149]:
# Demo: Doc_NO values of documents whose Title contains every listed query term.
print("AND:", boolean_query( ["experimental", "investigation"], "AND"))

AND: [1, 84, 189, 372, 423, 497, 569, 662, 766, 816, 836, 858, 1062, 1074, 1075, 1098, 1156, 1159, 1364]


Adding Query preprocessing 

In [150]:
def preprocess_query(query, stemmer=SnowballStemmer('english')):
    """Normalise a free-text query into a list of stemmed tokens.

    The query is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, whitespace runs are collapsed, and each
    remaining word is passed through ``stemmer.stem``.

    Args:
        query: Raw query string.
        stemmer: Object exposing ``stem(word)``; defaults to the English
            Snowball stemmer.

    Returns:
        List of stemmed tokens in query order (empty for a blank query).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', query.lower())
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()
    return list(map(stemmer.stem, collapsed.split()))


# Quick check of preprocess_query() on a mixed-case sample: print the raw
# query followed by the stemmed tokens it is reduced to.
sample_query = 'Experimental Aerodynamics Wing'
print('Sample query:', sample_query)
print('Preprocessed query tokens:', preprocess_query(sample_query))

Sample query: Experimental Aerodynamics Wing
Preprocessed query tokens: ['experiment', 'aerodynam', 'wing']


In [151]:

def retrieve_documents(query_tokens, df):
    """Return the documents that contain every query token (conjunctive match).

    Each token is looked up as-is in the lexicon of ``./CranfieldTitleIndex``;
    the posting lists are intersected and the surviving internal document ids
    are mapped to rows of ``df`` by position.

    Args:
        query_tokens: Already preprocessed (stemmed) query tokens.
        df: DataFrame with ``Doc_NO``, ``Title`` and ``Processed_Text``
            columns. Assumed to be in the same row order as when the index
            was built, since ids are resolved with ``iloc`` (TODO confirm).

    Returns:
        List of dicts with keys ``doc_id``, ``docno``, ``title`` and
        ``processed_text``, ordered by ``doc_id``. Empty when the index is
        missing, a token is not in the lexicon, or no document contains all
        tokens.
    """
    index_path = os.path.abspath("./CranfieldTitleIndex")
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("Index not found. Run Regular_indexing() first.")
        # Always hand back a list so callers can use len() and slicing.
        return []

    index = pt.IndexFactory.of(index_path)
    lexicon = index.getLexicon()
    inverted_index = index.getInvertedIndex()

    common_docs = None
    for token in query_tokens:
        try:
            pointer = lexicon[token]
        except KeyError:
            print(f"Term '{token}' not found in index.")
            return []
        doc_ids = {posting.getId() for posting in inverted_index.getPostings(pointer)}
        common_docs = doc_ids if common_docs is None else common_docs & doc_ids

    if not common_docs:
        return []

    results = []
    # Sort so the output order is deterministic rather than set-dependent.
    for doc_id in sorted(common_docs):
        results.append({
            'doc_id': doc_id,
            'docno': df['Doc_NO'].iloc[doc_id],
            'title': df['Title'].iloc[doc_id],
            'processed_text': df['Processed_Text'].iloc[doc_id]
        })
    # Returning the matches is the fix: the function previously built this
    # list and then fell off the end, so every caller received None.
    return results

    

# Retrieve documents for a sample query and preview the first two matches.
test_query = 'experimental investigation' #aerodynamics wing
test_tokens = preprocess_query(test_query)
# `or []` normalises a missing result (None) so the slicing below cannot fail.
docs = retrieve_documents(test_tokens, df) or []
print(f'\nDocuments retrieved for query "{test_query}":')
for doc in docs[:2]:
    print(f"Docno: {doc['docno']}, Title: {doc['title']}")

Term 'experiment' not found in index.

Documents retrieved for query "experimental investigation":


In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer
def rank_documents(documents, query_tokens):
    """Order retrieved documents by the summed TF-IDF weight of the query terms.

    A ``TfidfVectorizer`` restricted to the query vocabulary is fitted on the
    ``processed_text`` of the given documents only, so IDF is relative to the
    retrieved set rather than the whole collection.

    Args:
        documents: List of dicts that each carry a ``processed_text`` entry.
            ``None`` or an empty list yields ``[]``.
        query_tokens: Preprocessed query tokens; repeated tokens are counted
            once.

    Returns:
        The same dicts, each extended in place with a ``tfidf_score`` key,
        sorted by descending score. Every score is 0 if the TF-IDF
        computation fails.
    """
    if not documents:
        return []

    corpus = [doc['processed_text'] for doc in documents]
    # TfidfVectorizer rejects a vocabulary with repeated terms, which used to
    # zero every score for queries such as "wing wing"; de-duplicate while
    # keeping the first-seen order.
    vocabulary = list(dict.fromkeys(query_tokens))

    try:
        tfidf_matrix = TfidfVectorizer(vocabulary=vocabulary).fit_transform(corpus)
        scores = tfidf_matrix.sum(axis=1).A1
    except ValueError as e:
        print('TF-IDF calculation failed:', e)
        scores = [0] * len(documents)

    for doc, score in zip(documents, scores):
        doc['tfidf_score'] = float(score)

    return sorted(documents, key=lambda doc: doc['tfidf_score'], reverse=True)

# Rank the documents retrieved for the sample query and show the two best-scoring ones.
# rank_documents() returns [] when `docs` is None or empty, in which case the loop prints nothing.
ranked_docs = rank_documents(docs, test_tokens)
print(f'\nTop ranked documents for query "{test_query}":')
for doc in ranked_docs[:2]:
    print(f"Docno: {doc['docno']}, Title: {doc['title']}, TF-IDF Score: {doc['tfidf_score']:.4f}")


Top ranked documents for query "experimental investigation":


In [None]:
def boolean_search(query, df, top_k=5):
    """Evaluate a flat AND / OR / NOT query against the index and rank the hits.

    The query is read left to right with no precedence or parentheses. ``and``
    and ``or`` set the operator used to combine the terms that follow and stay
    in effect until changed. ``not`` complements the next term against all
    rows of ``df`` and combines it with AND. Terms are stemmed with the
    English Snowball stemmer before the lexicon lookup; a term that is not in
    the lexicon matches no document.

    Args:
        query: Query string such as ``"aerodynamics AND wing NOT supersonic"``.
        df: DataFrame with ``Title`` and ``Processed_Text`` columns plus a
            ``docno`` or ``Doc_NO`` column. Assumed to be in the same row
            order as when the index was built (TODO confirm).
        top_k: Number of ranked results to print.

    Returns:
        All matching documents as dicts, sorted by descending TF-IDF score.
        Empty when the index is missing or the query holds no terms.
    """
    index_path = os.path.abspath("./CranfieldTitleIndex")
    if not os.path.exists(index_path) or not os.listdir(index_path):
        print("Index not found. Run Regular_indexing() first.")
        return []

    query = query.lower().strip()
    tokens = re.findall(r'\w+', query)
    stemmer = SnowballStemmer('english')

    index = pt.IndexFactory.of(index_path)
    lexicon = index.getLexicon()
    inverted_index = index.getInvertedIndex()
    all_docs = set(range(len(df)))

    doc_sets = []
    positive_terms = []  # only non-negated terms should contribute to the ranking
    operator = 'AND'

    for token in tokens:
        if token in ('and', 'or', 'not'):
            operator = token.upper()
            continue
        stemmed = stemmer.stem(token)
        doc_ids = set()
        if stemmed in lexicon:
            doc_ids = {posting.getId() for posting in inverted_index.getPostings(lexicon[stemmed])}
        if operator == 'NOT':
            doc_ids = all_docs - doc_ids
            operator = 'AND'
        else:
            positive_terms.append(stemmed)
        doc_sets.append((doc_ids, operator))

    if not doc_sets:
        print('No valid terms found.')
        return []

    result_docs = set(doc_sets[0][0])
    for docs, op in doc_sets[1:]:
        if op == 'AND':
            result_docs &= docs
        elif op == 'OR':
            result_docs |= docs

    # The notebook-level frame only has 'Doc_NO'; 'docno' exists solely on the
    # copy created inside Regular_indexing(), so fall back instead of raising KeyError.
    docno_column = 'docno' if 'docno' in df.columns else 'Doc_NO'
    documents = [
        {
            'doc_id': doc_id,
            'docno': df[docno_column].iloc[doc_id],
            'title': df['Title'].iloc[doc_id],
            'processed_text': df['Processed_Text'].iloc[doc_id],
        }
        for doc_id in sorted(result_docs)
    ]

    # Rank on the distinct positive terms only: the operator words and negated
    # terms are not evidence of relevance, and duplicate terms are rejected by
    # the TF-IDF vocabulary.
    query_tokens = list(dict.fromkeys(positive_terms))
    ranked_docs = rank_documents(documents, query_tokens)

    print(f'Boolean search results for "{query}":')
    for i, doc in enumerate(ranked_docs[:top_k], 1):
        print(f'{i}. Docno: {doc["docno"]}, Title: {doc["title"]}, TF-IDF Score: {doc["tfidf_score"]:.4f}')
    return ranked_docs

# Example boolean queries: the first combines AND with NOT, the second uses OR.
boolean_search('aerodynamics AND wing NOT supersonic',df)
boolean_search('boundary OR layer',df)

In [153]:
def search(query, df, top_k=5):
    """Run the preprocess, retrieve and rank pipeline and print the best hits.

    Args:
        query: Raw query string.
        df: DataFrame handed on to ``retrieve_documents``.
        top_k: Maximum number of ranked results to print.

    Returns:
        None. Progress and results are printed.
    """
    print(f'\n=== Searching for: "{query}" ===')
    query_tokens = preprocess_query(query)
    print('Query tokens:', query_tokens)

    documents = retrieve_documents(query_tokens, df)
    if documents:
        print(f'Found {len(documents)} documents.')

        ranked_docs = rank_documents(documents, query_tokens)
        shown = ranked_docs[:top_k]

        print(f'Top {min(top_k, len(ranked_docs))} results:')
        for position, hit in enumerate(shown, 1):
            print(f'{position}. Docno: {hit["docno"]}, Title: {hit["title"]}, TF-IDF Score: {hit["tfidf_score"]:.4f}')
    else:
        # Covers both an empty result list and a missing (None) result.
        print('No documents found.')

# Example searches; the last one uses a deliberately nonexistent term to
# exercise the 'No documents found.' path.
search('experimental investigation', df)
search('information retrieval', df)
search('nonexistent term', df)


=== Searching for: "experimental investigation" ===
Query tokens: ['experiment', 'investig']
Term 'experiment' not found in index.
No documents found.

=== Searching for: "information retrieval" ===
Query tokens: ['inform', 'retriev']
Term 'retriev' not found in index.
No documents found.

=== Searching for: "nonexistent term" ===
Query tokens: ['nonexist', 'term']
Term 'nonexist' not found in index.
No documents found.


Testing

In [None]:
def main():
    """Interactive menu that dispatches to one of the notebook's search modes.

    Option 1 builds the index if needed and runs ``search``; option 2 runs
    ``boolean_query`` on whitespace-separated words with a chosen operator;
    option 3 runs ``search`` directly. Options 1 and 3 read the notebook-level
    ``df``.

    Returns:
        None. All interaction happens through ``input`` and ``print``.
    """
    print("=== Step 1: Choose the Searching method ===")
    print("1. Regular Indexing + Search")
    print("2. Boolean Retreivel")
    print("3.TF-IDF Retreiveal")

    try:
        option = int(input("Enter the number of the search: "))
    except ValueError:
        # A non-numeric entry should not end in a traceback.
        print("Invalid option. Please enter 1, 2 or 3.")
        return

    if option == 1:
        Regular_indexing()
        query = input("Enter your query: ")
        search(query, df, top_k=5)
    elif option == 2:
        query = input("Enter your the words: ").strip().lower().split()
        operator = input("Enter the Operator {AND,OR,NOT} : ")
        try:
            print(boolean_query(query, operator.strip().upper()))
        except ValueError as e:
            # boolean_query rejects unknown operators; report it instead of crashing.
            print(e)
    elif option == 3:
        query = input("Enter your the words: ")
        # search() requires the DataFrame as its second positional argument.
        search(query, df, top_k=5)
    else:
        print("Invalid option. Please enter 1, 2 or 3.")
           




In [155]:
# Launch the interactive menu when this notebook/script is run directly.
# The previous guard `main==main()` called main() while evaluating the
# condition and then compared the function with its None result, so the
# guarded call could never run.
if __name__ == "__main__":
    main()

=== Step 1: Choose the Searching method ===
1. Regular Indexing + Search
2. Boolean Retreivel
3.TF-IDF Retreiveal
[1, 84, 189, 372, 423, 497, 569, 662, 766, 816, 836, 858, 1062, 1074, 1075, 1098, 1156, 1159, 1364]
