# Search Engine Project: Phases 1, 2, and 3

This notebook implements a search engine for the Cranfield dataset across three phases:
- **Phase 1**: Indexes the dataset using PyTerrier.
- **Phase 2 (Query Preprocessing)**: Processes queries with TF-IDF ranking, fixing identical scores and restrictive retrieval.
- **Phase 3 (Query Expansion)**: Adds query expansion (relevance feedback, synonym mapping, and BERT embeddings), a Gradio interface, and a precision@5 evaluation.

## Phase 1: Indexing the Cranfield Dataset

Preprocessing the dataset, comparing stemming methods, and creating a PyTerrier index.

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [None]:
# Raw strings keep every Windows backslash literal. In a normal string,
# sequences such as "\D" or "\p" are invalid escapes that newer Python
# versions warn about (the resulting path text is unchanged).
input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran.all.1400.csv"
output_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"

In [None]:
# Load the raw Cranfield CSV and show a quick overview of its contents.
print("=== Loading the Cranfield Dataset ===")
data = pd.read_csv(input_file)
df = pd.DataFrame(data)
print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print("\nFirst 5 rows of raw data:")
print(df.head())

In [None]:
# Report how many Title/Text cells are empty before any rows are removed.
print("\n=== Checking for Missing Values ===")
for column in ('Title', 'Text'):
    print(f"Missing values in '{column}':", df[column].isna().sum())
print("Total rows before dropping NaN:", len(df))

In [None]:
# Discard rows without a Title; the title pipeline below starts from that column.
df = df.dropna(subset=['Title'])
print(f"Total rows after dropping NaN in Title: {len(df)}")
print("\nFirst 5 rows after dropping NaN:")
print(df.head())

In [None]:
print("\n=== Step 1: Cleaning Titles ===")
# Per title: drop everything except letters and whitespace, squeeze whitespace
# runs into single spaces, trim the ends, then lowercase.
cleaned_titles = [
    re.sub(r'\s+', ' ', re.sub(r'[^a-zA-Z\s]', '', str(raw_title))).strip().lower()
    for raw_title in df['Title']
]
df['Cleaned_Title'] = cleaned_titles
print("Sample of cleaned Titles (first 2 rows):")
print(df[['Doc_NO', 'Cleaned_Title']].head(2))

In [None]:
print("\n=== Step 2: Tokenizing Titles and Vocabulary Analysis ===")
# Bag-of-words over the cleaned titles: alphabetic tokens only, with the
# built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words="english", lowercase=True, token_pattern=r'\b[a-zA-Z]+\b')
vector = vectorizer.fit_transform(df['Cleaned_Title'])
terms = vectorizer.get_feature_names_out()
vocabulary_size = len(terms)
print("Total unique terms in Titles:", vocabulary_size)
print("First 20 terms in Title vocabulary:", terms[:20])

In [None]:
# Whitespace tokenisation is enough here: the cleaned titles hold only
# lowercase letters separated by single spaces.
tokenized_titles = [cleaned.split() for cleaned in df['Cleaned_Title']]
df['Title_Tokens'] = tokenized_titles
print("\nSample tokenized Titles (first 2 rows):")
print(df[['Doc_NO', 'Title_Tokens']].head(2))

In [None]:
print("\n=== Step 3: Comparing Stemming Methods ===")
# One instance of each NLTK stemmer that the comparison below uses.
porter, lancaster = PorterStemmer(), LancasterStemmer()
snowball = SnowballStemmer("english")

In [None]:
# Stem every vocabulary term with each algorithm; the three lists stay
# index-aligned with `terms`.
porter_stemmed = [porter.stem(word) for word in terms]
snowball_stemmed = [snowball.stem(word) for word in terms]
lancaster_stemmed = [lancaster.stem(word) for word in terms]

In [None]:
# Side-by-side table of the first five vocabulary terms under each stemmer.
print("\nStemming Comparison (First 5 Title Terms):")
rule = "-" * 60
print(rule)
print(f"{'Original':<15} | {'Porter':<15} | {'Snowball':<15} | {'Lancaster':<15}")
print(rule)
for original, by_porter, by_snowball, by_lancaster in zip(
    terms[:5], porter_stemmed, snowball_stemmed, lancaster_stemmed
):
    print(f"{original:<15} | {by_porter:<15} | {by_snowball:<15} | {by_lancaster:<15}")
print(rule)

In [None]:
print("\nApplying Snowball Stemming to Title Tokens...")
# Snowball-stem each token of each title, keeping one token list per row.
stemmed_titles = [
    [snowball.stem(word) for word in tokens]
    for tokens in df['Title_Tokens']
]
df['Stemmed_Title_Tokens'] = stemmed_titles
print("Sample stemmed Titles (first 2 rows):")
print(df[['Doc_NO', 'Stemmed_Title_Tokens']].head(2))

In [None]:
print("\n=== Step 4: Creating Processed_Text from Titles for Indexing ===")
# Re-join the stemmed tokens into one space-separated string per document.
processed_text = [" ".join(stemmed_tokens) for stemmed_tokens in df['Stemmed_Title_Tokens']]
df['Processed_Text'] = processed_text
print("Sample Processed_Text from Titles (first 2 rows):")
print(df[['Doc_NO', 'Processed_Text']].head(2))

In [None]:
print("\n=== Step 6: Saving Processed Data ===")
# Only the original columns plus the indexable text are written out.
export_columns = ['Doc_NO', 'Title', 'Bib', 'Text', 'Processed_Text']
output_df = df[export_columns]
output_df.to_csv(output_file, index=False)
print(f"Saved to: {output_file}")
print("Final output (first 5 rows):")
print(output_df.head())

In [None]:
print("\n=== Step 5: Creative Title Insights ===")
# Token count per title, computed once and reused for the statistics below.
title_lengths = df['Title_Tokens'].apply(len)
print("Average token count per Title:", round(title_lengths.mean(), 2))
# idxmax() returns an index label, so Doc_NO is looked up by label.
longest_label = title_lengths.idxmax()
print("Longest Title (tokens):", title_lengths.max(), "in Doc_NO:",
      df['Doc_NO'][longest_label])
print("Most frequent term in Titles (before stemming):")
# Column sums of the document-term matrix give the corpus-wide term counts.
word_counts = vector.toarray().sum(axis=0)
top_term_idx = word_counts.argmax()
print(f"'{terms[top_term_idx]}' appears {word_counts[top_term_idx]} times")

In [None]:
!pip install python-terrier

In [None]:
import pyterrier as pt

In [None]:
# Start the Java Virtual Machine for PyTerrier unless it is already running.
jvm_running = pt.java.started()
if not jvm_running:
    pt.java.init()
    print("Java Virtual Machine started!")

In [None]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\D" or "\p" (same path text as before).
input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed_modern.csv"

In [None]:
# Reload the preprocessed collection that the indexing steps below consume.
df = pd.read_csv(input_file)
# A title that cleaned down to an empty string is read back from CSV as NaN;
# restore the empty string so the column holds only text when it is indexed
# and split later on.
df['Processed_Text'] = df['Processed_Text'].fillna('')
print(df.head())

In [None]:
# String document identifier derived from the Doc_NO column.
df["docno"] = df["Doc_NO"].astype(str)
print("\nSample with docno (first 2 rows):")
preview_columns = ['docno', 'Title', 'Processed_Text']
print(df[preview_columns].head(2))

In [None]:
print("\n=== Step 1: Creating and Indexing the Titles ===")
# Any existing index in this directory is replaced (overwrite=True).
# NOTE(review): the indexer's default term pipeline may stem / stop-word
# filter the already-stemmed text a second time — confirm against the
# PyTerrier version in use.
index_dir = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\preprocessing\CranfieldTitleIndex"
indexer = pt.DFIndexer(index_dir, overwrite=True)
index_ref = indexer.index(df["Processed_Text"], df["docno"])
index_location = index_ref.toString()
print("Index location:", index_location)
print("Indexing complete! Stored at:", index_location)

In [None]:
print("\n=== Step 2: Loading the Index ===")
# Open the index behind index_ref; the cells below read its lexicon and
# inverted index through this `index` object.
index = pt.IndexFactory.of(index_ref)
print("Index loaded successfully!")

In [None]:
# Print the key and statistics of the first ten lexicon entries.
lexicon = index.getLexicon()
count = 0
for kv in lexicon:
    if count >= 10:
        break
    term, entry = kv.getKey(), kv.getValue()
    print(f"{term} -> Nt={entry.getNumberOfEntries()} TF={entry.getFrequency()} maxTF={entry.getMaxFrequencyInDocuments()}")
    count += 1

In [None]:
print("\n=== Step 5: Setting Up Search Function ===")
def search_term(term):
    """Print every indexed document that contains a single term.

    The term is lowercased and Snowball-stemmed, looked up in the lexicon of
    the module-level ``index``, and each posting is printed with its internal
    id, its docno (read from the module-level ``df``) and its length.

    Args:
        term: A single word to look up.

    Returns:
        None. All results are printed.
    """
    stemmer = SnowballStemmer("english")
    term = term.lower()
    stemmed_term = stemmer.stem(term)
    print(f"\nSearching for: '{term}' (stemmed: '{stemmed_term}')")

    # Only the lexicon lookup is guarded: a KeyError raised further down
    # (for example a missing 'docno' column) must not be reported as
    # "term not found".
    try:
        pointer = index.getLexicon()[stemmed_term]
    except KeyError:
        print(f"Term '{stemmed_term}' not found in the index.")
        return

    print(f"Found term '{stemmed_term}' with stats: {pointer.toString()}")
    print("Documents containing the term:")
    postings = index.getInvertedIndex().getPostings(pointer)
    for posting in postings:
        doc_id = posting.getId()
        doc_length = posting.getDocumentLength()
        # assumes the internal doc id equals the row position in df — TODO confirm
        print(f"- Doc ID: {doc_id} (docno: {df['docno'].iloc[doc_id]}), Length: {doc_length}")

In [None]:
# Two sample lookups: an ordinary English word and a proper name
# (presumably absent from the index, exercising the not-found message).
search_term("information")
search_term("Omar")

## Phase 2: Query Processing with TF-IDF Ranking

This phase processes queries, retrieves documents, and ranks them using TF-IDF. It fixes:
- **Identical scores (1.4142)**: Uses full vocabulary and cosine similarity for varied scores.
- **Restrictive retrieval**: Retrieves documents with *any* query term, improving recall for queries like 'information retrieval'.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
def preprocess_query(query, stemmer=None):
    """Normalise a free-text query into a list of stemmed tokens.

    The query is lowercased, stripped of every character that is not a letter
    or whitespace, whitespace-collapsed, split on spaces and stemmed.

    Args:
        query: The raw query string.
        stemmer: Any object with a ``stem(token)`` method. When omitted, an
            English Snowball stemmer is created at call time; the previous
            default was instantiated once at definition time and shared by
            every call.

    Returns:
        A list of stemmed tokens, empty when no letters remain.
    """
    if stemmer is None:
        stemmer = SnowballStemmer('english')
    query = query.lower()
    query = re.sub(r'[^a-zA-Z\s]', '', query)
    query = re.sub(r'\s+', ' ', query).strip()
    return [stemmer.stem(token) for token in query.split()]

# Quick check of the query normaliser on a mixed-case sample
sample_query = 'Experimental Aerodynamics Wing'
print(f'Sample query: {sample_query}')
print(f'Preprocessed tokens: {preprocess_query(sample_query)}')

In [None]:
def retrieve_documents(query_tokens, index, df):
    """Collect every document that contains at least one query token.

    Args:
        query_tokens: Preprocessed (stemmed) query terms.
        index: Index object exposing ``getLexicon()`` and
            ``getInvertedIndex()``.
        df: DataFrame with 'docno', 'Title' and 'Processed_Text' columns.
            Rows are addressed by position with the internal doc id
            (assumes indexing order equals row order — TODO confirm).

    Returns:
        A list of dicts with keys 'doc_id', 'docno', 'title' and
        'processed_text', ordered by ascending doc id. The list is empty when
        no token matches.
    """
    lexicon = index.getLexicon()
    inverted_index = index.getInvertedIndex()
    doc_ids = set()

    for token in query_tokens:
        # Guard only the lexicon lookup so unrelated KeyErrors are not hidden.
        try:
            pointer = lexicon[token]
        except KeyError:
            print(f"Term '{token}' not found in index.")
            continue
        doc_ids.update(posting.getId() for posting in inverted_index.getPostings(pointer))

    docnos, titles, texts = df['docno'], df['Title'], df['Processed_Text']
    # Sorting replaces arbitrary set iteration order, so equal-score
    # documents keep a reproducible order when ranked afterwards.
    return [
        {
            'doc_id': doc_id,
            'docno': docnos.iloc[doc_id],
            'title': titles.iloc[doc_id],
            'processed_text': texts.iloc[doc_id],
        }
        for doc_id in sorted(doc_ids)
    ]

In [None]:
def rank_documents(documents, query_tokens, original_query_tokens=None):
    """Score documents against the query with TF-IDF cosine similarity.

    The TF-IDF model is fitted on the 'processed_text' of the documents passed
    in, so term weights reflect that candidate set only. When
    ``original_query_tokens`` is given, the final score is the even blend of
    the similarity to ``query_tokens`` and to the original tokens.

    Each document dict receives a 'tfidf_score' entry (the input dicts are
    modified in place).

    Returns:
        The documents sorted by descending score, or [] for empty input.
    """
    if not documents:
        return []

    texts = [entry['processed_text'] for entry in documents]
    tfidf = TfidfVectorizer(
        lowercase=False,
        token_pattern=r'(?u)\b\w+\b',
        norm='l2',
        use_idf=True,
        smooth_idf=True
    )

    def similarity_to(tokens, doc_matrix):
        # Cosine similarity between one token list and every document row.
        return cosine_similarity(tfidf.transform([' '.join(tokens)]), doc_matrix).flatten()

    try:
        doc_matrix = tfidf.fit_transform(texts)
        scores = similarity_to(query_tokens, doc_matrix)
        if original_query_tokens:
            scores = 0.5 * scores + 0.5 * similarity_to(original_query_tokens, doc_matrix)
    except ValueError as e:
        # Fitting or transforming raised ValueError; fall back to all-zero scores.
        print(f'TF-IDF calculation failed: {e}')
        scores = [0] * len(documents)

    for entry, score in zip(documents, scores):
        entry['tfidf_score'] = score

    return sorted(documents, key=lambda entry: entry['tfidf_score'], reverse=True)

In [None]:
def search(query, index, df, top_k=5):
    """Run the preprocess, retrieve, rank pipeline and print the best hits.

    Args:
        query: Raw query string.
        index: Index handed to ``retrieve_documents``.
        df: Document DataFrame handed to ``retrieve_documents``.
        top_k: Number of results to print; it does not shorten the return value.

    Returns:
        The complete ranked document list, or [] when nothing matches.
    """
    print(f'\n=== Searching for: "{query}" ===')
    query_tokens = preprocess_query(query)
    print('Query tokens:', query_tokens)

    candidates = retrieve_documents(query_tokens, index, df)
    if not candidates:
        print('No documents found.')
        return []
    print(f'Found {len(candidates)} documents.')

    ranked_docs = rank_documents(candidates, query_tokens)

    print(f'Top {min(top_k, len(ranked_docs))} results:')
    for position, doc in enumerate(ranked_docs[:top_k], start=1):
        print(f'{position}. Docno: {doc["docno"]}, Title: {doc["title"]}, TF-IDF Score: {doc["tfidf_score"]:.4f}')

    return ranked_docs

# Test searches
# Three sample queries; each call prints its own report. As the last
# expression of the cell, the final call's return value is also displayed.
search('aerodynamics wing', index, df)
search('information retrieval', index, df)
search('nonexistent term', index, df)

## Phase 3: Query Expansion, User Interface, and Evaluation

This phase enhances the search engine with query expansion (relevance feedback, synonym mapping, BERT), a Gradio interface, and evaluation with precision@5.

In [None]:
!pip install transformers==4.44.2 torch==2.4.1 gradio==4.44.0
from collections import Counter
from transformers import BertTokenizer, BertModel
import torch
import gradio as gr
import numpy as np

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated
# expansions reuse them instead of reloading the weights on every call.
_BERT_CACHE = {}

# Number of lexicon entries embedded as BERT expansion candidates.
_BERT_VOCAB_LIMIT = 100

# Minimum cosine similarity a candidate needs to be accepted.
_BERT_SIMILARITY_THRESHOLD = 0.85


def _load_bert(model_name='bert-base-uncased'):
    """Return the cached (tokenizer, model) pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        print('Loading BERT...')
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def _unique_in_order(items):
    """Drop duplicates while keeping first-seen order."""
    return list(dict.fromkeys(items))


def expand_query(query, index, df, expansion_type='bert', top_k=3, num_terms=1):
    """Expand a query with synonym, relevance-feedback and BERT terms.

    Synonym and relevance-feedback terms are always added; a BERT-selected
    term is added only when ``expansion_type == 'bert'``.

    Args:
        query: Raw query string.
        index: Index object; its lexicon decides which terms are usable.
        df: Document DataFrame, forwarded to ``search``.
        expansion_type: 'bert' enables the embedding step; any other value
            skips it.
        top_k: Number of top-ranked documents of the initial search that
            relevance feedback reads.
        num_terms: Maximum number of synonym terms and of feedback terms.

    Returns:
        The original tokens followed by the feedback, synonym and embedding
        terms, without duplicates, in that order.
    """
    query_tokens = preprocess_query(query)
    print(f'Original query tokens: {query_tokens}')

    lexicon = index.getLexicon()

    def is_in_index(term):
        return term in lexicon

    # Hand-written map from stemmed query terms to related stemmed terms.
    synonym_map = {
        'aerodynam': ['flow', 'lift', 'airfoil'],
        'wing': ['airfoil', 'flap', 'swept'],
        'boundari': ['layer', 'shear', 'turbul'],
        'flow': ['stream', 'current'],
        'experiment': ['test', 'trial'],
        'inform': ['data', 'report', 'compil'],
        'retriev': ['search', 'queri', 'fetch']
    }
    synonym_candidates = []
    for token in query_tokens:
        for syn in synonym_map.get(token, []):
            if syn not in query_tokens and is_in_index(syn):
                synonym_candidates.append(syn)
    # De-duplicate before truncating so a repeated synonym (e.g. 'airfoil')
    # cannot use up more than one of the num_terms slots.
    synonym_terms = _unique_in_order(synonym_candidates)[:num_terms]
    print(f'Synonym terms: {synonym_terms}')

    stop_words = set(stopwords.words('english'))
    initial_docs = search(query, index, df, top_k=top_k)
    feedback_terms = []
    if initial_docs:
        # search() returns the full ranking; feedback must only read the
        # top_k documents, otherwise low-ranked documents drive the counts.
        term_counts = Counter(
            term
            for doc in initial_docs[:top_k]
            for term in doc['processed_text'].split()
        )
        excluded = set(query_tokens) | set(synonym_terms) | stop_words
        for term, _count in term_counts.most_common():
            if len(feedback_terms) >= num_terms:
                break
            if term not in excluded and is_in_index(term):
                feedback_terms.append(term)
        print(f'Relevance feedback terms: {feedback_terms}')
    else:
        print('No documents found for initial query.')

    embedding_terms = []
    if expansion_type == 'bert':
        tokenizer, model = _load_bert()

        # Candidates are limited to the first entries the lexicon yields.
        vocab = []
        for kv in lexicon:
            if len(vocab) >= _BERT_VOCAB_LIMIT:
                break
            vocab.append(kv.getKey())

        if vocab:  # an empty lexicon leaves nothing to embed
            query_inputs = tokenizer(' '.join(query_tokens), return_tensors='pt', padding=True, truncation=True)
            vocab_inputs = tokenizer(vocab, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():
                query_outputs = model(**query_inputs)
                vocab_outputs = model(**vocab_inputs)
            # The first-token hidden state represents each input.
            query_embed = query_outputs.last_hidden_state[:, 0, :].mean(dim=0).numpy()
            vocab_embeds = vocab_outputs.last_hidden_state[:, 0, :].numpy()

            similarities = cosine_similarity([query_embed], vocab_embeds).flatten()
            already_used = set(query_tokens) | set(synonym_terms) | set(feedback_terms) | stop_words
            # Walk candidates from most to least similar; keep the first
            # unused one above the threshold.
            for i in np.argsort(similarities)[::-1]:
                if similarities[i] > _BERT_SIMILARITY_THRESHOLD:
                    term = vocab[i]
                    if term not in already_used and is_in_index(term):
                        embedding_terms.append(term)
                        break
        print(f'BERT embedding terms: {embedding_terms}')

    # Order-preserving de-duplication; list(set(...)) produced a hash-dependent
    # order that could differ between interpreter runs.
    return _unique_in_order(query_tokens + feedback_terms + synonym_terms + embedding_terms)

# Run the expansion once per supported mode on the same sample query
test_query = 'aerodynamics wing'
for exp_type in ('synonym', 'bert'):
    print(f'\nTesting {exp_type.upper()} expansion:')
    expanded_tokens = expand_query(test_query, index, df, expansion_type=exp_type)
    print(f'Expanded query: {expanded_tokens}')

In [None]:
def search_with_expansion(query, index, df, expansion_type='bert', top_k=5):
    """Search with an expanded query and print the best hits.

    Documents are retrieved with the expanded tokens and ranked with both the
    expanded and the original tokens (see ``rank_documents``).

    Args:
        query: Raw query string.
        index: Index used for expansion and retrieval.
        df: Document DataFrame.
        expansion_type: Forwarded to ``expand_query``.
        top_k: Number of results to print; the return value is not shortened.

    Returns:
        The complete ranked document list, or [] when nothing matches.
    """
    print(f'\n=== Searching for "{query}" with {expansion_type.upper()} expansion ===')
    original_query_tokens = preprocess_query(query)
    query_tokens = expand_query(query, index, df, expansion_type=expansion_type)
    print(f'Using query tokens: {query_tokens}')

    candidates = retrieve_documents(query_tokens, index, df)
    if candidates:
        print(f'Found {len(candidates)} documents.')
        ranked_docs = rank_documents(candidates, query_tokens, original_query_tokens=original_query_tokens)

        print(f'Top {min(top_k, len(ranked_docs))} results:')
        for position, doc in enumerate(ranked_docs[:top_k], start=1):
            print(f'{position}. Docno: {doc["docno"]}, Title: {doc["title"]}, TF-IDF Score: {doc["tfidf_score"]:.4f}')

        return ranked_docs

    print('No documents found.')
    return []

# Test
# Sample run with BERT expansion; as the last expression of the cell, the
# returned ranking is also displayed by the notebook.
search_with_expansion('aerodynamics wing', index, df, expansion_type='bert')

In [None]:
def search_query_interface(query, expansion_type):
    """Gradio callback: run an expanded search and return JSON-ready results.

    Args:
        query: Text typed by the user; None or blank input yields no results.
        expansion_type: Dropdown label, 'Synonym' or 'BERT'. Unknown labels
            fall back to BERT expansion.

    Returns:
        ``{'results': [...]}`` with at most ten entries, each holding 'docno',
        'title' and 'tfidf_score' (rounded to four decimals).
    """
    # A blank query has nothing to search for; skip the expensive pipeline.
    if not query or not query.strip():
        return {'results': []}

    expansion_map = {'Synonym': 'synonym', 'BERT': 'bert'}
    exp_type = expansion_map.get(expansion_type, 'bert')
    top_k = 10
    results = search_with_expansion(query, index, df, expansion_type=exp_type, top_k=top_k)

    # search_with_expansion returns the full ranking (top_k only limits what
    # it prints), so the slice is what actually caps the output at top_k.
    output = [
        {
            'docno': doc['docno'],
            'title': doc['title'],
            # float() turns a numpy scalar into a plain Python float for JSON
            'tfidf_score': round(float(doc['tfidf_score']), 4),
        }
        for doc in results[:top_k]
    ]
    return {'results': output}

# Call the Gradio callback directly to check the payload it returns
test_result = search_query_interface('aerodynamics wing', 'BERT')
print(f'Interface test output: {test_result}')

In [None]:
# Build and launch the web UI: a query textbox and an expansion-type dropdown
# feed search_query_interface, whose dict result is shown in a JSON panel.
gr.Interface(
     fn=search_query_interface,
     inputs=[
         gr.Textbox(label='Enter your query', placeholder='e.g., aerodynamics wing'),
         gr.Dropdown(choices=['Synonym', 'BERT'], label='Expansion Type', value='BERT')
     ],
     outputs=gr.JSON(label='Search Results'),
     title='Cranfield Search Engine',
     description='Search the Cranfield dataset with query expansion.'
 ).launch()

In [None]:
def evaluate_search_engine(index, df):
    """Compute precision@5 for a few fixed queries under each expansion mode.

    The expected docnos per query are hard-coded in this function. Precision
    is the number of expected docnos among the first five results divided by
    five, or 0 when nothing was retrieved.

    Args:
        index: Index handed to ``search_with_expansion``.
        df: Document DataFrame handed to ``search_with_expansion``.

    Returns:
        ``{expansion_type: {query: {'precision@5': value}}}``.
    """
    test_cases = {
        'aerodynamics wing': ['1', '140', '141'],
        'boundary layer': ['3', '4', '142'],
        'information retrieval': ['440']
    }
    metrics = {}

    for exp_type in ('synonym', 'bert'):
        print(f'\n=== Evaluating with {exp_type.upper()} expansion ===')
        per_query = {}
        metrics[exp_type] = per_query
        for query, relevant_docnos in test_cases.items():
            print(f'Query: "{query}" (Expected docnos: {relevant_docnos})')
            results = search_with_expansion(query, index, df, expansion_type=exp_type, top_k=5)
            top_docs = results[:5]
            retrieved_docnos = [doc['docno'] for doc in top_docs]
            relevant_retrieved = len(set(retrieved_docnos).intersection(relevant_docnos))
            if retrieved_docnos:
                precision = relevant_retrieved / 5
            else:
                precision = 0
            per_query[query] = {'precision@5': precision}
            print(f'Retrieved docnos: {retrieved_docnos}')
            print(f'Retrieved titles: {[doc["title"] for doc in top_docs]}')
            print(f'Precision@5: {precision:.2f} ({relevant_retrieved}/5 relevant)')
            print('Speed: Fast response for small dataset.')

    return metrics

# Evaluate both expansion modes and show the collected metrics
evaluation_results = evaluate_search_engine(index, df)
print(f'\nFinal Evaluation Results: {evaluation_results}')

In [None]:
# def main():
#     print("=== Step 1: Choose the Searching method ===")
#     print("1. Regular Indexing + Search")
#     print("2. Boolean Retrieval")
#     print("3. TF-IDF Retrieval")

#     option = int(input("Enter the number of the search: "))
    
#     if option == 1:
#         Regular_indexing()
#         query = input("Enter your query: ")
#         query_terms=preprocess_query(query)
#         search_term_regular_indexing(query_terms)
#         print("================Retreving the Query=================")
#         retrieve_documents(query_terms,df)
        
#     if option == 2 :

#         query = input("Enter your the words: ")
#         # operator=input("Enter the Operator {AND,OR,NOT} : ")
#         # print(boolean_query(query, operator.upper()))
#         boolean_search(query,df)
#     if option ==3 : 
#         query=input("Enter your the words: ")
#         search(query,df,top_k=5)
           

