<a href="https://colab.research.google.com/github/SujithaNamburu/Document_Search_Engine/blob/main/dodumentsearchengine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q gradio pdfplumber sentence-transformers scikit-learn nltk


In [None]:
import nltk

# Fetch the NLTK data this notebook relies on: the English stop-word list and
# the Punkt tokenizer models used by word_tokenize(). Newer NLTK releases load
# the tokenizer from "punkt_tab" instead of the older "punkt" package, so both
# are requested to keep tokenization working regardless of the installed version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)


In [None]:
from google.colab import files

# Open Colab's upload widget; the keys of the returned mapping are the names
# of the uploaded files.
uploaded = files.upload()

# If nothing was uploaded (for example the dialog was dismissed) there is no
# key to pick, so stop with a clear message instead of a bare IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is used by the rest of the notebook.
pdf_path = next(iter(uploaded))
print("Uploaded PDF:", pdf_path)


In [None]:
import pdfplumber

def extract_pdf(pdf_path):
    """Return the text of every page in the PDF that yields extractable text.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are left out, so an index into the returned list does not
    necessarily match the page's position in the PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list of page text strings in document order.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Materialize inside the ``with`` so pages are read while the file is open.
        return [content for content in extracted if content]

# Extract the text once and print a short preview as a sanity check.
raw_pages = extract_pdf(pdf_path)
print("Total pages extracted:", len(raw_pages))
if raw_pages:
    print(raw_pages[0][:500], "...")
else:
    # extract_pdf() drops pages without text, so the list can be empty;
    # indexing raw_pages[0] would then raise IndexError.
    print("No extractable text was found in this PDF.")


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re, time

# Shared NLTK resources, built once and reused by every preprocess() call.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def preprocess(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter or a
    space is replaced by a space, and the result is tokenized. Stop words and
    tokens of two characters or fewer are dropped; the remaining tokens are
    Porter-stemmed.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving stems joined by single spaces (possibly an empty string).
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text.lower())
    stems = []
    for token in word_tokenize(letters_only):
        if len(token) <= 2 or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)


class SearchEngine:
    """Page-level search over extracted PDF text.

    Every page is indexed twice: as a TF-IDF vector built from its
    preprocessed text, and as a sentence embedding of its raw text produced
    by the ``all-MiniLM-L6-v2`` model. A query can be ranked by either signal
    alone or by a weighted blend of the two.
    """

    def __init__(self, pages=None):
        """Build the TF-IDF matrix and the page embeddings.

        Args:
            pages: Iterable of page texts to index. Defaults to the
                module-level ``raw_pages`` so a bare ``SearchEngine()`` call
                behaves as before.

        Raises:
            ValueError: If there are no pages to index.
        """
        if pages is None:
            pages = raw_pages
        self.pages = list(pages)
        if not self.pages:
            # Fail before loading the model; TF-IDF cannot be fitted on nothing.
            raise ValueError("SearchEngine needs at least one page of text to index.")

        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # Labels follow the position in ``pages``, starting at 1.
        self.page_ids = [f"Page {number}" for number in range(1, len(self.pages) + 1)]

        # Preprocessing
        print("Preprocessing pages…")
        self.proc_pages = [preprocess(p) for p in self.pages]

        # TF-IDF
        print("Building TF-IDF matrix…")
        self.tfidf = TfidfVectorizer(stop_words="english")
        self.tfidf_mat = self.tfidf.fit_transform(self.proc_pages)

        # BERT embeddings (computed on the raw, unprocessed page text)
        print("Generating BERT embeddings…")
        self.emb = self.model.encode(self.pages, convert_to_tensor=True, show_progress_bar=True)

    def search(self, query, method="hybrid", alpha=0.6, top_k=5):
        """Rank the indexed pages against ``query``.

        Args:
            query: Free-text search string.
            method: ``"tfidf"`` or ``"bert"`` to rank by a single signal; any
                other value (including the default ``"hybrid"``) blends both.
            alpha: Weight of the embedding score in hybrid mode; the TF-IDF
                score receives ``1 - alpha``.
            top_k: Maximum number of results to return.

        Returns:
            A list of dicts with keys ``rank``, ``page``, ``score`` and
            ``content`` (the first 1000 characters of the page), best match
            first. The list is empty for a blank query or when ``top_k`` is
            less than 1.
        """
        top_k = int(top_k)
        # A negative top_k would otherwise slice off the *last* result, and a
        # blank query carries no signal to rank by.
        if top_k < 1 or not query or not query.strip():
            return []

        # Only compute the scores the chosen method actually needs.
        if method == "tfidf":
            final = self._tfidf_scores(query)
        elif method == "bert":
            final = self._bert_scores(query)
        else:
            # Hybrid: min-max normalize each signal so the two are comparable
            # before blending.
            b_norm = self._min_max(self._bert_scores(query))
            t_norm = self._min_max(self._tfidf_scores(query))
            final = alpha * b_norm + (1 - alpha) * t_norm

        idx = np.argsort(final)[::-1][:top_k]

        return [
            {
                "rank": i + 1,
                "page": self.page_ids[j],
                "score": float(final[j]),
                "content": self.pages[j][:1000],
            }
            for i, j in enumerate(idx)
        ]

    def _tfidf_scores(self, query):
        """Return the cosine similarity of the preprocessed query to each page's TF-IDF row."""
        q_vec = self.tfidf.transform([preprocess(query)])
        return cosine_similarity(q_vec, self.tfidf_mat).flatten()

    def _bert_scores(self, query):
        """Return the cosine similarity of the raw query embedding to each page embedding."""
        q_emb = self.model.encode(query, convert_to_tensor=True)
        return util.pytorch_cos_sim(q_emb, self.emb)[0].cpu().numpy()

    @staticmethod
    def _min_max(scores):
        """Rescale ``scores`` to [0, 1]; the epsilon avoids division by zero when all are equal."""
        lowest = scores.min()
        return (scores - lowest) / (scores.max() - lowest + 1e-12)


In [None]:
import nltk

# Make sure the tokenizer models and the stop-word list are available before
# the engine starts preprocessing pages.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Constructing the engine preprocesses every page, fits TF-IDF and computes
# the page embeddings.
engine = SearchEngine()
print("✓ Search engine is ready!")


In [None]:
import gradio as gr

def search_ui(query, method, alpha, top_k):
    """Run a search on the global ``engine`` and render the hits as Markdown.

    Args:
        query: Text typed into the search box.
        method: Ranking method forwarded to ``engine.search``.
        alpha: Hybrid weight forwarded to ``engine.search``.
        top_k: Number of results requested; converted to ``int`` because the
            slider may deliver a float.

    Returns:
        A Markdown string: a heading followed by one entry per result, or a
        short message when the query is blank or nothing was returned.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    results = engine.search(query, method=method, alpha=alpha, top_k=int(top_k))

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [f"### Results for: `{query}`\n"]
    if not results:
        parts.append("\n_No matching pages found._\n")
    for r in results:
        parts.append(f"\n**{r['rank']}. {r['page']} — Score: {r['score']:.4f}**\n")
        parts.append(r["content"] + "\n\n")
    return "".join(parts)


# Gradio front end: one text box for the query plus controls for the ranking
# method, the hybrid weight and the number of results. Output is Markdown.
demo = gr.Interface(
    fn=search_ui,
    inputs=[
        gr.Textbox(label="Enter your search query"),
        gr.Radio(["tfidf", "bert", "hybrid"], value="hybrid", label="Ranking method"),
        gr.Slider(0, 1, value=0.6, step=0.1, label="Hybrid Alpha"),
        # step=1 keeps the slider on whole numbers; top_k is a result count.
        gr.Slider(1, 10, value=5, step=1, label="Top K"),
    ],
    outputs=gr.Markdown(),
    title="🔍 PDF Search Engine (TF-IDF + BERT + Hybrid)",
    description="Upload a PDF and search inside it",
)

demo.launch()
