In [2]:
# simple_search_engine.py

import os
import re
from bs4 import BeautifulSoup

# Step 1: Load every ".html" file in the "website" folder into memory
folder = r"C:\ISR\website"  # your folder with HTML files
pages = {}  # filename -> lowercased text extracted from that file

# Only names ending in ".html" are considered; everything else is skipped.
html_names = [name for name in os.listdir(folder) if name.endswith(".html")]

for html_name in html_names:
    html_path = os.path.join(folder, html_name)
    with open(html_path, 'r', encoding='utf-8') as handle:
        # The document is parsed while the file is still open.
        document = BeautifulSoup(handle, "html.parser")
    # Flatten the parsed document to space-separated text, then lowercase it
    # so that later matching is case-insensitive.
    extracted = document.get_text(separator=" ", strip=True)
    pages[html_name] = extracted.lower()

# Step 2: Build the inverted index (word -> set of pages containing the word)
index = {}

for page_name, page_text in pages.items():
    # A "word" is any maximal run of \w characters in the page text.
    for token in re.findall(r'\w+', page_text):
        # Create the posting set on first sight of the token, then record
        # this page in it (a set, so repeated words add the page only once).
        index.setdefault(token, set()).add(page_name)

# Step 3: Search function
def search(query):
    """Return the set of pages that contain every word of *query*.

    The query is lowercased and tokenised with the same ``\\w+`` pattern
    that is used to build ``index``, so punctuation in the query
    (e.g. ``"hello, world!"``) does not prevent a match.

    Args:
        query: Free-text search string typed by the user.

    Returns:
        A new ``set`` of page names containing all query words. The set is
        empty when the query has no words, when any word is missing from
        the index, or when no single page contains all of the words.
    """
    words = re.findall(r'\w+', query.lower())

    # ``None`` means "no word processed yet". An empty set cannot be used as
    # that marker, because an empty intersection is a legitimate final result
    # and must not be overwritten by the postings of a later word.
    results = None
    for word in words:
        postings = index.get(word)
        if not postings:
            # One word is absent from every page, so the AND query fails.
            return set()
        if results is None:
            # Copy so callers can never mutate the index through the result.
            results = set(postings)
        else:
            results = results & postings
        if not results:
            # Already empty; further intersections cannot add pages back.
            break
    return results if results is not None else set()

# Step 4: Run search queries
# Interactive loop: read a query, print the pages that match it, and repeat
# until the user types 'exit' or the input stream ends.
print("Simple Search Engine (type 'exit' to quit)")
while True:
    try:
        # Strip surrounding whitespace so that " exit " still quits.
        query = input("\nEnter search query: ").strip()
    except (EOFError, KeyboardInterrupt):
        # input() raises EOFError when stdin is closed; treat that, and an
        # interrupt at the prompt, as a request to quit rather than
        # terminating with a traceback.
        print()
        break
    if query.lower() == "exit":
        break
    result_pages = search(query)
    if result_pages:
        print("Pages containing your query:", result_pages)
    else:
        print("No pages found.")
1

Simple Search Engine (type 'exit' to quit)



Enter search query:  HIi


No pages found.



Enter search query:  hello


No pages found.



Enter search query:  contact


Pages containing your query: {'contacts.html'}



Enter search query:  exit


1