In [1]:
import re
import string
from tqdm import tqdm

In [2]:
class SearchEngine:

  def __init__(self, path = None):
    self.readDocs(path)
    self.buildInvertedIndex()

  def cleanToken(self, token: str):
    puncs = string.punctuation+string.whitespace+'\xa0'
    token = ''.join(list(filter(lambda x: (x.isalnum() or x in puncs), list(token))))
    token = token.strip(puncs).lower()
    return token


  def readDocs(self,path):

    self.mapper = dict() #maps urls to the set of unique words

    try:
      with open(path, 'r') as file:
        data = file.readlines()

    except FileNotFoundError:
      print(f"the file - {path} is not present")
      return None

    data = list(filter(lambda x: x != '\n', data)) # removing empty lines
    data = [i.strip('\n') for i in data]

    prevline = '<endPageBody>'
    page_content = []
    cur_url = None

    print('Stand by while building index...')
    for line in tqdm(data):

      if line.startswith('http://') or line.startswith('https://'):
        if prevline=='<endPageBody>':
          page_content = page_content[1:-2] # to remove <pageBody> and <endPageBody>
          tokens = list(set(' '.join(page_content).split(' ')))
          tokens = set([self.cleanToken(i) for i in tokens])
          self.mapper[cur_url] = tokens
          cur_url = line
          page_content = []
      else:
        page_content.append(line)
      prevline = line

    if page_content and cur_url:
      page_content = page_content[1:-2] # to remove <pageBody> and <endPageBody>
      tokens = list(set(' '.join(page_content).split(' ')))
      tokens = set([self.cleanToken(i) for i in tokens])
      self.mapper[cur_url] = tokens

    del self.mapper[None]

    unique_words = set()
    for i in self.mapper.values(): unique_words|=i

    print(f"Indexed {len(self.mapper.keys())} pages containing {len(unique_words)} unique terms")

  def buildInvertedIndex(self):

    self.inverted_index = dict()

    urls = list(self.mapper.keys())
    words = set()
    for value in self.mapper.values():
      words |= value

    for word in tqdm(words):
      self.inverted_index[word] = set()

      for url in urls:
        if word in self.mapper[url]:
          self.inverted_index[word].add(url)


  def findQueryMatches(self, query):

    words = query.lower().split(' ')
    result = set()

    for word in words:
      prefix = word[0]
      if prefix == '+':
        result &= self.inverted_index.get(word[1:],set())
      elif prefix == '-':
        result -= self.inverted_index.get(word[1:],set())
      else:
        result |= self.inverted_index.get(word,set())

    return result

In [3]:
mysearchengine = SearchEngine('/content/sampleWebsiteData.txt')
while True:
  query = input("Enter a query to run on the file (enter RETURN/ENTER to exit): ").strip()
  if query == "": break
  result = mysearchengine.findQueryMatches(query)
  n = len(result)
  if n>0:
    print(f"There are {n} matches for the query - \"{query}\"")
    for url in result: print(url)
  else:
    print(f"There are no matches for the given query - \"{query}\"")
print('program terminated successfully!!')

Stand by while building index...


100%|██████████| 361/361 [00:00<00:00, 23924.28it/s]


Indexed 5 pages containing 1460 unique terms


100%|██████████| 1460/1460 [00:00<00:00, 246863.01it/s]


Enter a query to run on the file (enter RETURN/ENTER to exit): gupta
There are 2 matches for the query - "gupta"
https://www.cs.wmich.edu/gupta/teaching/cs5950/5950F23PGSweb/TopicsCovered%20ProgGradStu.html
https://www.cs.wmich.edu/~gupta/teaching/cs603/wsnSp04/ClassPolicies.html
Enter a query to run on the file (enter RETURN/ENTER to exit): add +in
There are 2 matches for the query - "add +in"
https://www.cs.wmich.edu/gupta/teaching/cs5950/5950F23PGSweb/TopicsCovered%20ProgGradStu.html
https://cs.wmich.edu/elise/courses/cs531/assignments-SI19.html
Enter a query to run on the file (enter RETURN/ENTER to exit): 
program terminated successfully!!
