GitHub link: https://github.com/ShlomiFridman/PhoenixProject2025

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); read_txtfile below
# reads its input from '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install requests beautifulsoup4
!pip install requests beautifulsoup4 nltk
!pip install firebase

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl.metadata (6.5 kB)
Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [None]:
import requests
import time
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from nltk.stem import PorterStemmer
import re
from firebase import firebase

Utility functions

In [None]:
def read_txtfile(fileName, base_dir='/content/drive/My Drive/'):
    """Read a text file and return the set of distinct words it contains.

    Args:
        fileName: File name (or relative path) inside ``base_dir``.
        base_dir: Directory that holds the file. Defaults to the root of the
            mounted Google Drive, so existing one-argument calls are unchanged.

    Returns:
        set[str]: Every whitespace-delimited token of the file, de-duplicated.

    Raises:
        OSError: If the file cannot be opened (for example when Drive is not
            mounted).
    """
    import os  # local import keeps this helper self-contained

    file_path = os.path.join(base_dir, fileName)
    # Explicit encoding so the result does not depend on the runtime's locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # split() without arguments splits on any run of whitespace, newlines
        # included, so the whole file can be tokenised in one call.
        return set(file.read().split())

#TODO add read image file

def index_words(soup):
    """Count the occurrences of every word in a parsed page.

    Args:
        soup: Object exposing ``get_text()`` (a BeautifulSoup document in this
            notebook).

    Returns:
        dict[str, int]: Lower-cased word -> number of occurrences, where a
        word is any match of the regex ``\\w+`` in the page text.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts

def remove_stop_words(p_index, stop_words=None):
    """Delete stop words from a word-count index.

    Args:
        p_index: dict mapping word -> count. It is modified in place.
        stop_words: Optional iterable of words to drop. When omitted, the list
            is read from ``stopwords_en.txt`` via ``read_txtfile`` (the
            original behaviour). Passing a pre-loaded collection avoids
            re-reading that file on every call.

    Returns:
        dict: The same ``p_index`` object, without the stop words.
    """
    if stop_words is None:
        stop_words = read_txtfile("stopwords_en.txt")

    for stop_word in stop_words:
        # pop() with a default is a no-op for words that are not indexed.
        p_index.pop(stop_word, None)

    return p_index

def apply_stemming(p_index):
    """Collapse a word-count index onto word stems.

    Args:
        p_index: dict mapping word -> count.

    Returns:
        dict: New dict mapping ``PorterStemmer().stem(word)`` -> summed count
        of all words that share that stem. ``p_index`` is left untouched.
    """
    stemmer = PorterStemmer()
    merged = {}

    for term, occurrences in p_index.items():
        root = stemmer.stem(term)
        merged[root] = merged.get(root, 0) + occurrences

    return merged

Firebase service

In [None]:
class FirebaseService:
  """Small wrapper around the Firebase connection that stores the index."""

  # Database location under which the index is read and written.
  _INDEX_PATH = '/Index/'

  def __init__(self, db_url = 'https://phoenixhw2-default-rtdb.europe-west1.firebasedatabase.app/'):
    """Open a connection object for the database at ``db_url``."""
    self.FBconn = firebase.FirebaseApplication(db_url, None)

  def get_index_from_DB(self):
    """Return whatever the database holds under ``/Index/``."""
    return self.FBconn.get(self._INDEX_PATH, None)

  def update_index_in_db(self, index_p):
    """Write every entry of ``index_p`` under ``/Index/``, one PUT per key.

    Each value must contain a ``'term'`` key; it is echoed in the progress
    message printed after the entry is written.
    """
    for key, entry in index_p.items():
      self.FBconn.put(self._INDEX_PATH, key, entry)
      print(f"update made for index={key} ({entry['term']})")
    print("Updated given index in DB")

Index service for maintaining the index

In [None]:
class IndexService:
  """In-memory reverse index (stem -> pages) plus a per-page word index.

  ``rev_index`` maps a stemmed term to
  ``{"term": original_word, "DocIDs": [url, ...], "DocIDs_cntrs": [count, ...]}``
  where ``DocIDs`` and ``DocIDs_cntrs`` are parallel lists.
  ``urls_index`` maps a URL to ``{stem: count}``.
  """

  def __init__(self, index, firebaseService=None):
    """Create an empty reverse index for the given terms.

    Args:
      index: Iterable of words to track; each is stored under its Porter stem.
      firebaseService: Service object kept on the instance. Optional, because
        no method of this class uses it yet.
    """
    self.firebaseService = firebaseService
    self.rev_index = {}
    self.urls_index = {}
    stemmer = PorterStemmer()
    for w in index:
      self.rev_index[stemmer.stem(w)] = {"term":w, "DocIDs": [], "DocIDs_cntrs": []}

  def process_soup(self, url, soup):
    """Index one parsed page and record it in the reverse index.

    Args:
      url: URL the page was fetched from; used as the document id.
      soup: Parsed page; passed to ``index_words``.

    Returns:
      dict | None: The page's ``{stem: count}`` index after stop-word removal
      and stemming, or ``None`` when ``soup`` is falsy.
    """
    if not soup:
      print("empty soup")
      return
    u_index = index_words(soup)
    u_index = remove_stop_words(u_index)
    u_index = apply_stemming(u_index)
    self.urls_index[url] = u_index

    for ind, ind_val in self.rev_index.items():
      if ind not in u_index:
        continue
      # setdefault: entries loaded from Firebase may lack these lists.
      doc_ids = ind_val.setdefault('DocIDs', [])
      counters = ind_val.setdefault('DocIDs_cntrs', [])
      if url in doc_ids:
        # Page was indexed before: refresh its count so the reverse index
        # stays in step with urls_index instead of keeping the old value.
        pos = doc_ids.index(url)
        if pos < len(counters):
          counters[pos] = u_index[ind]
        continue
      doc_ids.append(url)
      counters.append(u_index[ind])

    return u_index

  def get_reverse_index(self):
    """Return the reverse index dict (the live object, not a copy)."""
    return self.rev_index

  def get_url_index(self, url):
    """Return the ``{stem: count}`` dict of ``url`` or ``{}`` if unknown."""
    return self.urls_index.get(url,{})

  def set_index(self, newRevIndex):
    """Replace the reverse index and rebuild ``urls_index`` from it.

    Args:
      newRevIndex: dict in the ``rev_index`` format, or ``None`` (treated as
        an empty index). Firebase Realtime Database does not persist empty
        lists, so entries read back from it may miss ``DocIDs`` /
        ``DocIDs_cntrs``; missing lists are restored as empty ones.
    """
    self.rev_index = newRevIndex if newRevIndex is not None else {}
    self.urls_index = {}

    for ind, vals in self.rev_index.items():
      urls = vals.setdefault('DocIDs', [])
      cntrs = vals.setdefault('DocIDs_cntrs', [])
      # zip stops at the shorter list, so unequal lengths cannot raise.
      for doc_url, count in zip(urls, cntrs):
        self.urls_index.setdefault(doc_url, {})[ind] = count
    print("index updated")

  def add_new_word(self):
    """Placeholder; not implemented yet."""
    pass

  def remove_word(self):
    """Placeholder; not implemented yet."""
    pass

  def add_new_url(self, url):
    """Placeholder; not implemented yet."""
    pass

  def remove_url(self, url):
    """Placeholder; not implemented yet."""
    pass

  def save_in_db(self):
    """Placeholder; not implemented yet."""
    pass

  def load_from_db(self):
    """Placeholder; not implemented yet."""
    pass

  def index_toString(self):
    """Render the reverse index as one tab-separated string.

    The text is emitted without line breaks between entries, exactly as the
    original implementation did.
    """
    parts = []
    for ind, vals in self.rev_index.items():
      parts.append(f"Index {ind}")
      parts.append(f"\tTerm={vals['term']}")
      parts.append("\tDocIDs=")
      pairs = zip(vals.get('DocIDs', []), vals.get('DocIDs_cntrs', []))
      for j, (doc_url, count) in enumerate(pairs):
        parts.append(f"\t\tURL No.{j}: {doc_url} - {count} times")
    return ''.join(parts)


Crawling service

In [None]:
class CrawlerService:
  """Breadth-first, same-domain crawler that feeds pages to an index service."""

  def __init__(self, indexService):
    """Store the index service and start with an empty crawl history.

    Args:
      indexService: Object exposing ``process_soup(url, soup)``.
    """
    self.indexService = indexService
    self.crawled_urls = set()   # every URL fetched successfully so far
    self.crawled_count = 0
    self.robot = None           # last robots.txt parser that was read successfully

  def crawl_website(self, base_url, max_pages):
    """Crawl up to ``max_pages`` new pages starting from ``base_url``.

    Pages already crawled by this instance are skipped. Every fetched page is
    parsed and handed to ``indexService.process_soup``.

    Args:
      base_url: Start URL; only links on the same host are followed.
      max_pages: Maximum number of pages to fetch in this call.

    Returns:
      set[str]: URLs fetched during this call.
    """
    rp = self.__check_robot(base_url)  # robots.txt policy for this site
    urls_to_crawl = [base_url]  # FIFO queue of URLs still to visit
    current_crawled_urls = set()
    # NOTE(review): these are substring matches, so "form" also rejects URLs
    # that contain e.g. "platform" or "information" - confirm this is intended.
    ignore_urls = ["form", "mp3", "mp4", "downloads"]

    while urls_to_crawl and len(current_crawled_urls) < max_pages:
      current_url = urls_to_crawl.pop(0)

      if current_url in self.crawled_urls:
        continue  # Skip already crawled URLs

      print(f"Crawling {self.crawled_count+1}: {current_url}")
      page_content = self.__fetch_page_crawler(current_url, rp)
      if not page_content:
        continue

      self.crawled_urls.add(current_url)
      current_crawled_urls.add(current_url)
      self.crawled_count += 1

      # Extract and queue new links; relative hrefs are resolved against the
      # page they were found on, not against the crawl's start URL.
      for link in self.__extract_links(page_content, base_url, current_url):
        if any(bad_url in link for bad_url in ignore_urls):
          continue
        if link in self.crawled_urls or link in urls_to_crawl:
          continue
        urls_to_crawl.append(link)

      soup = BeautifulSoup(page_content, 'html.parser')
      self.indexService.process_soup(current_url, soup)
      # Delay between requests to avoid overwhelming the server
      time.sleep(2)

    print(f"\nCrawled {len(current_crawled_urls)} pages.")
    return current_crawled_urls

  def get_crawled_urls(self):
    """Return the set of all URLs fetched by this instance."""
    return self.crawled_urls

  def __check_robot(self, url):
    """Fetch and parse the site's robots.txt.

    Returns:
      RobotFileParser: The freshly read parser. If reading fails, the parser
      cached from an earlier successful read of the same robots.txt URL is
      returned when there is one; otherwise the unread parser is returned
      (CPython's ``can_fetch`` answers False until a robots.txt was parsed,
      so nothing is crawled in that case).
    """
    robot_url = urljoin(url, '/robots.txt')
    rp = RobotFileParser()
    rp.set_url(robot_url)
    try:
      rp.read()
    except (OSError, ValueError) as e:
      # OSError covers URLError/timeouts, ValueError covers undecodable bodies.
      print(f"Could not read {robot_url}: {e}")
      if self.robot is not None and getattr(self.robot, 'url', None) == robot_url:
        return self.robot
      return rp
    self.robot = rp
    return rp

  def __fetch_page_crawler(self, url, rp):
    """Download ``url`` if robots.txt allows it.

    Returns:
      str | None: The response body, or ``None`` when the URL is disallowed
      or the request fails (the reason is printed).
    """
    # '*' means all user agents
    if not rp.can_fetch('*', url):
      print(f"Blocked by robots.txt: {url}")
      return None

    try:
      response = requests.get(url, timeout=5)
      response.raise_for_status()  # raises for 4xx or 5xx responses
      return response.text
    except requests.exceptions.RequestException as e:
      print(f"Error fetching {url}: {e}")
      return None

  def __extract_links(self, page_content, base_url, page_url=None):
    """Collect same-host links from a page.

    Looks at ``<a href>`` tags and at any tag carrying ``cta-type="local"``
    together with an ``href``.

    Args:
      page_content: HTML text of the page.
      base_url: Start URL of the crawl; its host defines "same domain".
      page_url: URL of the page itself, used to resolve relative hrefs.
        Falls back to ``base_url`` when omitted.

    Returns:
      list[str]: Absolute URLs without fragment, de-duplicated, in the order
      they were found (so the crawl order is reproducible).
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    site = urlparse(base_url).netloc
    resolve_against = page_url or base_url
    links = {}  # dict used as an insertion-ordered set

    tag_groups = (
        soup.find_all('a', href=True),
        soup.find_all(attrs={'cta-type': 'local'}, href=True),
    )
    for tags in tag_groups:
      for tag in tags:
        parsed = urlparse(urljoin(resolve_against, tag['href']))
        if parsed.netloc != site:
          continue
        # "page#a" and "page#b" are the same document: drop the fragment.
        links[parsed._replace(fragment='').geturl()] = None

    return list(links)

Query Service

In [71]:
class QueryService:
  """Runs keyword queries against an index service and keeps a history."""

  def __init__(self, indexService):
    """Keep the index service and start with an empty history list."""
    self.indexService = indexService
    # Chronological list of {'query': ..., 'results': ...} dicts.
    self.query_history = []

  def query(self, query):
    """Return the pages matching any word of ``query``, best rank first.

    The query is lower-cased, split on the regex ``\\w+`` and stemmed; a page
    matches when it is listed under at least one of the stems in the reverse
    index. The call is appended to the history.

    Returns:
      list[tuple[str, number]]: ``(url, rank)`` pairs sorted by descending rank.
    """
    stemmer = PorterStemmer()
    rev_index = self.indexService.get_reverse_index()
    stemmed_query = set()
    matching_urls = set()

    for token in set(re.findall(r'\w+', query.lower())):
      stem = stemmer.stem(token)
      stemmed_query.add(stem)
      if stem in rev_index:
        matching_urls.update(rev_index[stem]["DocIDs"])

    scores = {u: self.rank_url(u, stemmed_query) for u in matching_urls}
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    self.query_history.append({'query':query, 'results':ranked})
    return ranked

  def rank_url(self, url, query_words):
    """Score ``url`` for the given stems.

    Starting from 1, the value is divided by the page's count of every query
    stem the page contains; the rank is 1 minus that value.
    """
    url_index = self.indexService.get_url_index(url)
    remainder = 1
    for word in query_words:
      if word in url_index:
        remainder = remainder*1/url_index[word]
    return 1-remainder

  def get_history(self):
    """Return the list of recorded queries with their results."""
    return self.query_history



Result Service

In [None]:
# Lab 7
# result_service.py
class ResultService:
    """Formats stored query results for display and caches the formatted output.

    NOTE(review): ``format_results`` relies on ``query_service.queries`` and
    ``index_service.get_document``. The QueryService and IndexService classes
    defined in this notebook expose neither, so with them every call ends in
    the ``except`` branch and returns an ``{'error': ...}`` dict - confirm
    which API is intended.
    """

    def __init__(self, index_service, query_service):
        self.index_service = index_service
        self.query_service = query_service
        # result id (as a string, starting at "1") -> formatted result dict
        self.results = {}

    def format_results(self, query_id):
        """Format search results for display.

        Args:
            query_id: Key looked up in ``query_service.queries``.

        Returns:
            dict: On success a dict with ``id``, ``query_id``,
            ``formatted_results`` and ``count``; it is also stored in
            ``self.results``. ``{'error': ...}`` when the query is unknown or
            any exception is raised along the way.
        """
        try:
            query = self.query_service.queries.get(query_id)
            if not query:
                return {'error': 'Query not found'}

            formatted_results = []
            for doc_id in query['results']:
                doc = self.index_service.get_document(doc_id)
                if not doc:
                    continue  # unknown documents are left out
                formatted_results.append(dict(
                    doc_id=doc_id,
                    title=doc['title'],
                    snippet=doc['content'][:100] + '...',
                ))

            result_id = str(len(self.results) + 1)
            result = dict(
                id=result_id,
                query_id=query_id,
                formatted_results=formatted_results,
                count=len(formatted_results),
            )
            self.results[result_id] = result
            return result

        except Exception as e:
            return {'error': str(e)}

The index we defined

In [None]:
# Terms tracked by the reverse index; IndexService stores each one under its stem.
init_index = [
    # service / deployment models
    'SAAS', 'PAAS', 'IAAS', 'FAAS',
    'Private', 'Public', 'Hybrid',
    'Service', 'Platform', 'Infrastructure',
    # general topics
    'Study', 'Case', 'Chatbot', 'Engine',
    'Cloud', 'Monitor', 'Data', 'Mainframe',
    'Performance', 'Security',
    'SLA', 'KPI', 'SOA',
    'Information', 'Kafka', 'SQL',
    'Technology', 'Database',
    # products and vendors
    'Docker', 'Kubernetes', 'RabbitMQ',
    'IBM', 'Google', 'Amazon',
    # AI
    'AI', 'Artificial', 'Intelligence',
]

In [None]:
# Connection to the Firebase database, using FirebaseService's default URL.
firebaseService = FirebaseService()

In [None]:
# Empty reverse index seeded with the terms of init_index.
indexService = IndexService(init_index, firebaseService)

In [None]:
# Crawler that hands every fetched page to indexService.process_soup.
crawlerService = CrawlerService(indexService)

In [78]:
# Build the query service on top of the shared index and run a sample query.
# NOTE(review): read top to bottom, this cell runs before the index is loaded
# from Firebase and before any page is crawled - confirm the intended order.
queryService = QueryService(indexService)
# `query` receives the ranked (url, rank) list returned by QueryService.query.
query = queryService.query("chatbot ai")
# Show only the query strings recorded in the history.
print([k['query'] for k in queryService.get_history()])

['chatbot ai', 'SaaS', 'chatbot ai']


In [None]:
# main.py
def main():
    """Wire the notebook's services together.

    Builds every service locally instead of relying on notebook globals.
    The services are not used further yet, so the function returns ``None``.
    """
    # Initialize services
    firebaseService = FirebaseService()
    # IndexService takes the Firebase service as its second argument.
    indexService = IndexService(init_index, firebaseService)
    crawlerService = CrawlerService(indexService)
    queryService = QueryService(indexService)
    resultService = ResultService(indexService, queryService)

#if __name__ == "__main__":
#  main()

In [None]:
# Replace the in-memory index with the one stored in Firebase, then print it.
# NOTE(review): presumably get_index_from_DB() returns None when nothing is
# stored under /Index/ - verify that set_index copes with that case.
indexService.set_index(firebaseService.get_index_from_DB())
print("Index from firebase:")
print(indexService.index_toString())

index updated
Index from firebase:
Index ai	Term=AI	DocIDs=		URL No.0: https://www.ibm.com/us-en - 28 times		URL No.1: https://www.ibm.com/impact/ai-ethics?lnk=bus - 88 times		URL No.2: https://www.ibm.com/servers?lnk=ProdC - 1 times		URL No.3: https://www.ibm.com/thought-leadership/institute-business-value/report/ceo-generative-ai?lnk=bus - 5 times		URL No.4: https://www.ibm.com/think/reports/ai-in-action?lnk=bus - 28 times		URL No.5: https://www.ibm.com/think/insights/ai-ethics-and-governance-in-2025?lnk=hpUSls2 - 61 times		URL No.6: https://www.ibm.com/products/watsonx-code-assistant?lnk=dev - 7 times		URL No.7: https://www.ibm.com/about?lnk=inside - 6 times		URL No.8: https://www.ibm.com/automation?lnk=ProdC - 10 times		URL No.9: https://www.ibm.com/history?lnk=inside - 1 times		URL No.10: https://www.ibm.com/analytics?lnk=ProdC - 12 times		URL No.11: https://www.ibm.com/artificial-intelligence?lnk=ProdC - 81 times		URL No.12: https://www.ibm.com/new/announcements/scale-ai-with-ibm

Processing the index and saving it in DB

In [None]:
# Write the current in-memory reverse index to Firebase (one PUT per term).
firebaseService.update_index_in_db(indexService.get_reverse_index())

In [None]:

# To limit the number of pages to crawl
# (the limit applies to each crawl_website call separately)
MAX_PAGES = 10
# Both crawls share crawlerService, so pages fetched by the first call are
# skipped by the second.
crawlerService.crawl_website('https://www.ibm.com/us-en', MAX_PAGES)
crawlerService.crawl_website('https://www.ibm.com/topics', MAX_PAGES)

In [None]:
# Persist the index again so Firebase includes the pages crawled above.
firebaseService.update_index_in_db(indexService.get_reverse_index())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def displayCntrGraph(index):
  """Draw a bar chart of the total number of appearances of every index term.

  Args:
    index: Reverse index dict; each value needs a ``"term"`` key and may
      carry a ``"DocIDs_cntrs"`` list of per-page counts. A missing list
      counts as zero appearances (entries read back from Firebase can lack it).

  Returns:
    None. The figure is shown and then closed.
  """
  ind_names = [entry["term"] for entry in index.values()]
  ind_cntrs = [sum(entry.get("DocIDs_cntrs", [])) for entry in index.values()]
  ind_data = pd.DataFrame({
      "Index":ind_names,
      "Appearances":ind_cntrs
  })

  # One explicit figure/axes pair: nothing is left open besides the figure
  # that is closed at the end.
  fig, ax = plt.subplots(figsize=(10, 8))  # large enough to fit the labels
  sns.barplot(x="Index", y="Appearances",
              err_kws={'linewidth': 0}, data=ind_data, ax=ax) #errwidth=0 will be deprecated in v0.15.0
  ax.set_title("Appearance of indexes in IBM:")
  for container in ax.containers:
    ax.bar_label(container)  # print the value on top of every bar
  ax.tick_params(axis="x", labelrotation=60)
  plt.show()
  plt.close(fig)  # Close the figure to free memory

# Plot the appearance totals of the current in-memory index.
displayCntrGraph(indexService.get_reverse_index())

In [None]:
# Placeholder list; nothing in the visible notebook reads or fills it yet.
searchHistory = []

TODO: build 3 tabs - search results, a graph showing the rank of each page, and a graph showing how well the website covers the query keywords

TODO add the group logo from drive

TODO: enable sharing, make the link public

TODO edit_index: print_index, add_new_word, remove_from_index, add_url, remove_url, get_index_from_db, save_index_in_db, exit menu