<a href="https://colab.research.google.com/github/WardZid/IntroToCloud/blob/main/BraudeCloud6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

מבוא למחשוב ענן | שבוע 6

In [None]:
# Install the HTTP client, HTML parser, and Firebase client imported by the cells below.
# NOTE(review): nltk is imported later for stemming but is not installed here —
# presumably it is preinstalled in the runtime; confirm.
!pip install requests beautifulsoup4
!pip install firebase

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [None]:
# Firebase setup: one shared connection object used by every cell that writes results.
from firebase import firebase

# Root address of the Realtime Database that receives the crawl data
FIREBASE_DB_URL = 'https://braudecloud-18-02-2024-default-rtdb.europe-west1.firebasedatabase.app/'

# Second argument is the authentication object; None is passed, i.e. no credentials
fbConn = firebase.FirebaseApplication(FIREBASE_DB_URL, None)

In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
  """Download ``url`` and parse it into a BeautifulSoup tree.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests.get`` may block indefinitely on an unresponsive host.

  Returns:
    A ``BeautifulSoup`` object built with the ``html.parser`` backend when the
    server answers with HTTP 200; ``None`` for any other status code or when
    the request itself fails (connection error, timeout, malformed URL).
  """
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException:
    # Network-level failures are reported the same way as a non-200 reply,
    # so callers have a single failure signal (None) to handle.
    return None
  if response.status_code != 200:
    return None
  return BeautifulSoup(response.text, 'html.parser')



In [None]:
import re
def index_words(soup):
    """Count how often each word occurs in the text of a parsed page.

    Args:
        soup: Object exposing ``get_text()`` (a parsed page).

    Returns:
        A dict mapping each lower-cased word (a run of ``\\w`` characters) to
        the number of times it appears in the page text.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        # First sighting starts from zero; later sightings add one
        counts[key] = counts.get(key, 0) + 1
    return counts


In [None]:
def remove_stop_words(index):
  """Drop a fixed set of English stop words from a word-count index.

  The dict passed in is modified in place and is also the return value.

  Args:
    index: Dict mapping words to counts.

  Returns:
    The same ``index`` object, without the keys
    'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'.
  """
  for blocked in ('a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'):
    # pop with a default is a no-op when the word is absent
    index.pop(blocked, None)
  return index


In [None]:
from nltk.stem import PorterStemmer
def apply_stemming(index):
    """Re-key a word-count index by Porter stem, merging words that share a stem.

    Args:
        index: Dict mapping words to counts.

    Returns:
        A new dict mapping each stem to the sum of the counts of all words
        that reduce to it. The input dict is not modified.
    """
    porter = PorterStemmer()
    merged = {}
    for token, occurrences in index.items():
        root = porter.stem(token)
        if root not in merged:
            # First word seen for this stem
            merged[root] = occurrences
            continue
        merged[root] += occurrences
    return merged


In [None]:
def search(query, index):
    """Look up each word of ``query`` in ``index``.

    Args:
        query: Free-text query; it is lower-cased and split into runs of
            ``\\w`` characters.
        index: Dict mapping words to counts.

    Returns:
        A dict with one entry per query word that is a key of ``index``,
        mapped to its count. Words that are not in the index are omitted.
    """
    terms = re.findall(r'\w+', query.lower())
    return {term: index[term] for term in terms if term in index}


In [None]:
def search_engine(url, query):
    """Fetch ``url``, build its word index, and count the query's words in it.

    The page text is indexed, stop words are dropped, and the remaining words
    are reduced to their Porter stems. Each query word is stemmed the same way
    before the lookup: the index keys are stems, so an unstemmed query word
    whose stem differs from the word itself could otherwise never match.

    Args:
        url: Page to fetch and index.
        query: Free-text query; it is lower-cased and split into runs of
            word characters.

    Returns:
        ``None`` when ``fetch_page`` could not retrieve the page. Otherwise a
        dict mapping each lower-cased query word found on the page to the
        count stored under its stem; words without a match are omitted.
    """
    soup = fetch_page(url)
    if soup is None:
        return None
    index = index_words(soup)
    index = remove_stop_words(index)
    index = apply_stemming(index)

    stemmer = PorterStemmer()
    results = {}
    for word in re.findall(r'\w+', query.lower()):
        # Look up by stem, but key the result by the word the caller asked
        # for so callers can keep indexing the result with their own word.
        stem = stemmer.stem(word)
        if stem in index:
            results[word] = index[stem]
    return results


In [None]:
from urllib.parse import urlparse, urljoin
from collections import deque

# Function to extract links from a webpage
def extract_links(url):
    """Return the raw ``href`` value of every ``<a>`` tag on the page at ``url``.

    The download is delegated to ``fetch_page`` so that this function and the
    indexing code share one fetching policy. Links are returned exactly as
    written in the HTML (they may be relative); resolving them is left to the
    caller.

    Args:
        url: Page whose anchors should be listed.

    Returns:
        A list of ``href`` strings in document order, or an empty list when
        ``fetch_page`` signals failure by returning ``None``.
    """
    soup = fetch_page(url)
    if soup is None:
        # A page that did not load has nothing to crawl; the body of an error
        # response is deliberately not scanned for links.
        return []
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Function to check if links are from the same domain
def check_same_domain(base_url, links):
    """Keep only the links that stay on the host of ``base_url``.

    A link is kept when its network location equals that of ``base_url`` or
    when it has no network location at all (for example a relative path).

    Args:
        base_url: URL whose host defines "same domain".
        links: Iterable of link strings.

    Returns:
        A list with the accepted links, unchanged and in their original order.
    """
    accepted_hosts = (urlparse(base_url).netloc, '')
    return [href for href in links if urlparse(href).netloc in accepted_hosts]

def is_same_domain(base_url, url):
    """Tell whether ``url`` has exactly the same network location as ``base_url``.

    The comparison is a plain string equality on the ``netloc`` part, so a
    relative URL (empty netloc) does not match an absolute ``base_url``.
    """
    return urlparse(url).netloc == urlparse(base_url).netloc

In [None]:
def fetch_page_title(url):
    """Return the text of the page's ``<title>`` element.

    Args:
        url: Page to download via ``fetch_page``.

    Returns:
        The title text, or ``None`` when the page could not be fetched or
        contains no ``<title>`` tag.
    """
    soup = fetch_page(url)
    title_tag = soup.find('title') if soup else None
    return title_tag.get_text() if title_tag else None

In [None]:
import hashlib

def save_and_hash_url(url_to_hash):
  """Store a page's URL and title in Firebase under the MD5 digest of the URL.

  Args:
    url_to_hash: Address of the page to register.

  Returns:
    The hexadecimal MD5 digest of ``url_to_hash``, which is also the record
    name used under ``/pages``.
  """
  # The digest serves as the record name for this page
  url_digest = hashlib.md5(url_to_hash.encode()).hexdigest()

  page_record = {
    'url': url_to_hash,
    'title': fetch_page_title(url_to_hash),
  }
  # Asynchronous write; its return value is not used
  fbConn.put_async('/pages', url_digest, data=page_record)
  return url_digest


In [None]:
import urllib.parse

def search_words(url,query_words):
  """Count each query word on the page at ``url`` and store the counts in Firebase.

  The page is first registered through ``save_and_hash_url``. Then, for every
  word in ``query_words`` that occurs on the page, ``{'count': n}`` is written
  to ``/index/<word>`` under the page's URL hash.

  Args:
    url: Page to analyse.
    query_words: Iterable of query words.

  Returns:
    None.
  """
  hashed_url = save_and_hash_url(url)
  for query in query_words:
    result = search_engine(url, query)
    if result is None:
      # search_engine returns None when the page could not be fetched. Every
      # remaining query would fetch the same URL again, so stop here instead
      # of subscripting None (which raised TypeError).
      return
    # Result keys are lower-cased, so look up the lower-cased query;
    # indexing with the raw query raised KeyError for e.g. "Development".
    count = result.get(query.lower())
    if count is None:
      # Word does not occur on this page
      continue
    fbConn.put_async('/index/' + query, hashed_url, data={'count': count})

In [None]:
#start_url = "https://www.tencent.com/en-us/"
#queries = ["tencent","tech","industry","partner","cloud","media","revenue","career","team","manage"]
#search_words(start_url,queries)
#search_words("https://www.tencent.com/en-us/sitemap.html",queries)


KeyboardInterrupt: 

In [None]:
from urllib.parse import urldefrag

start_url = "https://www.tencent.com/en-us/"
queries = ["tencent","tech","partner","cloud","media","career"]
#queries = ["infrastructure","Development","production","holdings"]

### BFS
# Breadth-first crawl of every page on start_url's domain that is reachable
# from start_url. Each page is passed to search_words, which saves the query
# word counts to Firebase.

# Pages that have already been fetched and processed
visited = set()
# Every URL that was ever put on the queue, so a link that appears on many
# pages is queued only once
enqueued = {start_url}
# Initialize a queue with the first url
queue = deque([start_url])

# Loop until the queue is empty
while queue:
  # Get the next url to process and mark it visited
  url = queue.popleft()
  visited.add(url)

  print("Visiting:", url)
  # Get all the links in this page
  links = extract_links(url)
  print("Links Extracted: ",len(links))

  # Search the query words and save them to Firebase
  search_words(url,queries)

  # Process links
  for link in links:
    # Resolve relative hrefs against the current page, then drop the
    # "#fragment": it names a position inside a page, not a different page,
    # so keeping it made the crawler fetch the same document repeatedly.
    absolute_link, _ = urldefrag(urljoin(url, link))

    if absolute_link in enqueued:
      continue
    if is_same_domain(start_url, absolute_link):
      # Add link to queue
      enqueued.add(absolute_link)
      queue.append(absolute_link)

Visiting: https://www.tencent.com/en-us/
Links Extracted:  105
Visiting: https://www.tencent.com/en-us/index.html
Links Extracted:  105
Visiting: https://www.tencent.com/en-us/about.html
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-1
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-2
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-3
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-4
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-5
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/investors/board-members.html
Links Extracted:  67
Visiting: https://www.tencent.com/en-us/about.html#about-con-6
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/about.html#about-con-7
Links Extracted:  69
Visiting: https://www.tencent.com/en-us/business.html
Links Extracted:  87
Visiting: https://www.tencent.com/en-us

TypeError: 'NoneType' object is not subscriptable