<a href="https://colab.research.google.com/github/WardZid/IntroToCloud/blob/main/TencentScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

מבוא למחשוב ענן | שבוע 6

In [None]:
!pip install requests beautifulsoup4
!pip install firebase

In [None]:
# Prepare Firebase: build the database client object (`fbConn`) that the
# functions in the later cells use for every read and write.
from firebase import firebase
# The second argument is None -- presumably "no authentication"; confirm
# against the python-firebase documentation.
fbConn = firebase.FirebaseApplication('https://braudecloud-18-02-2024-default-rtdb.europe-west1.firebasedatabase.app/',None)

In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
    """Download *url* and parse it into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network-level failure is reported the same way as a non-200 reply,
        # so callers only ever have to check for None.
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')



In [None]:
import re
def index_words(soup):
    """Count how often each word occurs in the text of a parsed page.

    Words are the runs of regex word characters found in ``soup.get_text()``;
    each one is lower-cased before it is counted.

    Returns:
        A dict mapping lower-cased word -> number of occurrences.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts


In [None]:
def remove_stop_words(index):
    """Delete a fixed list of English stop words from *index* in place.

    Returns:
        The same dict object that was passed in, minus the stop-word keys.
    """
    for stop_word in ('a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'):
        # pop() with a default is a no-op when the word is not indexed.
        index.pop(stop_word, None)
    return index


In [None]:
from nltk.stem import PorterStemmer
def apply_stemming(index):
    """Collapse a word-count index onto Porter stems.

    Every key of *index* is passed through ``PorterStemmer.stem``; the counts
    of words that end up with the same stem are added together.

    Returns:
        A new dict mapping stem -> summed count. *index* is not modified.
    """
    porter = PorterStemmer()
    merged = {}
    for term, occurrences in index.items():
        stem = porter.stem(term)
        merged[stem] = merged.get(stem, 0) + occurrences
    return merged


In [None]:
def search(query, index):
    """Look up the words of *query* in *index*.

    The query is lower-cased and split into runs of regex word characters.

    Returns:
        A dict holding ``word -> index[word]`` for each query word that is a
        key of *index*; words that are not indexed are left out.
    """
    terms = re.findall(r'\w+', query.lower())
    return {term: index[term] for term in terms if term in index}


In [None]:
def search_engine(url, query):
    """Fetch *url*, build its word index and look *query* up in it.

    The index is built by counting words, dropping stop words and merging
    counts by stem, in that order.

    Returns:
        The dict produced by ``search`` for *query*, or ``None`` when the page
        could not be fetched.
    """
    page = fetch_page(url)
    if page is None:
        return None
    word_index = apply_stemming(remove_stop_words(index_words(page)))
    # NOTE(review): the index keys are stems, but the query is passed on
    # without stemming, so only query words equal to their own stem can match.
    return search(query, word_index)


In [None]:
from urllib.parse import urlparse, urljoin
from collections import deque

# Function to extract links from a webpage
def extract_links(url, timeout=10):
    """Return the ``href`` value of every ``<a href=...>`` tag on a page.

    The values are returned exactly as written in the HTML, so they may be
    relative; callers are expected to resolve them against *url*.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        A list of href strings, or an empty list when the page cannot be
        downloaded (so one unreachable page does not abort a whole crawl).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print("extract_links - request failed: ", url, " ", err)
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Keep only the links whose host matches the base URL (or that carry no host)
def check_same_domain(base_url, links):
    """Filter *links* down to those on the same network location as *base_url*.

    A link is kept when its ``netloc`` equals that of *base_url* or is empty
    (as it is for relative links).

    Returns:
        A new list with the kept links, unchanged and in their original order.
    """
    accepted_hosts = (urlparse(base_url).netloc, '')
    return [link for link in links if urlparse(link).netloc in accepted_hosts]

def is_same_domain(base_url, url):
    """Tell whether *url* has exactly the same ``netloc`` as *base_url*.

    The comparison is a plain string equality of the two network locations,
    so a URL without a host (e.g. a relative path) never matches an absolute
    base URL.
    """
    return urlparse(base_url).netloc == urlparse(url).netloc

In [None]:
def fetch_page_title(url):
    """Return the text of the ``<title>`` element of the page at *url*.

    Returns:
        The title text, or ``None`` when the page could not be fetched or has
        no ``<title>`` tag.
    """
    soup = fetch_page(url)
    if not soup:
        return None
    title_tag = soup.find('title')
    return title_tag.get_text() if title_tag else None

In [None]:
import hashlib

def hash_url(url_to_hash):
    """Return the MD5 hex digest of *url_to_hash*.

    The string is converted to bytes with ``str.encode()`` (default encoding)
    before hashing. The digest is used as a key, not for security.
    """
    digest = hashlib.md5()
    digest.update(url_to_hash.encode())
    return digest.hexdigest()

def save_and_hash_url(url_to_hash_save):
    """Write a page record to Firebase and return the key it was stored under.

    The key is the MD5 hash of the URL; the record, written under ``/pages``,
    holds the URL itself and the page title (``None`` if no title was found).
    A diagnostic line is printed when the write returns ``None``.

    Returns:
        The hashed URL used as the record key.
    """
    page_key = hash_url(url_to_hash_save)
    record = {
        'url': url_to_hash_save,
        'title': fetch_page_title(url_to_hash_save),
    }
    response = fbConn.put('/pages', page_key, data=record)
    if response is None:
        print("save_and_hash_url - None")
    return page_key

In [None]:

def save_links_pointing(url_pointing,links_pointed_at):
    """Record in Firebase that *url_pointing* links to each given target.

    For every target link the page record is created if it does not exist
    yet, then the hash of *url_pointing* is appended to the target's
    ``pointed_from`` list.

    Args:
        url_pointing: Absolute URL of the page the links were found on.
        links_pointed_at: Iterable of href values taken from that page; they
            may be relative.
    """
    source_hash = hash_url(url_pointing)
    for link in links_pointed_at:
        # Raw hrefs can be relative. Resolve them first so the hash matches
        # the one computed for the absolute URL elsewhere, and so the page can
        # actually be fetched when its record is created.
        absolute_link = urljoin(url_pointing, link)
        hashed_link = hash_url(absolute_link)

        # Page records are written under "pages/" by save_and_hash_url, so
        # that is where the existence check has to look.
        existing = fbConn.get("pages/" + hashed_link, None)
        if existing is None:
            save_and_hash_url(absolute_link)

        # NOTE(review): post_async presumably returns before the write has
        # completed, so `result` may always be None here -- confirm against
        # the python-firebase documentation.
        result = fbConn.post_async("pages/"+hashed_link+"/"+"pointed_from", data=source_hash)
        if result is None:
            print("save_links_pointing - None")
        print("point ",result)

In [None]:
import urllib.parse

def search_words(url,query_words):
    """Store, for each query word found on the page at *url*, its hit count.

    The page record is saved first (which also yields the URL hash). The page
    is then downloaded and indexed once, and every query word present in that
    index has its count written to ``/index/<word>/<url hash>``.

    Args:
        url: Absolute URL of the page to index.
        query_words: Iterable of words to look up on the page.
    """
    hashed_url = save_and_hash_url(url)

    # Build the word index a single time: it does not depend on the query, so
    # fetching and indexing the page again for every word is wasted work.
    soup = fetch_page(url)
    if soup is None:
        return
    index = apply_stemming(remove_stop_words(index_words(soup)))

    for query in query_words:
        # search() keys its result by the lower-cased words of the query, so
        # the exact query string may be absent even when the result is not
        # empty; .get() skips such words instead of raising KeyError.
        count = search(query, index).get(query)
        if count is None:
            continue
        response = fbConn.put('/index/' + query, hashed_url, data={'count': count})
        if response is None:
            print("search_words - None")

In [None]:
def get_words_from_firestore():
    """Return the keys stored directly under ``index`` in Firebase.

    Returns:
        A list of the keys, or an empty list when nothing is stored there.
    """
    stored = fbConn.get('index', None)
    if not stored:
        return []
    return list(stored.keys())

In [None]:

def scrape_bfs(start_url):
    """Crawl the site of *start_url* breadth-first and index every page.

    The words to look for are read from Firebase once, before the crawl
    starts. Each visited page has its links extracted and its word counts
    uploaded; links on the same network location as *start_url* are queued
    for a later visit.

    Args:
        start_url: Absolute URL at which the crawl begins; it also defines
            which network location the crawl is restricted to.
    """
    from urllib.parse import urldefrag  # not among the file-level imports

    queue = deque([start_url])
    # Every URL that was ever queued. Checking at enqueue time keeps
    # duplicates out of the queue instead of discarding them after popping.
    seen = {start_url}

    queries = get_words_from_firestore()
    print(queries)

    # Loop until the queue is empty
    while queue:
        url = queue.popleft()

        # Get all the links in this page
        links = extract_links(url)
        print("Visiting: ",url,"\nLinks Extracted: ",len(links))

        # Search the words and save the counts to Firebase
        search_words(url,queries)

        # NOTE: recording which page links to which (save_links_pointing) is
        # currently not done during the crawl.

        for link in links:
            # A "#fragment" is never sent to the server, so URLs that differ
            # only by fragment are the same page; strip it before comparing.
            absolute_link, _fragment = urldefrag(urljoin(url, link))
            if absolute_link in seen:
                continue
            if is_same_domain(start_url, absolute_link):
                seen.add(absolute_link)
                queue.append(absolute_link)



In [None]:
# Run the breadth-first crawl starting at the Tencent English home page.
scrape_bfs('https://www.tencent.com/en-us/')

In [None]:
# Print the MD5 hash of the start URL, i.e. the key hash_url derives for it.
print(hash_url('https://www.tencent.com/en-us/'))