<a href="https://colab.research.google.com/github/WardZid/IntroToCloud/blob/main/TencentScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Single Website Scraper
___
This notebook scrapes a website, given a starting url.
<br>The scraper works by retrieving a list of words to search for from the database, and counting the occurrences of these words in the pages of the website.
<br>How it works:


1. Given a starting url
2. Fetches words to search for from DB
3. Searches for these words in the page
4. Saves the count of occurrences for each word
5. Retrieves all the links in the page that navigate to same domain as the current url
6. Records the current url under every link it points to. Each link holds a list of the urls that navigate to it
7. Repeats steps 3 to 6 for all the found links whilst avoiding previously visited links (the word list from step 2 is fetched only once).

This approach is based on the BFS algorithm: breadth is prioritized because the shallower pages of a website are usually the most relevant.


In [None]:
# @title Install dependencies

!pip install requests beautifulsoup4
!pip install firebase

In [None]:
# @title Initialize Firebase Connection

from firebase import firebase
# Shared Firebase client used by the cells below for every read and write.
# The second argument is None - presumably "no authentication"; TODO confirm
# against the firebase package documentation.
fbConn = firebase.FirebaseApplication('https://braudecloud-18-02-2024-default-rtdb.europe-west1.firebasedatabase.app/',None)
# Alternative database endpoint, currently disabled:
#fbConn = firebase.FirebaseApplication('https://smart-howl-250311.firebaseio.com/',None)

In [None]:
# @title Fetch page contents using BeautifulSoup
import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
    """Download *url* and parse it into a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole crawl.

    Returns:
        The parsed ``BeautifulSoup`` tree when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        itself fails (connection error, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A failed request is reported the same way as a non-200 answer so
        # callers only have to handle a single "no page" signal.
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')



In [None]:
# @title Index the words

import re
def index_words(soup):
    """Count the occurrences of every word in the page text.

    Words are the ``\\w+`` runs of ``soup.get_text()``, lower-cased before
    counting.

    Args:
        soup: Object exposing ``get_text()`` (the parsed page).

    Returns:
        A dict mapping each lower-cased word to its number of occurrences,
        in order of first appearance.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts


In [None]:
# @title Remove unwanted Stop Words

def remove_stop_words(index):
    """Drop the common English stop words from a word-count index.

    The index is modified in place and also returned, so the call can be
    used either as a statement or as an expression.

    Args:
        index: Dict mapping words to their counts.

    Returns:
        The same dict object, without the stop-word keys.
    """
    for unwanted in ('a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'):
        # pop with a default is a no-op when the word is absent
        index.pop(unwanted, None)
    return index


In [None]:
# @title Stem the words to extract the root word

from nltk.stem import PorterStemmer
def apply_stemming(index):
    """Collapse a word-count index onto the Porter stem of each word.

    Words that share a stem are merged and their counts added together.

    Args:
        index: Dict mapping words to integer counts.

    Returns:
        A new dict mapping each stem to the summed count of its words.
    """
    stem = PorterStemmer().stem
    merged = {}
    for token, occurrences in index.items():
        root = stem(token)
        merged[root] = merged.get(root, 0) + occurrences
    return merged


In [None]:
# @title Search the page for a query word

def search(query, index):
    """Look up the words of *query* in a word-count index.

    The query is lower-cased and split into ``\\w+`` tokens; tokens that do
    not appear in the index are left out of the result.

    Args:
        query: Free-text query string.
        index: Dict mapping words to counts.

    Returns:
        A dict mapping each query token found in the index to its count.
    """
    tokens = re.findall(r'\w+', query.lower())
    return {token: index[token] for token in tokens if token in index}


def search_engine(url, query):
    """Fetch a page, index its words and look up *query* in that index.

    The index is built by counting the words, removing stop words and
    stemming, in that order.

    NOTE(review): the index keys are stemmed but the query tokens are looked
    up unstemmed, so a query word that differs from its own stem presumably
    never matches - confirm whether that is intended.

    Args:
        url: Address of the page to search.
        query: Free-text query string.

    Returns:
        A dict mapping each matching query token to its count, or ``None``
        when the page could not be fetched.
    """
    page = fetch_page(url)
    if page is None:
        return None
    word_counts = apply_stemming(remove_stop_words(index_words(page)))
    return search(query, word_counts)


In [None]:
# @title Extract all links from the page and ensure only the ones in the same domain as the original page

from urllib.parse import urlparse, urljoin
from collections import deque

# Function to extract links from a webpage
def extract_links(url):
    """Return the ``href`` value of every anchor on the page at *url*.

    The page is downloaded through ``fetch_page`` so that the status check
    lives in one place. The hrefs are returned exactly as written in the
    page (they may be relative); resolving them is left to the caller.

    Args:
        url: Address of the page whose links are wanted.

    Returns:
        A list of href strings in document order. The list is empty when
        the request fails or the server does not answer with HTTP 200, so
        links found on error pages are not followed.
    """
    try:
        soup = fetch_page(url)
    except requests.RequestException:
        # One unreachable page must not abort the caller's crawl.
        return []
    if soup is None:
        return []
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Function to check if links are from the same domain
def check_same_domain(base_url, links):
    """Filter *links* down to those that stay on the domain of *base_url*.

    A link is kept when its network location equals that of ``base_url`` or
    when it has none at all (for example a relative link).

    Args:
        base_url: URL whose domain defines "same domain".
        links: Iterable of link strings to filter.

    Returns:
        A list of the kept links, unchanged and in their original order.
    """
    allowed_hosts = (urlparse(base_url).netloc, '')
    return [candidate for candidate in links
            if urlparse(candidate).netloc in allowed_hosts]

def is_same_domain(base_url, url):
    """Tell whether *url* has the same network location as *base_url*.

    The comparison is an exact string match on the ``netloc`` component, so
    a relative url (empty netloc) does not match an absolute base url.

    Returns:
        ``True`` when both network locations are equal, else ``False``.
    """
    return urlparse(url).netloc == urlparse(base_url).netloc

In [None]:
# @title Pull the title of the page to view it in the search results
def fetch_page_title(url):
    """Return the text of the ``<title>`` tag of the page at *url*.

    Args:
        url: Address of the page.

    Returns:
        The title text, or ``None`` when the page could not be fetched or
        has no ``<title>`` tag.
    """
    page = fetch_page(url)
    if not page:
        return None
    heading = page.find('title')
    return heading.get_text() if heading else None

In [None]:
# @title Save URLs

import hashlib

def hash_url(url_to_hash):
    """Return the hexadecimal MD5 digest of a url string.

    The digest is used as a database key for the url, not for security.
    """
    digest = hashlib.md5()
    digest.update(url_to_hash.encode())
    return digest.hexdigest()

def save_and_hash_url(url_to_hash_save):
    """Store a page record in the database under the hash of its url.

    The record, written to ``/pages/<hash>``, holds the url and the page
    title (``None`` when no title could be fetched).

    Args:
        url_to_hash_save: Url of the page to register.

    Returns:
        The MD5 hash of the url, i.e. the key the record was stored under.
    """
    page_key = hash_url(url_to_hash_save)
    record = {
        'url': url_to_hash_save,
        'title': fetch_page_title(url_to_hash_save),
    }
    # A None answer from the client is only reported, not treated as fatal
    if fbConn.put('/pages', page_key, data=record) is None:
        print("save_and_hash_url - None")
    return page_key

In [None]:
# @title Search for words in a page and save the results
import urllib.parse

def search_words(url, query_words):
    """Count each query word on a page and store the counts in the database.

    The page is registered through ``save_and_hash_url`` first. It is then
    downloaded and indexed once (word count, stop-word removal, stemming),
    and every query is looked up in that one index rather than re-fetching
    the page per word. A count is written to ``/index/<query>/<url hash>``
    for each query that was found.

    Args:
        url: Address of the page to search.
        query_words: Iterable of query strings to look for.

    Returns:
        None. Nothing is written for the words when the page cannot be
        fetched.
    """
    hashed_url = save_and_hash_url(url)

    soup = fetch_page(url)
    if soup is None:
        return
    index = apply_stemming(remove_stop_words(index_words(soup)))

    for query in query_words:
        matches = search(query, index)
        # search() keys its results by lower-cased token, so look the query
        # up in that form. Indexing with the raw query raised KeyError for
        # any query that is not exactly one lower-case token that matched.
        count = matches.get(query.lower())
        if count is None:
            continue
        result = fbConn.put('/index/' + query, hashed_url, data={'count': count})
        if result is None:
            print("search_words - None")

In [None]:
# @title Fetch list of words to index
def get_words_to_index():
    """Fetch the list of terms to search for from the database.

    Reads the ``words`` node and collects the ``'term'`` field of every
    entry.

    Returns:
        A list of the terms; empty when the ``words`` node holds no data.

    Raises:
        KeyError: If an entry has no ``'term'`` field.
    """
    data = fbConn.get('words', None)
    if not data:
        # Nothing came back for the node; without this guard the
        # .values() call below fails on None.
        return []
    return [value['term'] for value in data.values()]

# Quick check: show the terms currently returned for the 'words' node.
print(get_words_to_index())

In [None]:
# @title Save, for each link, the urls that navigate to it (used as an index of relevance)

def save_links_pointing(url_pointing, links_pointed_at):
    """Record, for every link on a page, that the page points to it.

    For each link an entry is written to
    ``pointed/<hash of link>/pointed_from``. Each link is resolved against
    ``url_pointing`` before hashing, so a relative href gets the same key
    as the absolute url that page records are stored under.

    This is best effort: failures are printed and never raised, so a
    database hiccup does not stop the caller.

    NOTE(review): the entry key is the hash of the already hashed pointing
    url while the value is the single hash. Kept as is to stay compatible
    with stored data - confirm whether the double hash is intended.

    Args:
        url_pointing: Url of the page that contains the links.
        links_pointed_at: Iterable of hrefs (absolute or relative) found on
            that page.
    """
    try:
        # Both values depend only on the pointing url: compute them once.
        hashed_pointing = hash_url(url_pointing)
        pointing_key = hash_url(hashed_pointing)

        for link in links_pointed_at:
            try:
                hashed_link = hash_url(urljoin(url_pointing, link))
                fbConn.put("pointed/" + hashed_link + "/pointed_from", pointing_key, data=hashed_pointing)
            except Exception as exc:
                # Keep going with the remaining links
                print("Error HABIBI ", url_pointing, exc)
    except Exception as exc:
        print("ERRORRRR", exc)

In [None]:
# @title Scraper Function

def scrape_bfs(start_url, max_pages=None):
    """Crawl a single website breadth-first and index every page reached.

    Starting from ``start_url``, each page is searched for the words
    returned by ``get_words_to_index`` (fetched once, before the crawl),
    its outgoing links are recorded, and the links that stay on the domain
    of ``start_url`` are queued for a later visit. The crawl never leaves
    that domain.

    Links are resolved against the page they were found on and their
    ``#fragment`` is removed before they are recorded or queued. The
    fragment is never sent to the server, so urls that differ only by
    fragment are the same document.

    Args:
        start_url: Url the crawl starts from; its domain bounds the crawl.
        max_pages: Optional upper bound on the number of pages visited.
            ``None`` (the default) crawls until no unvisited link is left.
    """
    # Imported here because only this function needs it.
    from urllib.parse import urldefrag

    visited = set()
    # Urls that were ever queued, so each one enters the queue only once
    enqueued = {start_url}
    queue = deque([start_url])

    # The words to search for are the same for every page
    queries = get_words_to_index()

    while queue:
        if max_pages is not None and len(visited) >= max_pages:
            break

        url = queue.popleft()
        if url in visited:
            continue
        # Mark before processing to prevent loops in the link graph
        visited.add(url)

        try:
            links = extract_links(url)
            print("Visiting: ", url, "\nLinks Extracted: ", len(links))

            # Absolute, fragment-free targets without duplicates, in page order
            targets = list(dict.fromkeys(
                urldefrag(urljoin(url, link)).url for link in links
            ))

            # Search the words and save the counts to firebase
            search_words(url, queries)

            # Record the current url as a page that points to each target
            save_links_pointing(url, targets)
        except requests.RequestException as exc:
            # A page that cannot be downloaded is skipped, not fatal
            print("Skipping: ", url, " ", exc)
            continue

        for target in targets:
            if target in enqueued or target in visited:
                continue
            if is_same_domain(start_url, target):
                enqueued.add(target)
                queue.append(target)



In [None]:
# @title Start Scrape

# Start the crawl at the Tencent English home page; scrape_bfs keeps going
# until its queue of same-domain links is empty.
scrape_bfs('https://www.tencent.com/en-us/')