# Scrape Text (EN-FR) from Government Websites

In [1]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from time import sleep
import random

In [2]:
# specify the url
domain = "https://www.canada.ca"
root = "https://www.canada.ca/en/employment-social-development.html"
queue = [root]; # FIFO (FBS exploration)

visited = [] # Keeps track on what page have been visited
fr_visited = []

en_contents = [] # en content scapped
fr_contents = [] # fr content scapped

In [3]:
EXTEND_TO_OUTSIDE_DOMAINS = False # Will scrape other domain content if a link referes to them
NUM_PAGES_TO_VISIT = 10000
PRINT_STEPS = 500

## Helpers

In [4]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]',"nav","footer","form","label","details","summary"]:
        return False
    elif isinstance(element, Comment):
        return False
    else:
        return True
    
    
def matches_conditions(url):
    response = True
    if url is None: return False
    if url in visited: return False
    if "canada.ca" not in url: return False
    if "employment-social-development" not in url: return False
    if "/en/" not in url: return False
    if domain not in url: return False
    
    return response

def soupify(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        return soup
    
def get_text_from_soup(soup):
        texts = soup.find_all('h1')[0].find_all_next(text=True)#findAll(text=True)
        visible_texts = filter(tag_visible, texts)  
        page_text = " ".join([t.strip() for t in visible_texts if len(t.strip().split())>5]);
        return page_text

def get_absolute_link(link):
    if link.has_attr('href'):
        if '.html' in link['href']:
            
            url = link['href']
            
            if "#" in url:
                url = url.split("#")[0]
                
            if (".ca" in link['href'] or ".com" in link['href'] or "http:" in link['href'] or "www." in link['href']):
                return url
            
            elif link['href'].startswith("/"):
                return domain+url
            
            else:
                return None
        
            
    else:
        return None

In [5]:
%%time

num_visited = 0
num_errors = 0

while(len(queue) > 0 and num_visited < NUM_PAGES_TO_VISIT):
    try:
        
        if not matches_conditions(queue[0]):
            del queue[0]
            continue
            
        soup = soupify(queue[0])

        
        for link in soup("a"):
            
            if "Français" in link.get_text(): 
                french_url = get_absolute_link(link)
                french_soup = soupify(french_url)
                
                fr_content = get_text_from_soup(french_soup)
                fr_contents.append(fr_content)
                
                en_content = get_text_from_soup(soup)
                en_contents.append(en_content)
                
                visited.append(queue[0])
                fr_visited.append(french_url)
            
            
            url = get_absolute_link(link)
            if matches_conditions(url): queue.append(url)
                       
        num_visited+=1
        
        if((num_visited)%PRINT_STEPS == 0):
            print("Page {} complete({})".format(num_visited,queue[0]))
        
        num_errors = 0
        del queue[0]
        #sleep(0.5)

    except Exception as e:
        print("error for page {}: ".format(num_visited) + str(e))
        
        num_errors += 1
        if num_errors > 5:
            del queue[0]
            
        #sleep(2)
        continue;

error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
error for page 139: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?
Page 500 complete(https://www.canada.ca/en/employment-social-development/services/funding/apprenticeship-incentive-eligibility.html)
Page 1000 complete(https://www.canada.ca/en/employment-social-development/services/health-safety/reports/complaint-resolution.html)
Page 1500 complete(https://www.canada.ca/en/employment-social-development/services/funding/canada-summer-jobs/amounts-paid-2017/ontario.html)
error for page 1962: Invalid URL 'None': No sch

In [7]:
len(en_contents),len(fr_contents),len(visited),num_visited,len(queue)

(3392, 3392, 3392, 3399, 0)

In [25]:
idx = random.randrange(len(visited))
print(visited[idx],fr_visited[idx])

https://www.canada.ca/en/employment-social-development/corporate/reports/audits/internal-audit-compliance.html https://www.canada.ca/fr/emploi-developpement-social/ministere/rapports/verification/conformite-audit-interne.html


In [18]:
import pandas as pd
df = pd.DataFrame({"en_url":visited,"fr_url":fr_visited,"en_text":en_contents,"fr_text":fr_contents})

In [20]:
df.to_csv("data/en_fr_esdc_crawl(pages - {}).csv".format(num_visited),index=False)