# Scraping data from HSE website

The HSE website is fairly unstructured, so required a heavily tailored approach. 

PDFs tended to contain guidance more so than HTMLs, hence the focus changed to scraping only PDFs.

In [215]:
from typing import List
from urllib import request
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import datetime
from datetime import date
from typing import List
from xmlrpc.client import Boolean

TODAY_STR = date.today().strftime("%d%m%y")

# Set max column widths
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

  pd.set_option('max_colwidth', -1)


### Scraping text from htmls under Guidance section of HSE

In [2]:
# Import data from website
URL = "https://www.hse.gov.uk/guidance/index.htm"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

In [2]:
# Define a function to scrape links from an html page
def get_links(page, id_string : str):
    """
    This is a function to scrape links from an html page
    """
    links = []
    content = soup.find(id = id_string)
    link_list = [a.get("href") for a in content.find_all("a")]
    refined_list = [link.replace("..", "https://www.hse.gov.uk") for link in link_list]
    # Delete links after https://www.hse.gov.uk/work-at-height/ladders/index.htm
    refined_list = refined_list[:refined_list.index('https://www.hse.gov.uk/work-at-height/ladders/index.htm') + 1]
    links.extend(refined_list)
    return links


In [3]:
# Strings that appear in scraped text that can be dropped
strings_to_drop = ["Help us improve our website\nWe want to find out more about who uses this part of our website and what they think of it.\nYou can help us by completing a short survey.\nGo to survey\n\n",
"\n"]

In [4]:
# Define function to get text from links
def get_text_from_list_of_links(list_of_urls):
    """
    This function requests information from a list of URLs
    It then scrapes text one at a time from the URLs based on id = contentContainer
    The text is then added to a list as a dictionary and the list is returned
    """
    new_link_list = []
    for url in list_of_urls:
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            elements = soup.find(id = "contentContainer")
            requirements = ' '.join([r.text.strip() for r in elements])
            for string in strings_to_drop:
                if string in requirements:
                    requirements = requirements.replace(string, " ")
                else: 
                    continue
            new_link_list.append({"Regulation" : requirements, "URL" : url})
        except:
            continue
    return new_link_list

In [95]:
# Define a function to get content from the scraped links
def get_content_from_link(links_list : List):
    content_list = []
    for url in links_list:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        # Scrape content from current URL and make note of the URL
        elements = soup.find_all("article")
        overview = ' '.join([o.text.strip() for o in elements])
        for string in strings_to_drop:
            overview = overview.replace(string, " ")
        content_list.append({"Regulation" : overview, "URL" : url})

        # Find URLs within URL and make note of URL
        contained_URLs = soup.find('div', class_='column twoThird noBoxPadding')
        refined_list = []
        if contained_URLs is not None:
            more_URLs = [a.get("href") for a in contained_URLs.find_all("a")]
            refined_list = ["https://www.hse.gov.uk/" + url.split("/")[3] + "/" + url.split("/")[4] + "/" + link for link in more_URLs]
            refined_set = list(set(refined_list))
            # Put the new links in a list without the first element as this is the original page link
            new_links_from_each_page = refined_set[1 : ]

            # Scrape content from URLs in the page
            additional_content_list = get_text_from_list_of_links(new_links_from_each_page)
            content_list.extend(additional_content_list)
        else:
            continue

    # Convert to dataframe
    output_with_dups_df = pd.DataFrame(content_list)

    # Some don't contain article so scraping is done now
    empty_rows_df = output_with_dups_df[output_with_dups_df["Regulation"] == ""]
    links_from_empty_rows = [link for link in empty_rows_df["URL"]]
    for url in links_from_empty_rows:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        try:
            elements = soup.find(id = "contentContainer")
            contentContainerUrls = [a.get("href") for a in elements.find_all("a")]
            # Drop unneccessary links
            contentContainerUrls = [link for link in contentContainerUrls if "https" not in link]
            # Create the urls to query
            refined_content_container_list = ["https://www.hse.gov.uk/" + url.split("/")[3] + "/" + link for link in contentContainerUrls]
            # Scrape the text from the links
            additional_content_list = get_text_from_list_of_links(refined_content_container_list)
            # Add to the overall list
            content_list.extend(additional_content_list)
        except:
            continue

    # Convert to dataframe again
    output_with_dups_df = pd.DataFrame(content_list)
   
   # Output to dataframe and drop na's
    output_df = output_with_dups_df.drop_duplicates(keep = "first").reset_index(drop = True)

    return output_df

In [96]:
# Get content from links
links = get_links(page, "contentContainer")
content_df = get_content_from_link(links)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [98]:
# View the df
content_df

Unnamed: 0,Regulation,URL
0,"As an employer, you must appoint a competent person or people to help you meet your health and safety legal duties. What a competent person does They should have the skills, knowledge and experience to be able to recognise hazards in your business and help you put sensible controls in place to protect workers and others from harm. Qualifications and training It's not usually essential for them to have formal qualifications and they're not required by law to have formal training, although it can help. Who you can appoint You could appoint (one or a combination of): yourself one or more of your workers someone from outside your business Usually, managing health and safety isn't complicated and you can do it yourself with the help of your workers. You know your workplace best and the risks associated with it. If there's a competent person within your workforce, use them rather than a competent person from outside your business. Using a consultant or adviser If your business or organisation doesn't have the competence to manage health and safety in-house, for example, if it's large, complex or high risk, you can get help from a consultant or adviser. But remember, as the employer, managing health and safety will still be your legal duty. More on competence",https://www.hse.gov.uk/simple-health-safety/gettinghelp/index.htm
1,"1. Overview The law says that every business must have a policy for managing health and safety. A health and safety policy sets out your general approach to health and safety. It explains how you, as an employer, will manage health and safety in your business. It should clearly say who does what, when and how. If you have five or more employees, you must write your policy down. If you have fewer than five employees you do not have to write anything down, but it is useful to do so. You must share the policy, and any changes to it, with your employees. Next page How to write your policy",https://www.hse.gov.uk/simple-health-safety/policy/index.htm
2,"Prepare a health and safety policy Overview How to write your policy The law 1. Overview The law says that every business must have a policy for managing health and safety. A health and safety policy sets out your general approach to health and safety. It explains how you, as an employer, will manage health and safety in your business. It should clearly say who does what, when and how. If you have five or more employees, you must write your policy down. If you have fewer than five employees you do not have to write anything down, but it is useful to do so. You must share the policy, and any changes to it, with your employees. Next page How to write your policy Related content The basics for your business: Health and safety made simple Risk assessment",https://www.hse.gov.uk/simple-health-safety/policy/index.htm#article
3,"Prepare a health and safety policy Overview How to write your policy The law 2. How to write your policy Your policy should cover three areas. Part 1: Statement of intent State your general policy on health and safety at work, including your commitment to managing health and safety and your aims. As the employer or most senior person in the company, you should sign it and review it regularly. Part 2: Responsibilities for health and safety List the names, positions and roles of the people in your business who have specific responsibility for health and safety. Part 3: Arrangements for health and safety Give details of the practical arrangements you have in place, showing how you will achieve your health and safety policy aims. This could include, for example, doing a risk assessment, training employees and using safety signs or equipment. Worked example and template To help you write your own health and safety policy, you can use this example and template. Previous page Overview Next page The law Related content The basics for your business: Health and safety made simple Risk assessment",https://www.hse.gov.uk/simple-health-safety/policy/how-to-write-your-policy.htm#article
4,1. Overview Employers must make sure employees get immediate help if taken ill or injured at work. The law applies to every workplace and to the self-employed. You must have: a suitably stocked first aid kit an appointed person or people to take charge of first aid arrangements information for all employees telling them about first aid arrangements Next page Assess your first aid needs,https://www.hse.gov.uk/simple-health-safety/firstaid/index.htm
...,...,...
129,First aid in work Overview Assess your first aid needs Appoint someone to take charge of first aid What to put in a first aid kit First aiders and training First aid for homeworkers and co-working spaces First aid in detail 1. Overview Employers must make sure employees get immediate help if taken ill or injured at work. The law applies to every workplace and to the self-employed. You must have: a suitably stocked first aid kit an appointed person or people to take charge of first aid arrangements information for all employees telling them about first aid arrangements Next page Assess your first aid needs Related content The basics for your business: Health and safety made simple RIDDOR First aid offshore First aid for divers,https://www.hse.gov.uk/simple-health-safety/firstaid/index.htm
130,"Display the health and safety law poster If you employ anyone, you must either: display the health and safety law poster where your workers can easily read it provide each worker with the equivalent health and safety law leaflet The poster explains British health and safety laws and lists what workers and their employers should do. You can add details of any employee safety representatives or health and safety contacts. Resources Law poster and leaflet",https://www.hse.gov.uk/simple-health-safety/display.htm
131,"Get insurance for your business Overview How do you get employers' liability insurance? 1. Overview If your business has employees, you will probably need employers' liability insurance. If an employee is injured or becomes ill as a result of the work they do for you, they can claim compensation from you. Employers' liability insurance will help you to pay any compensation. Find out more about employers' liability insurance on the GOV.UK website. Next page How do you get employers' liability insurance? Resources Employers' Liability (Compulsory Insurance) Act 1969: A brief guide for employers Related content The basics for your business: Health and safety made simple Financial Conduct Authority",https://www.hse.gov.uk/simple-health-safety/insurance/index.htm
132,"Health and safety at work: criminal and civil law Overview Health and safety law (criminal law) Civil law - compensation claims 1. Overview Both criminal and civil law apply to workplace health and safety. They're not the same. As an employer, you must protect your workers and others from getting hurt or ill through work. If you don't: a regulator such as the Health and Safety Executive (HSE) or local authority may take action against you under criminal law the person affected may make a claim for compensation against you under civil law Neither HSE nor local authorities have responsibility for applying civil law or setting the rules for the conduct of civil cases. Industry specific health and safety legislation Next page Health and safety law (criminal law) Related content Publications to help you interpret the law Acts of Parliament Statutory instruments (regulations) Health and Safety at Work etc Act 1974",https://www.hse.gov.uk/simple-health-safety/law/index.htm


In [99]:
# View content
content_df[content_df["Regulation"] == ""]

Unnamed: 0,Regulation,URL
29,,https://www.hse.gov.uk/home-working/index.htm
30,,https://www.hse.gov.uk/brexit/index.htm?utm_source=hse.gov.uk&utm_medium=refferal&utm_campaign=EU-Exit&utm_term=brexit-home&utm_content=news-page
31,,https://www.hse.gov.uk/lone-working/index.htm
32,,https://www.hse.gov.uk/asbestos/index.htm
33,,https://www.hse.gov.uk/toolbox/height.htm
34,,https://www.hse.gov.uk/toolbox/ppe.htm
35,,https://www.hse.gov.uk/toolbox/fire.htm
36,,https://www.hse.gov.uk/riddor/index.htm
42,,https://www.hse.gov.uk/simple-health-safety/index.htm


In [101]:
# Output content to excel
content_df.to_excel("guidance.xlsx")

### Scraping text from html pages and pdfs from Industries section on HSE website

In [5]:
# Import data from website
industry_URL = "https://www.hse.gov.uk/guidance/industries.htm"

In [49]:
# Get industry links

# Define a function to scrape links from an html page
def get_links(URL, id_string : str, string_you_want_replaced : str):
    """
    This is a function to scrape links from an html page
    """
    links = []
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    content = soup.find(id = id_string)
    try:
        link_list = [a.get("href") for a in content.find_all("a")]
        refined_list = ["https://www.hse.gov.uk" + link.replace(string_you_want_replaced, "") for link in link_list]
        # Delete links if https://www.hse.gov.uk/ not in it
        refined_list = [link for link in refined_list if "https://www.hse.gov.uk/" in link]
        links.extend(refined_list)
    except:
        return   
    return links

In [7]:
# Getting the links
industry_links = get_links(industry_URL, "contentContainer", "..")

In [9]:
# Define a function to get content from the scraped links
def get_content_from_link(links : List, remove_unnecessary_links_with_string : str) -> pd.DataFrame:
    content_list = []
    for url in links:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        try:
            elements = soup.find(id = "contentContainer")
            contentContainerUrls = [a.get("href") for a in elements.find_all("a")]
            
            # Drop unneccessary links
            contentContainerUrls = [link for link in contentContainerUrls if remove_unnecessary_links_with_string not in link]
            # Create the urls to query
            refined_content_container_list = [link for link in contentContainerUrls]
            # Scrape the text from the links
            additional_content_list = get_text_from_list_of_links(refined_content_container_list)
            # Add to the overall list
            content_list.extend(additional_content_list)
        except:
            continue
    output_df = pd.DataFrame(content_list)
    return output_df

In [106]:
# Scrape text information from the links
industry_content_df = get_content_from_link(industry_links, "https")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [108]:
# View industry contents
industry_content_df.to_excel("industry.xlsx")

In [8]:
# Defining a function to extract pdfs from a multiple links on the pdf_url page input

def extract_pdfs(pdf_url, string_you_want_replaced : str, extractingPDFsFromTopicsPage : Boolean):
    content_list = []
    url_list = []
    links_to_scrape = get_links(pdf_url, "contentContainer", string_you_want_replaced)
    for url in links_to_scrape:
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            elements = soup.find(id = "contentContainer")
            contentContainerUrls = [a.get("href") for a in elements.find_all("a")]
            # Drop unneccessary links
            contentContainerUrls = [link for link in contentContainerUrls if "pdf" in link]
            # Create the pdf urls
            if extractingPDFsFromTopicsPage:
                refined_content_container_list = [url.replace("index.htm", "") + link for link in contentContainerUrls]
            else:
                refined_content_container_list = ["https://www.hse.gov.uk/" + link.replace("../", "") for link in contentContainerUrls]
            # Add to the overall list
            content_list.extend(refined_content_container_list)
            url_list.extend([url]*len(refined_content_container_list))
        except:
            continue
    return url_list, content_list

In [9]:
# Scrape PDFs from industry
industry_url_list, industry_pdfs = extract_pdfs(industry_URL, "..", False)
industry_pdfs_df = pd.DataFrame({"PDFs" : industry_pdfs, "URLs" : industry_url_list})

In [12]:
# Scrape PDFs from scraped links within each link in industry page
within_industry_links = []
within_industry_pdfs = []
for link in industry_links:
    try:
        l, p = extract_pdfs(link, "..", False)
        within_industry_links.extend(l)
        within_industry_pdfs.extend(p)
    except:
        continue
within_industry_pdfs_df = pd.DataFrame({"PDFs" : within_industry_pdfs, "URLs" : within_industry_links})

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [15]:
# Add the two dfs together
final_industry_pdf_df = pd.concat([within_industry_pdfs_df, industry_pdfs_df]).reset_index(drop = True)

In [149]:
# Output to excel file
final_industry_pdf_df.to_excel(f"{TODAY_STR}-industry_PDF.xlsx")

In [126]:
# View topic contents
industry_pdfs_df.to_excel(f"{TODAY_STR}-industryPDFs.xlsx")

### Scrape pdfs from Topics

In [17]:
# Import data from website
topics_url = "https://www.hse.gov.uk/guidance/topics.htm" 

In [18]:
# Get topics links
topics_links = get_links(topics_url, "contentContainer", "..")

In [19]:
# Get list of pdfs from first page of each topics_link, if there is no pdf then open links inside and scrape that data
topics_pdfs_urls, topics_pdf_links = extract_pdfs(topics_url, "..", True)

In [131]:
# Now open links from Resource section and scrape text information from those links
topics_content_df = get_text_from_list_of_links(topics_links)

In [132]:
# View topic contents
pd.DataFrame(topics_content_df).to_excel(f"{TODAY_STR}-topics.xlsx")

In [20]:
# Convert topic pdfs links to excel
topics_pdfs_df = pd.DataFrame({"PDFs" : topics_pdf_links, "URLs" : topics_pdfs_urls})

In [None]:
topics_pdfs_df.to_excel(f"{TODAY_STR}-topicsPDFs.xlsx")

### And scrape PDFs from within links in topics

In [21]:
# Scrape PDFs from scraped links within each link in industry page
within_topics_links = []
within_topics_pdfs = []
for link in topics_links:
    try:
        l, p = extract_pdfs(link, "..", False)
        within_industry_links.extend(l)
        within_industry_pdfs.extend(p)
    except:
        continue
within_topics_pdfs_df = pd.DataFrame({"PDFs" : within_topics_pdfs, "URLs" : within_topics_links})

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [23]:
# Concat the two dfs together
final_topic_pdf_df = pd.concat([topics_pdfs_df, within_topics_pdfs_df]).reset_index(drop = True)

In [147]:
# Convert to excel
final_topic_pdf_df.to_excel(f"{TODAY_STR}-topics_PDFs.xlsx")

## Download all PDFs from publications page

In [44]:
# Define publications url
publications_url = "https://www.hse.gov.uk/pubns/index.htm"

In [71]:
# Publications page's relative links are structured differently so a fix is used instead of the get_links function
pubns_page = requests.get(publications_url)
pubns_soup = BeautifulSoup(pubns_page.content, "html.parser")
pubns_content = pubns_soup.find(id = "contentContainer")
pubns_link_list = [a.get("href") for a in pubns_content.find_all("a")]
pubns_link_list_dropped = [link for link in pubns_link_list if len(link) > 2]
publication_links_to_scrape = ["https://www.hse.gov.uk/pubns/" + link for link in pubns_link_list_dropped]

In [73]:
# Get PDFs from publications page
publications_urls, list_of_pdfs = extract_pdfs(publications_url, "..", False)
puiblications_pdfs_df = pd.DataFrame({"PDFs" : list_of_pdfs, "URLs" : publications_urls})

In [110]:
# Scrape PDFs from scraped links within each link in industry page
within_publications_links = []
within_publications_pdfs = []
pubns_website_links = []
for link in publication_links_to_scrape:
    try:
        l, p = extract_pdfs(link, "..", False)
        pubns_links = [item.replace(item, link) for item in l]
        within_publications_links.extend(l)
        within_publications_pdfs.extend(p)
        pubns_website_links.extend(pubns_links)
    except:
        continue
within_pubications_pdfs_df = pd.DataFrame({"PDFs" : within_publications_pdfs, "URLs" : within_publications_links, "Links" : pubns_website_links})

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [111]:
# Concat the two dfs together
final_publications_pdf_df = pd.concat([puiblications_pdfs_df, within_pubications_pdfs_df]).reset_index(drop = True)

In [101]:
# View a sample
final_publications_pdf_df.sample(n = 20)

Unnamed: 0,PDFs,URLs
754,https://www.hse.gov.uk/indg293.pdf,https://www.hse.gov.uk/pubns/indg293.htm
586,https://www.hse.gov.uk/content/science-evidence-delivery-21-24.pdf,https://www.hse.gov.uk/research/index.htm
634,https://www.hse.gov.uk/content/science-evidence-strategy-1622.pdf,https://www.hse.gov.uk/research/index.htm
458,https://www.hse.gov.uk/content/science-review-2022.pdf,https://www.hse.gov.uk/research/index.htm
263,https://www.hse.gov.uk/content/hse-areas-of-research-interest.pdf,https://www.hse.gov.uk/research/index.htm
683,https://www.hse.gov.uk/horizons/assets/documents/foresight-report-2019.pdf,https://www.hse.gov.uk/research/index.htm
614,https://www.hse.gov.uk/overall/hssh2021.pdf,https://www.hse.gov.uk/statistics/index.htm
262,https://www.hse.gov.uk/content/science-evidence-investment-plan-20.pdf,https://www.hse.gov.uk/research/index.htm
200,https://www.hse.gov.uk/content/science-review-2022.pdf,https://www.hse.gov.uk/research/index.htm
656,https://www.hse.gov.uk/content/science-review-2022.pdf,https://www.hse.gov.uk/research/index.htm


In [146]:
# Output to excel
final_publications_pdf_df.to_excel(f"{TODAY_STR}-Publications_PDFs.xlsx")

### Download all PDFs from COSHH page

In [33]:
# Define COSHH url
coshh_url = "https://www.hse.gov.uk/pubns/guidance/index.htm"

coshh_links = get_links(coshh_url, "contentContainer", "../..")

In [34]:
# Get PDFs from coshh page
coshhurls, coshh_list_of_pdfs = extract_pdfs(coshh_url, "../..", False)
coshh_pdf = pd.DataFrame({"PDFs" : coshh_list_of_pdfs, "URLs" : coshhurls})

In [35]:
# Scrape PDFs from scraped links within each link in industry page
within_coshh_links = []
within_coshh_pdfs = []
for link in coshh_links:
    try:
        l, p = extract_pdfs(link, "..", False)
        within_coshh_links.extend(l)
        within_coshh_pdfs.extend(p)
    except:
        continue
within_coshh_pdfs_df = pd.DataFrame({"PDFs" : within_coshh_pdfs, "URLs" : within_coshh_links})

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [36]:
# Concat the two dfs together
final_coshh_pdf_df = pd.concat([coshh_pdf, within_coshh_pdfs_df]).reset_index(drop = True)

In [145]:
# Output to excel file
final_coshh_pdf_df.to_excel(f"{TODAY_STR}-COSHH_PDFs.xlsx")

### Scrape full catelogue (may be overlap with already scraped pubns)

In [140]:
# Scraping catelogue section
def catelogue_pubns_pdf_scraper(URL):
    # Scrape links from initial page
    catelogue_page = requests.get(URL)
    catelogue_soup = BeautifulSoup(catelogue_page.content, "html.parser")
    catelogue_content = catelogue_soup.find(id = "contentContainer")
    catelogue_link_list = [a.get("href") for a in catelogue_content.find_all("a")]
    catelogue_link_list_dropped = catelogue_link_list[: -1]
    catelogue_links_to_scrape = ["https://www.hse.gov.uk/pubns/books/" + link for link in catelogue_link_list_dropped]

    # Scrape links from secondary pages
    total_catelogue_links_to_scrape_for_pdfs = []
    for link in catelogue_links_to_scrape:
        c_page = requests.get(link)
        c_soup = BeautifulSoup(c_page.content, "html.parser")
        c_content = c_soup.find(id = "contentContainer")
        c_link_list = [a.get("href") for a in c_content.find_all("a")]
        for c_link in c_link_list:
            if "../" in c_link:
                c_full_links_list = ["https://www.hse.gov.uk/pubns/" + c_link.replace("../", "") for c_link in c_link_list]
            else:
                c_full_links_list = ["https://www.hse.gov.uk/pubns/books/" + c_link for c_link in c_link_list]
        total_catelogue_links_to_scrape_for_pdfs.extend(c_full_links_list)

    # Scrape pdfs from each of the secondary page links
    content_list = []
    url_list = []
    for url in total_catelogue_links_to_scrape_for_pdfs:
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            elements = soup.find(id = "contentContainer")
            contentContainerUrls = [a.get("href") for a in elements.find_all("a")]
            # Drop unneccessary links
            contentContainerUrls = [link for link in contentContainerUrls if "pdf" in link]
            # Create the pdf urls
            refined_content_container_list = ["https://www.hse.gov.uk/pubns" + link.replace("..", "") for link in contentContainerUrls]
            # Add to the overall list
            content_list.extend(refined_content_container_list)
            url_list.extend([url]*len(refined_content_container_list))
        except:
            continue
    return url_list, content_list

In [141]:
# Scrape catelogue url and get pdfs
catelogue_urls, catelogue_pdfs = catelogue_pubns_pdf_scraper("https://www.hse.gov.uk/pubns/books/index-catalogue.htm")

In [142]:
# Convert to df
catelogue_pdf_df = pd.DataFrame({"URLs" : catelogue_urls, "PDFs" : catelogue_pdfs})

### Scrape pdfs from construction topic

In [219]:
# Scrape construction topic
construction_url = "https://www.hse.gov.uk/construction/resources/freeleaflets.htm"

construction_links = get_links(construction_url, "contentContainer", "../..")
# Select only pdfs
construction_pdfs = [pdf for pdf in construction_links if ".pdf" in pdf]

In [220]:
# construction resources pdfs
construction_resources_df = pd.DataFrame({"PDFs" : construction_pdfs})
construction_resources_df["URLs"] = "https://www.hse.gov.uk/construction/resources/freeleaflets.htm"

In [221]:
# Get PDFs from links in construction page
construction_urls1, construction_list_of_pdfs1 = extract_pdfs(construction_url, "../..", False)
construction_urls2, construction_list_of_pdfs2 = extract_pdfs("https://www.hse.gov.uk/construction/areyou/index.htm", "../..", False)

construction_urls = construction_urls1 + construction_urls2
construction_list_of_pdfs = construction_list_of_pdfs1 + construction_list_of_pdfs2

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [222]:
# construction df
construction_df = pd.DataFrame({"URLs" : construction_urls, "PDFs": construction_list_of_pdfs})

In [223]:
# Get more pdfs from https://www.hse.gov.uk/construction/resources/guidance.htm
guidance_construction_urls, guidance_construction_list_of_pdfs = extract_pdfs("https://www.hse.gov.uk/construction/resources/guidance.htm", "../..", False)

In [224]:
# guidance construction df
guidance_construction_df = pd.DataFrame({"URLs" : guidance_construction_urls, "PDFs": guidance_construction_list_of_pdfs})

In [225]:
# Combine all dfs
combined_construction_df = pd.concat([construction_resources_df, construction_df, guidance_construction_df]).drop_duplicates(keep = "first").reset_index(drop = True)

In [226]:
# Convert construction df to excel
combined_construction_df.to_excel(f"/Users/thomas/Documents/BEIS/scraper/scraped_data/{TODAY_STR}-construction_PDFs.xlsx")

In [144]:
# Convert catalogue df to excel
catelogue_pdf_df.to_excel(f"{TODAY_STR}-catelogue_PDFs.xlsx")

### Scraping all topics and descriptions in an effort to map to GDS taxonomy

In [193]:
# Get names and descriptions from topics (part of taxonomy experiment)
url = "https://www.hse.gov.uk/guidance/topics.htm"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

content = soup.find(id = "contentContainer")
topic_list = [a.text for a in content.find_all("a")]
description_list = [a.get("title") for a in content.find_all("a")]

topics_list_df = pd.DataFrame({"Topics" : topic_list, "Descriptions" : description_list})
topics_list_df = topics_list_df[topics_list_df["Topics"].str.len() > 2]

In [195]:
# Output df to excel
topics_list_df.to_excel("/Users/thomas/Documents/BEIS/taxonomy/input_data/ListOfTopicsWithDescriptions.xlsx")