# In [6]:
import csv
import logging
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time

# In [7]:
def scrape_legislation_content(
    url,
    output_file,
    log_file,
    *,
    base_url="https://www.legislation.gov.uk",
    href_prefix="/ukpga/2016/19/",
    geckodriver_path="/opt/homebrew/bin/geckodriver",
    page_delay=3,
):
    """Scrape every linked page of a contents page into a CSV file.

    Loads ``url`` in headless Firefox, collects each distinct ``<a href>``
    whose value starts with ``href_prefix``, visits ``base_url + href`` and
    writes one CSV row per ``<p>`` found inside the page's
    ``<div id="viewLegContents">``.

    Args:
        url: Address of the contents page whose links are followed.
        output_file: Path of the CSV file to (over)write. Columns are
            ``Section`` (the href), ``Title`` (first ``<h2>`` text, or
            ``'No Title'``) and ``Paragraph``.
        log_file: Path of the file that receives ERROR-level log records.
        base_url: Site root prepended to each relative href.
        href_prefix: Only hrefs starting with this string are followed.
        geckodriver_path: Location of the geckodriver executable.
        page_delay: Seconds to wait after each page load before reading
            the rendered HTML.

    Returns:
        None. Failures while scraping are logged to ``log_file`` and not
        re-raised (best-effort behaviour); the browser is always closed.
    """
    # NOTE: basicConfig is a no-op when the root logger already has
    # handlers, so only the first call in a process chooses the log file.
    logging.basicConfig(
        filename=log_file,
        level=logging.ERROR,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    options = Options()
    # The ``options.headless`` setter is deprecated and was removed in
    # newer Selenium 4 releases, where assigning it silently does nothing.
    # Passing the browser flag works on every Selenium 4 version.
    options.add_argument("-headless")
    service = Service(geckodriver_path)
    driver = webdriver.Firefox(service=service, options=options)

    try:
        driver.get(url)
        time.sleep(page_delay)  # give client-side rendering time to finish

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The contents page may link to the same target more than once;
        # dict.fromkeys drops repeats while keeping first-seen order so each
        # page is fetched and written exactly once.
        hrefs = list(dict.fromkeys(
            link['href']
            for link in soup.find_all('a', href=True)
            if link['href'].startswith(href_prefix)
        ))

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Section', 'Title', 'Paragraph'])

            for href in hrefs:
                full_url = base_url + href
                driver.get(full_url)
                time.sleep(page_delay)
                page_soup = BeautifulSoup(driver.page_source, 'html.parser')
                content = page_soup.find('div', id='viewLegContents')

                if content is None:
                    logging.error(f"No content found on {full_url}")
                    continue

                # Look the heading up once instead of twice.
                heading = content.find('h2')
                if heading is not None:
                    section_title = heading.get_text(strip=True)
                else:
                    section_title = 'No Title'

                for paragraph in content.find_all('p'):
                    csvwriter.writerow(
                        [href, section_title, paragraph.get_text(strip=True)]
                    )

    except Exception as e:
        # Top-level boundary: keep the best-effort contract (no re-raise)
        # but record the traceback so failures can be diagnosed.
        logging.exception("An error occurred: %s", e)

    finally:
        driver.quit()

# Example usage
# Guarded so that importing this file does not launch a browser and start a
# full crawl; running it as a script or notebook cell still executes it.
if __name__ == "__main__":
    url = "https://www.legislation.gov.uk/ukpga/2016/19/contents"
    output_file = "legislation_content.csv"
    log_file = "scrape_errors.log"
    scrape_legislation_content(url, output_file, log_file)