# In [19]:
import csv
import logging
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time

# In [18]:

def scrape_additional_info(url, csvwriter, visited_urls):
    """Crawl depth-first from *url*, writing one CSV row per scraped page.

    Each page is loaded with the module-level Selenium ``driver`` (assigned
    by ``main``), parsed with BeautifulSoup, and reduced to the first
    ``<h1>`` text and the joined text of every ``<p>`` found inside
    ``<div id="wrapper" class="wrapper">``. Links inside that div are then
    followed in document order.

    The traversal uses an explicit stack rather than recursion, so long
    link chains cannot exhaust Python's recursion limit.

    No host restriction is applied: absolute http(s) links to any site are
    followed as well.

    Args:
        url: Page at which to start crawling.
        csvwriter: Object with a ``writerow`` method; receives
            ``[page_url, title, paragraphs]`` for every scraped page.
        visited_urls: Mutable set of URLs already handled. It is updated
            in place, and URLs already in it are skipped.
    """
    # Local imports keep this block self-contained.
    from urllib.parse import urldefrag, urljoin

    from selenium.common.exceptions import WebDriverException

    # Fragments only address a position inside a page, so drop them to
    # avoid scraping the same document once per anchor.
    pending = [urldefrag(url).url]

    while pending:
        current = pending.pop()
        if current in visited_urls:
            continue
        visited_urls.add(current)

        try:
            driver.get(current)
        except WebDriverException as exc:
            # One unreachable page must not abort the whole crawl.
            logging.error(f"Failed to load {current}: {exc}")
            continue
        time.sleep(5)  # Give the page time to finish loading

        # Parse the page with BeautifulSoup
        page_soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = page_soup.find('div', id='wrapper', class_='wrapper')
        if not content:
            logging.error(f"No content found on {current}")
            continue

        # Extract the title from the <h1> tag
        title_tag = content.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else 'No Title'

        # Extract all paragraphs
        paragraphs = ' '.join(
            p.get_text(strip=True) for p in content.find_all('p')
        )

        # Write the content to the CSV
        csvwriter.writerow([current, title, paragraphs])

        # Resolve every link against the page it appears on, so that
        # root-relative, protocol-relative and relative hrefs all map to
        # the correct absolute URL.
        targets = []
        for anchor in content.find_all('a', href=True):
            target = urldefrag(urljoin(current, anchor['href'])).url
            # Skip mailto:, javascript:, tel: and similar schemes.
            if not target.startswith(('http://', 'https://')):
                continue
            if target not in visited_urls:
                targets.append(target)

        # The stack is LIFO: push in reverse so the first link on the page
        # is visited first, matching a recursive depth-first walk.
        pending.extend(reversed(targets))

def main(url, output_csv, log_file, geckodriver_path='/opt/homebrew/bin/geckodriver'):
    """Crawl from *url* with headless Firefox and save the results as CSV.

    Configures error logging, starts a Firefox WebDriver (stored in the
    module-level ``driver`` that ``scrape_additional_info`` reads), writes
    a header row followed by the scraped rows to *output_csv*, and always
    quits the browser afterwards.

    Args:
        url: Page at which to start crawling.
        output_csv: Path of the CSV file to create (overwritten if present).
        log_file: Path of the file that receives ERROR-level log records.
        geckodriver_path: Location of the geckodriver executable.
    """
    global driver

    # Set up logging
    logging.basicConfig(filename=log_file, level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

    # Set up Selenium WebDriver for Firefox
    options = Options()
    # The ``headless`` attribute setter was deprecated and later removed in
    # Selenium 4; assigning it on newer versions has no effect, so pass
    # the browser argument explicitly.
    options.add_argument("-headless")
    service = Service(geckodriver_path)
    driver = webdriver.Firefox(service=service, options=options)

    try:
        # Open the output CSV file
        with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
            csvwriter = csv.writer(outfile)
            csvwriter.writerow(['Section', 'Title', 'Content'])

            # Start scraping from the main URL
            visited_urls = set()
            scrape_additional_info(url, csvwriter, visited_urls)

    except Exception as e:
        # Top-level boundary: record the failure with its traceback so the
        # log shows where the crawl stopped.
        logging.exception(f"An error occurred: {e}")

    finally:
        # Always release the browser process, even after a failure.
        driver.quit()

# Example usage: crawl from the start page below, writing scraped rows to the
# CSV file and error records to the log file.
# NOTE(review): this runs whenever the file is executed or imported; consider
# an `if __name__ == "__main__":` guard if it is ever used as a module.
url, output_csv, log_file = (
    "https://www.gov.uk/browse/visas-immigration",
    "gov_additional_content.csv",
    "scrape_errors.log",
)
main(url=url, output_csv=output_csv, log_file=log_file)