# Import All Required Libraries and Packages

In [1]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os
import requests
import fitz
import tempfile

from selenium import webdriver
from selenium.webdriver.common.by import By

from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

# Current Year

In [2]:
# Calendar year taken from the UTC clock (time.gmtime); it is used further
# down as the last folder component of each output path.
current_year = time.gmtime().tm_year
current_year  # bare expression: only echoes the value as notebook cell output

2024

# Setting Important HREF Links for the Daily Updates Tabs

In [3]:
# Maps a region label to that region's "new legislation" listing page on
# legislation.gov.uk. The scraping loop iterates this dict and passes each
# URL to get_daily_update(); the keys are only used in log output.
URL_DAILY_UPDATE = {
    'UK' : 'https://www.legislation.gov.uk/new/uk',
    'Wales' : 'https://www.legislation.gov.uk/new/wales',
    'Scotland' : 'https://www.legislation.gov.uk/new/scotland',
    'Northern Ireland' : 'https://www.legislation.gov.uk/new/ni',
}

# Defining Some Functions Which Will Be Used for the Daily Updates

In [4]:
def verify_daily_update(driver):
    '''
    Check whether the tab currently loaded in `driver` lists a new daily update.

    Reads the text of the first <h5> inside the element with class
    'p_content' and returns False when it is exactly the
    'Nothing published on this date' placeholder, True otherwise.
    '''
    content_panel = driver.find_element(By.CLASS_NAME, 'p_content')
    heading_text = content_panel.find_element(By.TAG_NAME, 'h5').text
    return heading_text != 'Nothing published on this date'

def extract_content_from_pdf(pdf_url, timeout=30):
    '''
    Download the PDF at `pdf_url` and return the text of all its pages.

    Parameters:
        pdf_url: URL of the PDF document.
        timeout: seconds to wait for the server before giving up
                 (passed straight to requests.get).

    Returns:
        A single string with the text of every page, concatenated in page order.

    Raises:
        requests.HTTPError if the server answers with an error status, and
        whatever requests / fitz raise on network or parsing failures.
    '''
    # Download once; the PDF used to be fetched twice.
    response = requests.get(pdf_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an HTML error page to the PDF parser.
    response.raise_for_status()

    # Open the document straight from memory so no temporary file is left on disk.
    pdf_document = fitz.open(stream=response.content, filetype='pdf')
    try:
        return ''.join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        pdf_document.close()

def extract_content(driver, title_url):
    '''
    Extract the full text of one legislation title.

    Note: this is the same routine as in Scrapper.ipynb.

    The title page is loaded and the second <li> of the '#legSubNav' bar is
    inspected. If it holds a link, that link is followed and the text of every
    provision page is collected by clicking the last link of the
    'prevNextNav' panel until it cannot be found or clicked. If it holds no
    link, the title is treated as PDF-only and the PDF linked from
    'div.LegSnippet' is downloaded instead.

    Parameters:
        driver: a Selenium WebDriver; it is navigated away from its current page.
        title_url: URL of the title's page.

    Returns:
        The concatenated text of all provision pages, or the PDF's text.
    '''
    driver.get(title_url)
    time.sleep(1)
    Title_Content_Div = driver.find_element(By.CSS_SELECTOR, 'div.legToc')
    NavBar = Title_Content_Div.find_element(By.ID, 'legSubNav')
    NavBarLists = NavBar.find_elements(By.TAG_NAME, 'li')
    ContentTab = NavBarLists[1]  # It may be clickable or not. If not, the media type is PDF not text

    # `except Exception` rather than a bare `except` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are no longer swallowed.
    try:
        Content_Link_Tag = ContentTab.find_element(By.TAG_NAME, 'a')
    except Exception:
        Content_Link_Tag = None

    if Content_Link_Tag is None:
        # No link in the content tab: fall back to the PDF version.
        Tag_PDF_href = driver.find_element(By.CSS_SELECTOR, 'div.LegSnippet').find_element(By.TAG_NAME, 'a').get_attribute('href')
        return extract_content_from_pdf(Tag_PDF_href)

    driver.get(Content_Link_Tag.get_attribute('href'))
    time.sleep(1)

    # The content may span several pages; collect each page's text in order.
    Page_Number = 1
    page_texts = []
    while True:
        Content_Box = driver.find_element(By.ID, 'content')
        Content_Text = Content_Box.find_element(By.ID, 'viewLegContents').find_element(By.CLASS_NAME, 'LegSnippet')
        page_texts.append(Content_Text.text)
        print(f'Page Number: {Page_Number}')

        # Look for the "next" link: the <a> inside the last <li> of the nav panel.
        Button_Panel = driver.find_element(By.CLASS_NAME, 'prevNextNav')
        try:
            Next_Button = Button_Panel.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')[-1].find_element(By.TAG_NAME, 'a')
            print(f'Next Button found: {Next_Button.text}')
        except Exception:
            print(f'No Next Button Found - Last Provision Page')
            print(f'Provision Page Number: {Page_Number}')
            break

        try:
            Next_Button.click()
            time.sleep(1)
            Page_Number += 1
        except Exception:
            # A link that exists but cannot be clicked is treated as the end.
            print(f'You are probably on the very last Provision page')
            print(f'Provision Page Number: {Page_Number}')
            break

    return ''.join(page_texts)

def get_daily_update(driver, url):
    '''
    Extract the daily update from a tab.

    What is a tab? Each country's new-titles page is a tab.

    Parameters:
        driver: a Selenium WebDriver; it is navigated to `url`.
        url: URL of the country's "new legislation" page.

    Returns:
        {legislation_name: {title_name: title_href, ...}} where
        legislation_name is the text of the first <h5> in the 'p_content'
        element and the titles come from its <h6> elements, or None when
        nothing was published or no titles were found.
    '''
    driver.get(url)
    time.sleep(1)

    if not verify_daily_update(driver):
        return None

    Content_div = driver.find_element(By.CLASS_NAME, 'p_content')
    # NOTE(review): only the first <h5> is read, so every <h6> on the page is
    # filed under that one heading - confirm pages never carry several headings.
    Legislation_Name = Content_div.find_element(By.TAG_NAME, 'h5').text

    titles = {}
    for title in Content_div.find_elements(By.TAG_NAME, 'h6'):
        href = title.find_element(By.TAG_NAME, 'a').get_attribute('href')
        # Split on the FIRST hyphen only: taking the last '-'-separated piece
        # truncated titles that contain a hyphen themselves (e.g. "Non-Domestic").
        # Assumes the prefix before the title contains no hyphen - TODO confirm.
        name = title.text.split('-', 1)[-1].strip()
        titles[name] = href

    if not titles:
        return None
    return {Legislation_Name: titles}
        
def create_dirs(path):
    '''
    Create the directory `path`, including any missing parent directories.

    Does nothing when the directory already exists. Raises FileExistsError
    if `path` exists but is not a directory.
    '''
    # exist_ok=True replaces the separate exists() check, which could race
    # with another creator between the check and the makedirs call.
    os.makedirs(path, exist_ok=True)

In [5]:
def get_legislation_type(legislation_name):
    '''
    Find every country / legislation-type folder that contains `legislation_name`.

    Scans './Scraped_Content/<Country>/<LegislationType>/' (relative to the
    current working directory) and records each pair whose folder has an
    entry named exactly `legislation_name`.

    Returns:
        {str(legislation_name): [{'Country': ..., 'LegislationType': ...}, ...]}
        The list is empty when nothing matches.

    Raises:
        FileNotFoundError if './Scraped_Content' does not exist.
    '''
    All_Content_Folder = './Scraped_Content'
    matches = []
    for Country in os.listdir(All_Content_Folder):
        country_path = os.path.join(All_Content_Folder, Country)
        # Skip stray files: os.listdir on a non-directory would raise.
        if not os.path.isdir(country_path):
            continue
        for LegislationType in os.listdir(country_path):
            type_path = os.path.join(country_path, LegislationType)
            if not os.path.isdir(type_path):
                continue
            if legislation_name in os.listdir(type_path):
                matches.append({'Country': Country, 'LegislationType': LegislationType})
    return {str(legislation_name): matches}

import re
def replace_slashes(filename):
    '''
    Make a title safe to use as a file name by swapping each backslash for
    '__BS__' and each forward slash for '__FS__'.
    '''
    for slash, placeholder in (('\\', '__BS__'), ('/', '__FS__')):
        filename = filename.replace(slash, placeholder)
    return filename

def restore_slashes(filename):
    '''
    Turn the '__BS__' and '__FS__' placeholders back into a backslash and a
    forward slash respectively.
    '''
    return filename.replace('__BS__', '\\').replace('__FS__', '/')

def append_problem(title_name, title_path, problem):
    '''
    Append one failure record to './Problematic_Titles/problematic_files.txt'.

    The record is three lines (title name, intended file path, reason)
    followed by a '---' separator line. The folder is created if missing.

    Parameters:
        title_name: name of the title that could not be saved.
        title_path: path of the file that was being written.
        problem: description of what went wrong.
    '''
    create_dirs(
        path='./Problematic_Titles/'
    )

    # Explicit UTF-8 so a title with non-ASCII characters cannot make the
    # error log itself fail on platforms whose default encoding is narrower.
    # The `with` block closes the file; no separate close() is needed.
    with open('./Problematic_Titles/problematic_files.txt', 'a', encoding='utf-8') as f:
        f.write(f'{title_name}\n{title_path}\n{problem}\n---\n')

## From the drivers below, just uncomment the one that works for you.

In [6]:
# Create the single browser session that is passed as `driver` to every
# scraping function below. Exactly one of the two lines should be active:
# the first uses Chrome, the second uses Edge.
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))

In [7]:
# For every country tab: fetch today's new titles, download each title's text
# and save it under
# ./New_Content/<Country>/<LegislationType>/<legislation>/<year>/<title>.txt
# for every Scraped_Content folder that already holds that legislation name.
# The try/finally guarantees the browser is closed even if scraping fails.
try:
    for Country_Key, daily_update_url in URL_DAILY_UPDATE.items():
        New_Titles = get_daily_update(driver, daily_update_url)
        print(f'{Country_Key} - {New_Titles}')
        if not New_Titles:  # None or empty: nothing published for this tab
            continue

        for legislation_name, titles in New_Titles.items():
            # Depends only on legislation_name, so look it up once per
            # legislation instead of rescanning the folders for every title.
            legislation_type = get_legislation_type(legislation_name)

            for title_name, title_url in titles.items():
                # Now that we have the title url, we can extract the title data
                print(f'{Country_Key} - {legislation_type} - {legislation_name} - {title_name} - {title_url}')

                Title_Data_Content = extract_content(driver, title_url)

                for item in legislation_type[legislation_name]:
                    key_country = item['Country']
                    key_legislation_type = item['LegislationType']

                    txt_file_path = f'./New_Content/{key_country}/{key_legislation_type}/{legislation_name}/{current_year}'
                    # Slashes in a title would otherwise be read as path separators.
                    validated_file_name = replace_slashes(title_name)
                    validated_file_name = os.path.join(txt_file_path, validated_file_name+'.txt')

                    create_dirs(path=txt_file_path)

                    try:
                        # Explicit UTF-8: the scraped text may contain characters
                        # the platform default encoding cannot represent.
                        with open(validated_file_name, 'w', encoding='utf-8') as f:
                            f.write(Title_Data_Content)
                        print(f'Successfully created file: {validated_file_name}')
                    except Exception as e:
                        # Best effort: record the failure and carry on with the next file.
                        error_message = str(e)
                        print(f'Error Message: {error_message}')
                        append_problem(title_name=title_name,
                                       title_path=validated_file_name,
                                       problem=error_message)
                        print(f'Failed to create file: {validated_file_name}')
finally:
    driver.quit()

UK - {'UK Statutory Instruments': {'The Air Navigation (Restriction of Flying) (Southport, Merseyside) (Emergency) (Revocation) Regulations 2024': 'https://www.legislation.gov.uk/uksi/2024/862/contents/made', 'The Air Navigation (Restriction of Flying) (Liverpool) (Emergency) (Revocation) Regulations 2024': 'https://www.legislation.gov.uk/uksi/2024/861/contents/made', 'The Air Navigation (Restriction of Flying) (Salisbury) Regulations 2024': 'https://www.legislation.gov.uk/uksi/2024/857/contents/made', 'The Air Navigation (Restriction of Flying) (Newcastle, County Down) (Amendment) Regulations 2024': 'https://www.legislation.gov.uk/uksi/2024/856/contents/made', 'The Air Navigation (Restriction of Flying) (Shropshire) Regulations 2024': 'https://www.legislation.gov.uk/uksi/2024/855/contents/made'}}
UK - {'UK Statutory Instruments': [{'Country': 'NothernIreland', 'LegislationType': 'May contain legislation that applies to NothernIreland'}, {'Country': 'Wales', 'LegislationType': 'May con