# Import All the Required Libraries and Packages

In [1]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By

# Functions

In [2]:
def verify_target_year_existence(driver, url, legislation_name, year):
    """Navigate to ``{url}/{year}`` and report whether the navigation succeeded.

    Args:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver in this notebook).
        url: Base URL of the legislation type.
        legislation_name: Name used only in the failure message.
        year: Year segment appended to ``url``.

    Returns:
        True if ``driver.get`` returned normally, False if it raised an
        ``Exception`` (a message is printed in that case).
    """
    target_url_year = f'{url}/{year}'
    try:
        driver.get(target_url_year)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit can still interrupt a long scraping run.
        print(f'Exception: The legislation:{legislation_name} does not have any titles for the year: {year}')
        return False
    return True
    
def fetch_leg_types(driver, div_selector):
    """Collect the legislation types listed inside one column of the current page.

    Args:
        driver: Selenium WebDriver that already shows the page to read.
        div_selector: CSS selector of the column ``div`` to inspect.

    Returns:
        dict mapping each list item's visible text to the ``href`` of the
        first link inside it. Empty dict when the column contains no
        ``ul.legTypes`` list. Always a dict, because callers use
        ``.keys()`` / ``.items()`` on the result.

    Note:
        The lookup of ``div_selector`` itself is not guarded; an exception
        raised by ``driver.find_element`` propagates to the caller.
    """
    div_element = driver.find_element(By.CSS_SELECTOR, div_selector)
    leg_types_elements = div_element.find_elements(By.CSS_SELECTOR, 'ul.legTypes')
    if not leg_types_elements:
        return {}

    leg_types = {}
    for item in leg_types_elements[0].find_elements(By.TAG_NAME, 'li'):
        # find_elements (plural) gives an empty list when the item has no
        # link, so link-less items are skipped and every name stays paired
        # with its own href.
        links = item.find_elements(By.TAG_NAME, 'a')
        if links:
            leg_types[item.text] = links[0].get_attribute('href')
    return leg_types
    
def get_legislations_href(driver, country, country_home_url):
    """Load a country's browse page and read both of its legislation columns.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        country: Country name interpolated into the two result keys.
        country_home_url: URL of the country's browse page.

    Returns:
        dict with exactly two entries, in this order:
        ``'Exclusively or primarily applies to {country}'`` and
        ``'May contain legislation that applies to {country}'``. Each value
        is the dict produced by ``fetch_leg_types`` for the matching column,
        or an empty dict when reading that column failed.
    """
    driver.get(country_home_url)
    time.sleep(2)  # fixed pause after navigation before querying the DOM

    # Each label is bound to its own selector. The columns are read one after
    # the other: both reads go through the same driver, and reading them in a
    # fixed order keeps the label/column pairing deterministic (collecting
    # results in completion order could swap the two).
    # NOTE(review): assumes 'p_one' is the "exclusively" column and 'p_two'
    # the "may contain" column, as the selector order suggests - confirm.
    labelled_selectors = [
        (f'Exclusively or primarily applies to {country}', 'div.s_4.p_one.legCol'),
        (f'May contain legislation that applies to {country}', 'div.s_4.p_two.legCol'),
    ]

    Final_Results = {}
    for label, selector in labelled_selectors:
        try:
            Final_Results[label] = fetch_leg_types(driver, selector)
        except Exception as exc:
            print(f'Error fetching data for {selector}: {exc}')
            # Keep the key present so callers that index both entries still work.
            Final_Results[label] = {}
    return Final_Results

def filter_keys(data):
    """Keep, for every dict-valued entry, only the sub-keys starting with "Exclusively".

    Entries of ``data`` whose value is not a dict are left out of the result.
    """
    return {
        outer_key: {
            inner_key: inner_value
            for inner_key, inner_value in outer_value.items()
            if inner_key.startswith("Exclusively")
        }
        for outer_key, outer_value in data.items()
        if isinstance(outer_value, dict)
    }

def get_final_target_legislations(All_Legislations, Each_Countries_Uniques, Each_Countries_Overlaps):
    """Build the final scraping targets: per-country exclusive legislation plus leftovers.

    Args:
        All_Legislations: dict of country name -> dict of category label ->
            {legislation name: href}. It is not modified.
        Each_Countries_Uniques: one {legislation name: href} dict per country
            holding that country's "exclusive" legislation.
        Each_Countries_Overlaps: one {legislation name: href} dict per
            country, in the same country order, holding its "may contain"
            legislation.

    Returns:
        A new dict: ``filter_keys(All_Legislations)`` plus an ``'Extras'``
        entry whose ``'Legislations that are unique to none'`` value maps
        every overlap name that is not listed as unique by any *other*
        country to its href.
    """
    # Countries are matched by position, so any number of countries works
    # (a fixed four-name list would silently ignore extra entries).
    unique_names_per_country = [set(uniques) for uniques in Each_Countries_Uniques]

    overlaps_not_in_any_unique = set()
    for country_idx, overlaps in enumerate(Each_Countries_Overlaps):
        # Names that some other country lists as its own.
        claimed_elsewhere = set()
        for other_idx, names in enumerate(unique_names_per_country):
            if other_idx != country_idx:
                claimed_elsewhere |= names
        overlaps_not_in_any_unique.update(
            name for name in overlaps if name not in claimed_elsewhere
        )

    # Walk the overlaps in country order so that, for a name seen in several
    # countries, the href from the last country is the one kept.
    hrefs_dict_of_unique_overlaps = {}
    for overlaps in Each_Countries_Overlaps:
        for name, href in overlaps.items():
            if name in overlaps_not_in_any_unique:
                hrefs_dict_of_unique_overlaps[name] = href

    Final_Legislations = filter_keys(All_Legislations)
    Final_Legislations['Extras'] = {'Legislations that are unique to none': hrefs_dict_of_unique_overlaps}
    return Final_Legislations

# Scraper

## Part 1
> Get All Legislations
> - Legislations Unique to a Country
> - Legislations not Unique to any Country

In [3]:
# Collect, for every country, the two legislation columns of its browse page,
# then derive the final list of scraping targets.
All_Legislations = {}
Each_Countries_Uniques = []
Each_Countries_Overlaps = []

for Country in ['UK', 'Scotland', 'Wales', 'ni']:
    # NOTE(review): 'NothernIreland' looks like a typo of 'NorthernIreland';
    # it is left unchanged because it is used as a dictionary key and in the
    # output paths built from that key.
    Country_Name = 'NothernIreland' if Country == 'ni' else Country
    # URLs always use '/'; os.path.join would insert '\' on Windows.
    Country_URL = f'https://www.legislation.gov.uk/browse/{Country.lower()}'

    print(f'Getting Legislations HREFs for the country: {Country_Name}')

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        # get_legislations_href loads Country_URL itself, so the page is not
        # fetched a second time here.
        Country_Legislations = get_legislations_href(driver=driver,
                                                    country=f'{Country_Name}',
                                                    country_home_url=f'{Country_URL}')
    finally:
        # One browser is started per country; close it so they do not pile up.
        driver.quit()

    All_Legislations[Country_Name] = Country_Legislations

    # First entry: exclusive legislation; second entry: possibly shared legislation.
    Country_Columns = list(Country_Legislations.values())
    Each_Countries_Uniques.append(Country_Columns[0])
    Each_Countries_Overlaps.append(Country_Columns[1])

Final_Legislations = get_final_target_legislations(All_Legislations, Each_Countries_Uniques, Each_Countries_Overlaps)

Getting Legislations HREFs for the country: UK
Getting Legislations HREFs for the country: Scotland
Getting Legislations HREFs for the country: Wales
Getting Legislations HREFs for the country: NothernIreland


In [4]:
All_Legislations

{'UK': {'Exclusively or primarily applies to UK': {'UK Public General Acts': 'https://www.legislation.gov.uk/ukpga',
   'UK Local Acts': 'https://www.legislation.gov.uk/ukla',
   'UK Private and Personal Acts': 'https://www.legislation.gov.uk/ukppa',
   'UK Statutory Instruments': 'https://www.legislation.gov.uk/uksi',
   'UK Ministerial Directions': 'https://www.legislation.gov.uk/ukmd',
   'UK Ministerial Orders': 'https://www.legislation.gov.uk/ukmo',
   'UK Statutory Rules and Orders 1900-1948': 'https://www.legislation.gov.uk/uksro',
   'UK Draft Statutory Instruments': 'https://www.legislation.gov.uk/ukdsi'},
  'May contain legislation that applies to UK': {'Acts of the Scottish Parliament': 'https://www.legislation.gov.uk/asp',
   'Acts of the Northern Ireland Assembly': 'https://www.legislation.gov.uk/nia',
   'Acts of the Old Scottish Parliament 1424-1707': 'https://www.legislation.gov.uk/aosp',
   'Acts of the English Parliament 1267-1706': 'https://www.legislation.gov.uk/aep

In [5]:
Final_Legislations

{'UK': {'Exclusively or primarily applies to UK': {'UK Public General Acts': 'https://www.legislation.gov.uk/ukpga',
   'UK Local Acts': 'https://www.legislation.gov.uk/ukla',
   'UK Private and Personal Acts': 'https://www.legislation.gov.uk/ukppa',
   'UK Statutory Instruments': 'https://www.legislation.gov.uk/uksi',
   'UK Ministerial Directions': 'https://www.legislation.gov.uk/ukmd',
   'UK Ministerial Orders': 'https://www.legislation.gov.uk/ukmo',
   'UK Statutory Rules and Orders 1900-1948': 'https://www.legislation.gov.uk/uksro',
   'UK Draft Statutory Instruments': 'https://www.legislation.gov.uk/ukdsi'}},
 'Scotland': {'Exclusively or primarily applies to Scotland': {'Acts of the Scottish Parliament': 'https://www.legislation.gov.uk/asp',
   'Acts of the Old Scottish Parliament 1424-1707': 'https://www.legislation.gov.uk/aosp',
   'Scottish Statutory Instruments': 'https://www.legislation.gov.uk/ssi',
   'Scottish Draft Statutory Instruments': 'https://www.legislation.gov.uk

---

## Part 2
> Given all legislations and the target years, extract the titles, then the content of each title, and save each title's content to a .txt file.

## Now Scrape Content from the Legislations

In [6]:
def verify_url_existence(driver, url):  # the url is the year url
    """Load ``url`` and report whether the page contains a ``div.results`` element.

    Args:
        driver: Selenium WebDriver used to load the page. It is left on
            ``url`` afterwards, which callers rely on.
        url: Year-specific listing URL to check.

    Returns:
        True when the page loaded and ``div.results`` was found (the year
        has titles), False when loading or the lookup raised an ``Exception``.
    """
    try:
        driver.get(url)
        time.sleep(1)
        # Only the presence of the element matters; its value is not used.
        driver.find_element(By.CSS_SELECTOR, 'div.results')
    except Exception:
        # ``Exception`` instead of a bare ``except`` so Ctrl-C still works.
        return False
    return True
    
def extract_content_from_pdf(pdf_url):
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_url: URL of the PDF document.

    Returns:
        str with the text of every page, in page order.

    Raises:
        requests.HTTPError: when the server answers with an error status.
        Other exceptions from ``requests`` or ``fitz`` are not caught here.

    NOTE(review): ``requests`` and ``fitz`` (PyMuPDF) are not imported in the
    import cell visible at the top of this notebook - confirm they are
    imported before this function is called.
    """
    # Download once, with a timeout so a stalled server cannot hang the run.
    response = requests.get(pdf_url, timeout=60)
    response.raise_for_status()

    # Open the PDF straight from memory: no temporary file is created, so
    # nothing is left behind on disk.
    pdf_document = fitz.open(stream=response.content, filetype='pdf')
    try:
        return ''.join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        pdf_document.close()

def extract_content(driver, title_url):
    """Return the text content of one legislation title.

    The second item of the ``legSubNav`` navigation bar is the content tab.
    When it holds a link, the content is browsable text: that link is
    followed and every provision page is read by clicking "next" until no
    further page is available. When it holds no link, the content is treated
    as a PDF and handed to ``extract_content_from_pdf``.
    Note: this is the same as in Scrapper.ipynb.

    Args:
        driver: Selenium WebDriver used for navigation.
        title_url: URL of the title's page.

    Returns:
        str with the text of all provision pages (or of the PDF).
    """
    driver.get(title_url)
    time.sleep(1)
    Title_Content_Div = driver.find_element(By.CSS_SELECTOR, 'div.legToc')
    NavBar = Title_Content_Div.find_element(By.ID, 'legSubNav')
    NavBarLists = NavBar.find_elements(By.TAG_NAME, 'li')
    # The content tab may or may not be clickable. If not, the media type is
    # PDF, not text. find_elements returns an empty list in that case.
    Content_Links = NavBarLists[1].find_elements(By.TAG_NAME, 'a')

    if not Content_Links:
        Tag_PDF_href = driver.find_element(By.CSS_SELECTOR, 'div.LegSnippet').find_element(By.TAG_NAME, 'a').get_attribute('href')
        return extract_content_from_pdf(Tag_PDF_href)

    driver.get(Content_Links[0].get_attribute('href'))
    time.sleep(1)

    # The content may span several pages; collect each page's text.
    Page_Number = 1
    Provision_Pages = []
    while True:
        Content_Box = driver.find_element(By.ID, 'content')
        Content_Text = Content_Box.find_element(By.ID, 'viewLegContents').find_element(By.CLASS_NAME, 'LegSnippet')
        Provision_Pages.append(Content_Text.text)
        print(f'Page Number: {Page_Number}')

        # Look for the "next" link. The panel lookup is inside the try so a
        # page without any prev/next panel counts as the last page instead
        # of discarding the text collected so far.
        try:
            Button_Panel = driver.find_element(By.CLASS_NAME, 'prevNextNav')
            Next_Button = Button_Panel.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')[-1].find_element(By.TAG_NAME, 'a')
            print(f'Next Button found: {Next_Button.text}')
        except Exception:
            print(f'No Next Button Found - Last Provision Page')
            print(f'Provision Page Number: {Page_Number}')
            break

        try:
            Next_Button.click()
        except Exception:
            print(f'You are probably on the very last Provision page')
            print(f'Provision Page Number: {Page_Number}')
            break
        time.sleep(1)
        Page_Number += 1

    return ''.join(Provision_Pages)

In [7]:
def _find_next_results_link(driver):
    """Return the pager's "Next" link on the current results page, or None."""
    try:
        footer = driver.find_element(By.CSS_SELECTOR, 'div.contentFooter')
        interface = footer.find_element(By.CLASS_NAME, 'interface')
        pager = interface.find_element(By.CSS_SELECTOR, 'div.prevPagesNextNav')
        items = pager.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')
        last_link = items[-1].find_element(By.TAG_NAME, 'a')
        return last_link if 'Next' in last_link.text else None
    except Exception:
        # No footer, no pager, empty pager or no link in its last item:
        # all of these mean there is no further page.
        return None


def get_titles_names_hrefs(driver, country, legislation_name, legislation_url, target_year):
    """Collect the titles of one legislation type for one year.

    Args:
        driver: Selenium WebDriver used for navigation.
        country: Not used by this function; kept for the callers' signature.
        legislation_name: Name used only in the "no titles" message.
        legislation_url: Base URL of the legislation type.
        target_year: Year appended to ``legislation_url``.

    Returns:
        ``{target_year: {title name: title href}}`` when the year has a
        results page, otherwise an empty dict.
    """
    All_Titles = {}

    driver.get(legislation_url)
    time.sleep(1)

    year = target_year
    Target_Year_Legislation_URL = f'{legislation_url}/{year}'
    # verify_url_existence leaves the driver on the year's results page.
    if not verify_url_existence(driver, Target_Year_Legislation_URL):
        print(f'For the legislation: {legislation_name} does not have any titles for the year: {year}')
        return All_Titles

    Titles = {}
    while True:
        content_div = driver.find_element(By.CSS_SELECTOR, 'div.results')
        table = content_div.find_element(By.TAG_NAME, 'table')
        tbody = table.find_element(By.TAG_NAME, 'tbody')
        for tr in tbody.find_elements(By.TAG_NAME, 'tr'):  # one row per title
            first_td = tr.find_element(By.TAG_NAME, 'td')
            Titles[first_td.text] = first_td.find_element(By.TAG_NAME, 'a').get_attribute('href')

        # Stop as soon as there is no "Next" link. This also covers a last
        # pager link that exists but is not "Next", which would otherwise
        # re-read the same page forever.
        Next_Link = _find_next_results_link(driver)
        if Next_Link is None:
            break
        Next_Link.click()
        time.sleep(2)

    All_Titles[year] = Titles
    return All_Titles

def create_dirs(path):
    """Create directory ``path`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` makes this a single call, so there is no gap between
    checking for the directory and creating it.
    """
    os.makedirs(path, exist_ok=True)

In [8]:
def check_if_ttle_data_exists_locally(title_name):
    """Tell whether the path ``title_name`` already exists on disk."""
    already_saved = os.path.exists(title_name)
    return already_saved

In [9]:
# Scrape every title of every legislation type for the target years and save
# each title's content to ./Scraped_Content/<country>/<category>/<legislation>/<year>/.
# NOTE(review): the driver is left open after the loop, as before; call
# driver.quit() once no later cell needs it.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
for Country_Key, Country_Value_Dict in All_Legislations.items():  # or Final_Legislations
    for data_key, data_value in Country_Value_Dict.items():
        for legislation_name, legislation_href in data_value.items():
            for year in ['2024']:
                print(f'Country: {Country_Key} - Data Key: {data_key} - Legislation:  {legislation_name} - Year: {year}')
                # Titles found for this legislation and year (empty dict when the year has none).
                All_Titles = get_titles_names_hrefs(driver=driver, country=Country_Key, legislation_name=legislation_name, legislation_url=legislation_href, target_year=year)

                if year not in All_Titles:
                    continue

                year_dir = f'./Scraped_Content/{Country_Key}/{data_key}/{legislation_name}/{year}'
                for title_name, title_href in All_Titles[year].items():
                    print(f'\nTitle Name: {title_name} - Title URL: {title_href}')

                    # A path separator inside a title would point into a
                    # sub-directory that is never created; titles without
                    # one keep exactly the same file name as before.
                    safe_title_name = title_name.replace('/', '_').replace('\\', '_')
                    title_path = f'{year_dir}/{safe_title_name}.txt'
                    if check_if_ttle_data_exists_locally(title_path):
                        print(f'Title {title_name} already exists locally')
                        continue

                    try:
                        title_content = extract_content(driver=driver, title_url=title_href)
                    except Exception as exc:
                        # Still best-effort (skip this title), but report the
                        # reason instead of hiding it.
                        print(f'Could not extract content for title {title_name}: {exc!r}')
                        continue

                    create_dirs(path=year_dir)
                    # Explicit encoding so the output does not depend on the platform default.
                    with open(title_path, 'w', encoding='utf-8') as f:
                        f.write(title_content)
                print('-'*10, '\n')

Country: UK - Data Key: Exclusively or primarily applies to UK - Legislation:  UK Public General Acts - Year: 2024

Title Name: Leasehold and Freehold Reform Act 2024 - Title URL: https://www.legislation.gov.uk/ukpga/2024/22/contents/enacted
Title Leasehold and Freehold Reform Act 2024 already exists locally

Title Name: Victims and Prisoners Act 2024 - Title URL: https://www.legislation.gov.uk/ukpga/2024/21/contents
Title Victims and Prisoners Act 2024 already exists locally

Title Name: Zoological Society of London (Leases) Act 2024 - Title URL: https://www.legislation.gov.uk/ukpga/2024/20/contents
Title Zoological Society of London (Leases) Act 2024 already exists locally

Title Name: British Nationality (Irish Citizens) Act 2024 - Title URL: https://www.legislation.gov.uk/ukpga/2024/19/contents
Title British Nationality (Irish Citizens) Act 2024 already exists locally

Title Name: Building Societies Act 1986 (Amendment) Act 2024 - Title URL: https://www.legislation.gov.uk/ukpga/202