# Import All the Required Libraries and Packages

In [65]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By

# Functions

In [66]:
def verify_target_year_existence(driver, url, legislation_name, year):
    """Report whether the per-year page of a legislation type can be loaded.

    Args:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver in this notebook).
        url: Base URL of the legislation type; ``/{year}`` is appended to it.
        legislation_name: Human-readable name, used only in the failure message.
        year: Year segment appended to ``url``.

    Returns:
        True if ``driver.get`` completed without raising, False otherwise.
    """
    target_url_year = f'{url}/{year}'
    try:
        driver.get(target_url_year)
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt / SystemExit must
        # still be able to stop a long-running scrape.
        print(f'Exception: The legislation:{legislation_name} does not have any titles for the year: {year}')
        return False
    return True
    
def fetch_leg_types(driver, div_selector):
    """Collect the legislation-type links listed inside one column of the current page.

    Args:
        driver: Selenium WebDriver already showing the page to inspect.
        div_selector: CSS selector of the column ``div`` to read.

    Returns:
        dict mapping each list item's visible text to the ``href`` of its first
        anchor. Empty when the column contains no ``ul.legTypes`` list.

    Raises:
        Whatever ``driver.find_element`` raises when ``div_selector`` matches nothing.
    """
    div_element = driver.find_element(By.CSS_SELECTOR, div_selector)
    leg_types_elements = div_element.find_elements(By.CSS_SELECTOR, 'ul.legTypes')
    if not leg_types_elements:
        # Always return a dict: callers use .keys()/.items() on the result, which
        # the former ``([], [])`` tuple did not support.
        return {}

    leg_types = {}
    # Only the first matching list is read, as before.
    for item in leg_types_elements[0].find_elements(By.TAG_NAME, 'li'):
        # find_elements yields an empty list for a missing anchor (find_element
        # raises instead), so items without a link are skipped and every name
        # stays paired with its own href.
        anchors = item.find_elements(By.TAG_NAME, 'a')
        if anchors:
            leg_types[item.text] = anchors[0].get_attribute('href')
    return leg_types
    
def get_legislations_href(driver, country, country_home_url):
    """Load a country's browse page and read both of its legislation-type columns.

    Args:
        driver: Selenium WebDriver used to load and inspect the page.
        country: Country label interpolated into the keys of the returned dict.
        country_home_url: URL of the country's browse page.

    Returns:
        dict with two entries, in this order:
        ``'Exclusively or primarily applies to {country}'`` (column ``p_one``) and
        ``'May contain legislation that applies to {country}'`` (column ``p_two``),
        each holding whatever ``fetch_leg_types`` returned for that column, or an
        empty dict if reading the column raised.
    """
    driver.get(country_home_url)
    time.sleep(2)  # fixed pause after navigation, presumably to let the page render

    # Each label is tied to its selector explicitly. The columns are read one
    # after the other: a single WebDriver session must not be driven from several
    # threads, and labelling results by completion order could swap the two
    # columns.
    labelled_selectors = (
        (f'Exclusively or primarily applies to {country}', 'div.s_4.p_one.legCol'),
        (f'May contain legislation that applies to {country}', 'div.s_4.p_two.legCol'),
    )

    Final_Results = {}
    for label, selector in labelled_selectors:
        try:
            Final_Results[label] = fetch_leg_types(driver, selector)
        except Exception as exc:
            print(f'Error fetching data for {selector}: {exc}')
            # Keep both keys present so callers can index either section.
            Final_Results[label] = {}
    return Final_Results

def filter_keys(data):
    """Keep, for every dict-valued entry of ``data``, only the "Exclusively..." sections.

    Args:
        data: Mapping whose values may be dicts keyed by section label.

    Returns:
        A new dict with the same keys as the dict-valued entries of ``data``;
        each value is a new dict holding only the items whose key starts with
        ``"Exclusively"``. Entries whose value is not a dict are left out.
    """
    def _exclusive_only(sections):
        # Copy across just the sections labelled as exclusive.
        kept = {}
        for label in sections:
            if label.startswith("Exclusively"):
                kept[label] = sections[label]
        return kept

    return {
        name: _exclusive_only(sections)
        for name, sections in data.items()
        if isinstance(sections, dict)
    }

def get_final_target_legislations(All_Legislations, Each_Countries_Uniques, Each_Countries_Overlaps):
    """Keep each country's exclusive legislation and add the overlaps no other country owns.

    Args:
        All_Legislations: dict of country name -> dict of section label ->
            ``{legislation name: href}``; passed through ``filter_keys``.
        Each_Countries_Uniques: one ``{legislation name: href}`` dict per country
            holding that country's exclusive legislation types.
        Each_Countries_Overlaps: one ``{legislation name: href}`` dict per country,
            in the same country order as ``Each_Countries_Uniques``.

    Returns:
        A new dict: ``filter_keys(All_Legislations)`` plus an ``'Extras'`` entry
        whose ``'Legislations that are unique to none'`` dict maps to its href
        every overlap name that, for at least one country, is not an exclusive
        type of any *other* country.
    """
    # Countries are matched by position, so any number of countries works
    # (the previous hard-coded four-name list silently truncated the inputs).
    unique_names_per_country = [set(uniques) for uniques in Each_Countries_Uniques]

    overlaps_not_in_any_unique = set()
    for position, overlaps in enumerate(Each_Countries_Overlaps):
        # Names claimed as exclusive by every country except the current one.
        claimed_elsewhere = set().union(
            *(names for other, names in enumerate(unique_names_per_country) if other != position)
        )
        overlaps_not_in_any_unique.update(
            name for name in overlaps if name not in claimed_elsewhere
        )

    # Walk the overlaps in their original order so the resulting dict order is
    # stable; a later country's href overrides an earlier one for the same name.
    hrefs_dict_of_unique_overlaps = {}
    for overlaps in Each_Countries_Overlaps:
        for name, href in overlaps.items():
            if name in overlaps_not_in_any_unique:
                hrefs_dict_of_unique_overlaps[name] = href

    final_legislations = filter_keys(All_Legislations)
    final_legislations['Extras'] = {'Legislations that are unique to none': hrefs_dict_of_unique_overlaps}
    return final_legislations

# Scraper

## Part 1
> Get All Legislations
> - Legislations Unique to a Country
> - Legislations not Unique to any Country

In [67]:
# Part 1 driver cell: collect, for every country, the legislation types that are
# exclusive to it and those it shares, then merge them into Final_Legislations.
All_Legislations = {}
Each_Countries_Uniques = []
Each_Countries_Overlaps = []

for Country in ['UK', 'Scotland', 'Wales', 'ni']:
    # NOTE(review): 'NothernIreland' looks like a typo for 'NorthernIreland'; left
    # unchanged because the value is used as a dictionary key by later cells.
    Country_Name = 'NothernIreland' if Country == 'ni' else Country
    # Build the URL with '/' explicitly: os.path.join uses the OS path separator,
    # which is a backslash on Windows.
    Country_URL = f'https://www.legislation.gov.uk/browse/{Country.lower()}'

    print(f'Getting Legislations HREFs for the country: {Country_Name}')

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        # get_legislations_href navigates to the page itself, so no separate
        # driver.get()/sleep is needed here.
        Country_Legislations = get_legislations_href(driver=driver,
                                                    country=Country_Name,
                                                    country_home_url=Country_URL)
    finally:
        # One browser is started per country; close it so instances do not pile up.
        driver.quit()

    All_Legislations[Country_Name] = Country_Legislations

    # First section = exclusive types, second section = shared types.
    sections = list(Country_Legislations.values())
    Each_Countries_Uniques.append(sections[0])
    Each_Countries_Overlaps.append(sections[1])

Final_Legislations = get_final_target_legislations(All_Legislations, Each_Countries_Uniques, Each_Countries_Overlaps)

Getting Legislations HREFs for the country: UK
Getting Legislations HREFs for the country: Scotland
Getting Legislations HREFs for the country: Wales
Getting Legislations HREFs for the country: NothernIreland


In [68]:
Final_Legislations

{'UK': {'Exclusively or primarily applies to UK': {'UK Public General Acts': 'https://www.legislation.gov.uk/ukpga',
   'UK Local Acts': 'https://www.legislation.gov.uk/ukla',
   'UK Private and Personal Acts': 'https://www.legislation.gov.uk/ukppa',
   'UK Statutory Instruments': 'https://www.legislation.gov.uk/uksi',
   'UK Ministerial Directions': 'https://www.legislation.gov.uk/ukmd',
   'UK Ministerial Orders': 'https://www.legislation.gov.uk/ukmo',
   'UK Statutory Rules and Orders 1900-1948': 'https://www.legislation.gov.uk/uksro',
   'UK Draft Statutory Instruments': 'https://www.legislation.gov.uk/ukdsi'}},
 'Scotland': {'Exclusively or primarily applies to Scotland': {'Acts of the Scottish Parliament': 'https://www.legislation.gov.uk/asp',
   'Acts of the Old Scottish Parliament 1424-1707': 'https://www.legislation.gov.uk/aosp',
   'Scottish Statutory Instruments': 'https://www.legislation.gov.uk/ssi',
   'Scottish Draft Statutory Instruments': 'https://www.legislation.gov.uk

---

## Part 2
> Given all legislations and the target years, extract every title and its content, then save each title's content to its own .txt file.

## Now Scrape Content from the Legislations

In [69]:
def verify_url_existence(driver, url):
    """Load ``url`` and report whether the page contains a ``div.results`` element.

    Args:
        driver: Selenium WebDriver used to load the page. It is left on ``url``.
        url: Address of the listing page to check.

    Returns:
        True when the page loaded and ``div.results`` was found, False when either
        step raised.
    """
    try:
        driver.get(url)
        time.sleep(1)
        # Presence of the results container is the signal that titles are listed;
        # the lookup raises when it is absent.
        driver.find_element(By.CSS_SELECTOR, 'div.results')
    except Exception:
        # Not a bare ``except:`` so Ctrl-C / SystemExit still stop the scrape.
        return False
    return True
    
def extract_content(driver, title_url):
    """Return the text of one title, gathered across every page of its content view.

    The function opens ``title_url``, follows the link held by the second entry of
    the ``legSubNav`` navigation bar, then reads the ``LegSnippet`` text of each
    page, clicking the last link of the ``prevNextNav`` panel until no such link
    is available.

    Args:
        driver: Selenium WebDriver used for navigation.
        title_url: URL of the title's landing page.

    Returns:
        The text of all visited pages, joined with a newline between pages.

    Raises:
        Selenium lookup errors when the landing page or a content page lacks the
        expected elements; IndexError when the navigation bar has fewer than two
        entries.
    """
    driver.get(title_url)
    time.sleep(1)
    Title_Content_Div = driver.find_element(By.CSS_SELECTOR, 'div.legToc')
    NavBar = Title_Content_Div.find_element(By.ID, 'legSubNav')
    NavBarLists = NavBar.find_elements(By.TAG_NAME, 'li')
    # The second tab holds the link to the content view.
    Content_Link = NavBarLists[1].find_element(By.TAG_NAME, 'a').get_attribute('href')

    driver.get(Content_Link)
    time.sleep(1)

    # The content view may span several pages; collect each page's text.
    Page_Number = 1
    Pages_Text = []
    while True:
        Content_Box = driver.find_element(By.ID, 'content')
        Content_Text = Content_Box.find_element(By.ID, 'viewLegContents').find_element(By.CLASS_NAME, 'LegSnippet')
        Pages_Text.append(Content_Text.text)
        print(f'Page Number: {Page_Number}')

        # Look for the link to the following page. The panel lookup is inside the
        # try so that a page without a navigation panel ends the loop cleanly
        # instead of raising and discarding the text collected so far.
        try:
            Button_Panel = driver.find_element(By.CLASS_NAME, 'prevNextNav')
            Next_Button = Button_Panel.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')[-1].find_element(By.TAG_NAME, 'a')
            print(f'Next Button found: {Next_Button.text}')
        except Exception:
            print('No Next Button Found - Last Provision Page')
            print(f'Provision Page Number: {Page_Number}')
            break

        try:
            Next_Button.click()
        except Exception:
            print('You are probably on the very last Provision page')
            print(f'Provision Page Number: {Page_Number}')
            break
        time.sleep(1)
        Page_Number += 1

    # Join with a newline so the last line of one page is not fused with the first
    # line of the next (element text normally carries no trailing newline).
    return '\n'.join(Pages_Text)

In [70]:
def _find_next_results_link(driver):
    """Return the 'Next' link of the results pagination footer, or None if there is none."""
    try:
        footer = driver.find_element(By.CSS_SELECTOR, 'div.contentFooter')
        interface = footer.find_element(By.CLASS_NAME, 'interface')
        page_nav = interface.find_element(By.CSS_SELECTOR, 'div.prevPagesNextNav')
        entries = page_nav.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')
        last_link = entries[-1].find_element(By.TAG_NAME, 'a')
    except Exception:
        # No footer, no entries, or the last entry is not a link: no further page.
        return None
    # The last entry can be a link that is not "Next"; treat that as the final
    # page too.
    return last_link if 'Next' in last_link.text else None


def get_titles_names_hrefs(driver, country, legislation_name, legislation_url, target_year):
    """Collect the titles listed for one legislation type in one year.

    Args:
        driver: Selenium WebDriver used for navigation.
        country: Unused; kept so existing keyword calls keep working.
        legislation_name: Human-readable name, used only in the "no titles" message.
        legislation_url: Base URL of the legislation type; ``/{target_year}`` is
            appended to reach the yearly listing.
        target_year: Year whose titles are wanted; also the key of the result.

    Returns:
        ``{target_year: {title text: title href}}``. The inner dict is empty when
        the yearly listing has no results, so callers can always index by year.
    """
    All_Titles = {}

    driver.get(legislation_url)
    time.sleep(1)

    year = target_year
    Target_Year_Legislation_URL = f'{legislation_url}/{year}'
    # verify_url_existence leaves the driver on the yearly listing page.
    if not verify_url_existence(driver, Target_Year_Legislation_URL):
        print(f'For the legislation: {legislation_name} does not have any titles for the year: {year}')
        All_Titles[year] = {}
        return All_Titles

    titles = {}
    while True:
        content_div = driver.find_element(By.CSS_SELECTOR, 'div.results')
        tbody = content_div.find_element(By.TAG_NAME, 'table').find_element(By.TAG_NAME, 'tbody')
        for tr in tbody.find_elements(By.TAG_NAME, 'tr'):
            # The first cell of each row carries the title text and its link.
            first_td = tr.find_element(By.TAG_NAME, 'td')
            titles[first_td.text] = first_td.find_element(By.TAG_NAME, 'a').get_attribute('href')

        next_link = _find_next_results_link(driver)
        if next_link is None:
            # Always leave the loop on the last page; previously a trailing link
            # without "Next" in its text re-read the same page forever.
            break
        next_link.click()
        time.sleep(2)

    All_Titles[year] = titles
    return All_Titles

def create_dirs(path):
    """Create the directory ``path`` and any missing parents.

    Calling it again for an existing directory is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)

In [71]:
# Part 2 driver cell: for each legislation type and target year, list the titles,
# fetch each title's content and write it to
# ./Scraped_Content/<country>/<section>/<legislation>/<year>/<title>.txt
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    for Country_Key, Country_Value_Dict in Final_Legislations.items():
        for data_key, data_value in Country_Value_Dict.items():  # one section per country
            for legislation_name, legislation_href in data_value.items():
                for year in ['2024']:
                    # Titles of this legislation type for the current target year.
                    All_Titles = get_titles_names_hrefs(driver=driver, country=Country_Key, legislation_name=legislation_name, legislation_url=legislation_href, target_year=year)
                    # A year without titles may be absent from the result.
                    Titles_For_Year = All_Titles.get(year, {})
                    print(len(Titles_For_Year))
                    print(Titles_For_Year)
                    print()

                    Output_Dir = f'./Scraped_Content/{Country_Key}/{data_key}/{legislation_name}/{year}'
                    for title_name, title_href in Titles_For_Year.items():
                        print(f'{title_name} - {title_href}')
                        title_content = extract_content(driver=driver, title_url=title_href)

                        create_dirs(path=Output_Dir)
                        # Titles are free text: replace characters that are not
                        # valid in file names so they cannot break the path.
                        safe_title = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in title_name)
                        # Explicit UTF-8: the platform default encoding may not be
                        # able to represent every character of the scraped text.
                        with open(f'{Output_Dir}/{safe_title}.txt', 'w', encoding='utf-8') as f:
                            f.write(title_content)
                        # NOTE(review): this and the following breaks stop after the
                        # first title of the first legislation type; this looks like a
                        # trial-run limiter. Remove them to scrape everything.
                        break
                    break
                break
            break
        break
finally:
    # Close the browser even when a scrape step raises.
    driver.quit()

22
{'Leasehold and Freehold Reform Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/22/contents/enacted', 'Victims and Prisoners Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/21/contents', 'Zoological Society of London (Leases) Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/20/contents', 'British Nationality (Irish Citizens) Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/19/contents', 'Building Societies Act 1986 (Amendment) Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/18/contents', 'Paternity Leave (Bereavement) Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/17/contents', 'Pet Abduction Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/16/contents', 'Media Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/15/contents', 'Post Office (Horizon System) Offences Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/14/contents', 'Digital Markets, Competition and Consumers Act 2024': 'https://www.legislation.gov.uk/ukpga/2024/13/contents', 

In [72]:
# # Initialize the Chrome driver
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# # Open the URL
# url = 'https://www.legislation.gov.uk/browse/uk'
# driver.get(url)

# # Wait for the page to load completely
# driver.implicitly_wait(10)

# #gives us the two divs 
# # Locate the div with the specified class using a more precise CSS selector
# div_element = driver.find_element(By.CSS_SELECTOR, 'div.s_4.p_one.legCol') 

# # Locate the nested ul with the class 'legTypes' using a more general CSS selector within the entire document
# leg_types_elements = div_element.find_elements(By.CSS_SELECTOR, 'ul.legTypes')

# # Assuming there's only one such element
# if leg_types_elements:
#     leg_types_element = leg_types_elements[0]

#     # Locate the list items within the 'legTypes' ul
#     list_items = leg_types_element.find_elements(By.TAG_NAME, 'li')

#     # Get the names (text content) of the list items
#     list_names = [item.text for item in list_items]

#     # Get the href attributes of the list items
#     list_hrefs = [item.find_element(By.TAG_NAME, 'a').get_attribute('href') for item in list_items if item.find_element(By.TAG_NAME, 'a')]

#     # Print the list names
#     print("List Names:", list_names)

#     # Print the list hrefs
#     print("List Hrefs:", list_hrefs)
# else:
#     print("No legTypes element found within the specified div.")

In [73]:
# for legislation_name, legislation_href in zip(list_names, list_hrefs):
#     print(f'{legislation_name}: {legislation_href}')
    
#     driver.get(legislation_href)
    
#     Target_Years = ['2024']
#     for idxYear,  year in enumerate(Target_Years):
#         try:
#             print(f'Year: {year}')

#             year_href = f'{legislation_href}/{year}'
#             driver.get(year_href)
            
#             Page_Number = 1
#             '''Retrieve Titles for page 1'''
#             all_title_names = []
#             all_title_hrefs = []
#             content_div = driver.find_element(By.CSS_SELECTOR, 'div.results')
#             table = content_div.find_element(By.TAG_NAME, 'table')
#             tbody = table.find_element(By.TAG_NAME, 'tbody')
#             tr_elements = tbody.find_elements(By.TAG_NAME, 'tr')
#             for tr in tr_elements: #Iterate over the table rows / titles
#                 first_td = tr.find_element(By.TAG_NAME, 'td')
#                 name = first_td.text
#                 href = first_td.find_element(By.TAG_NAME, 'a').get_attribute('href')

#             print(f'For the year: {year} we have: {len(all_title_names)} titles')
#         except:
#             print(f'The legislation:{legislation_name} does not have any titles for the year: {year}')
        
# driver.quit()

In [74]:


# # Initialize the Chrome driver
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# # Base URL and target years
# base_url = 'https://www.legislation.gov.uk/ukpga'
# target_years = ['2024']

# # List to store hrefs
# all_hrefs = []

# for year in target_years:
#     # Construct the URL for the specific year
#     url = f'{base_url}/{year}'#gives us the two divs
#     print(f'Processing year {url}')
#     driver.get(url)

#     # Wait for the page to load completely
#     driver.implicitly_wait(10)

#     num = 1
#     while True:
#         try:
#             # Locate the div with the class 'content'
#             content_div = driver.find_element(By.CSS_SELECTOR, 'div.results')

#             # Locate the table within the content div
#             table = content_div.find_element(By.TAG_NAME, 'table')

#             # Locate the tbody within the table
#             tbody = table.find_element(By.TAG_NAME, 'tbody')

#             # Locate all tr elements within the tbody
#             tr_elements = tbody.find_elements(By.TAG_NAME, 'tr')

#             for tr in tr_elements:
#                 try:
#                     # Get the first td element
#                     first_td = tr.find_element(By.TAG_NAME, 'td')

#                     # Find the 'a' tag within the first td and get its href
#                     href = first_td.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    
#                     # Append the href to the list
#                     all_hrefs.append(href)
#                 except Exception as e:
#                     print(f"Error processing a row: {e}")

#             # Locate the pagination footer
#             footer = driver.find_element(By.CSS_SELECTOR, 'div.contentFooter')
#             ContentFooter = footer.find_element(By.CLASS_NAME, 'interface')
#             ContentFooterInterface = ContentFooter.find_element(By.CSS_SELECTOR, 'div.prevPagesNextNav')
#             List = ContentFooterInterface.find_element(By.TAG_NAME, 'ul')
#             Lists = List.find_elements(By.TAG_NAME, 'li')
#             NextButton = Lists[-1].find_element(By.TAG_NAME, 'a')
            
#             if NextButton and 'Next' in NextButton.text:
#                 num += 1
#                 print(f'Page: {num}')
#                 NextButton.click()
#                 time.sleep(2)
#             else:
#                 break

#         except Exception as e:
#             print(f"Error processing year {year}: {e}")
#             break

# # Print all the collected hrefs
# print(len(all_hrefs))
# print(all_hrefs)

# # Close the browser
# driver.quit()

In [75]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager

# # Initialize the Chrome driver
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# # Open the URL
# url = 'https://www.legislation.gov.uk/browse/uk'
# driver.get(url)

# # Locate the div with the specified class using a more precise CSS selector
# div_element = driver.find_element(By.CSS_SELECTOR, 'div.s_4.p_two.legCol')

# # Locate the nested ul with the class 'legTypes' using a more general CSS selector within the entire document
# leg_types_elements = div_element.find_elements(By.CSS_SELECTOR, 'ul.legTypes')

# # Assuming there's only one such element
# if leg_types_elements:
#     leg_types_element = leg_types_elements[0]

#     # Locate the list items within the 'legTypes' ul
#     list_items = leg_types_element.find_elements(By.TAG_NAME, 'li')

#     # Get the names (text content) of the list items
#     list_names = [item.text for item in list_items]

#     # Print the list names
#     print(list_names)
# else:
#     print("No legTypes element found within the specified div.")

# # Close the browser
# driver.quit()
