# Import All Required Libraries and Packages

In [None]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os
import requests
import fitz
import tempfile

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# Current Year

In [None]:
import time
# Calendar year taken from the current UTC time (time.gmtime()); it is used
# further down as the last directory component of the output path.
current_year = time.gmtime().tm_year
current_year  # bare expression; presumably here so the notebook cell echoes the value

# Setting Important HREF Links for the Daily Updates Tabs

In [None]:
# Maps a region name to the URL of its "new legislation" page. The region name
# is also used as a directory component when the scraped content is saved.
# The four-region map below is kept commented out; only the Northern Ireland
# entry is currently active.
# URL_DAILY_UPDATE = {
#     'UK' : 'https://www.legislation.gov.uk/new/uk',
#     'Wales' : 'https://www.legislation.gov.uk/new/wales',
#     'Scotland' : 'https://www.legislation.gov.uk/new/scotland',
#     'Northern Ireland' : 'https://www.legislation.gov.uk/new/ni',
# }

URL_DAILY_UPDATE = {
    'Northern Ireland' : 'https://www.legislation.gov.uk/new/ni',
}

# Defining Some Functions Which Will Be Used for the Daily Updates

In [None]:
def verify_daily_update(driver):
    '''Report whether the page currently loaded in ``driver`` lists a new update.

    Reads the text of the first ``h5`` inside the element with class
    ``p_content``. Returns False only when that text is exactly
    "Nothing published on this date"; any other heading text yields True.
    '''
    content_panel = driver.find_element(By.CLASS_NAME, 'p_content')
    heading_text = content_panel.find_element(By.TAG_NAME, 'h5').text
    return heading_text != 'Nothing published on this date'

def extract_content_from_pdf(pdf_url):
    '''Download the PDF at ``pdf_url`` and return the text of all its pages.

    The document is fetched once and parsed from memory, so no temporary file
    is left behind on disk.

    Args:
        pdf_url: Absolute URL of the PDF document.

    Returns:
        The concatenated text of every page, in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    '''
    # A timeout keeps a stalled server from hanging the whole scrape.
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on 4xx/5xx instead of feeding an HTML error page to the parser.
    response.raise_for_status()

    # Parse straight from the downloaded bytes; the context manager closes the
    # document even if text extraction raises.
    with fitz.open(stream=response.content, filetype='pdf') as pdf_document:
        return ''.join(page.get_text() for page in pdf_document)

def extract_content(driver, title_url):
    '''Return the full text of the legislation title found at ``title_url``.

    Note: this is the same as in Scrapper.ipynb.

    The second entry of the ``legSubNav`` navigation bar is the content tab.
    If it contains a link, that link is followed and the text of every
    provision page is collected by clicking through the "next" button until
    it is missing or cannot be clicked. If it contains no link, the title is
    treated as PDF-only and the text is extracted from the linked PDF.

    Args:
        driver: Selenium WebDriver used for navigation.
        title_url: URL of the title's landing page.

    Returns:
        The text of all provision pages concatenated in order, or the text
        extracted from the PDF.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If an expected
            page element (navigation bar, content box, ...) is missing.
        IndexError: If the navigation bar has fewer than two entries.
    '''
    # Imported locally so this function does not depend on edits to the
    # file-level import list.
    from selenium.common.exceptions import WebDriverException

    driver.get(title_url)
    time.sleep(1)  # fixed pause to give the page time to render
    toc_div = driver.find_element(By.CSS_SELECTOR, 'div.legToc')
    sub_nav = toc_div.find_element(By.ID, 'legSubNav')
    nav_items = sub_nav.find_elements(By.TAG_NAME, 'li')
    # It may be clickable or not. If not, the media type is PDF, not text.
    content_tab = nav_items[1]
    try:
        content_link = content_tab.find_element(By.TAG_NAME, 'a')
    except NoSuchElementException:
        content_link = None

    if content_link is None:
        # PDF-only title: the snippet holds a link to the PDF document.
        pdf_href = (
            driver.find_element(By.CSS_SELECTOR, 'div.LegSnippet')
            .find_element(By.TAG_NAME, 'a')
            .get_attribute('href')
        )
        return extract_content_from_pdf(pdf_href)

    driver.get(content_link.get_attribute('href'))
    time.sleep(1)

    # The content may span multiple pages; collect each page's text and join
    # once at the end.
    page_number = 1
    page_texts = []
    while True:
        content_box = driver.find_element(By.ID, 'content')
        snippet = (
            content_box.find_element(By.ID, 'viewLegContents')
            .find_element(By.CLASS_NAME, 'LegSnippet')
        )
        page_texts.append(snippet.text)
        print(f'Page Number: {page_number}')

        # Look for the "next" button: the link inside the last list item of
        # the prev/next navigation panel.
        button_panel = driver.find_element(By.CLASS_NAME, 'prevNextNav')
        try:
            next_button = (
                button_panel.find_element(By.TAG_NAME, 'ul')
                .find_elements(By.TAG_NAME, 'li')[-1]
                .find_element(By.TAG_NAME, 'a')
            )
            print(f'Next Button found: {next_button.text}')
        except (WebDriverException, IndexError):
            print('No Next Button Found - Last Provision Page')
            print(f'Provision Page Number: {page_number}')
            break

        try:
            next_button.click()
        except WebDriverException:
            print('You are probably on the very last Provision page')
            print(f'Provision Page Number: {page_number}')
            break
        time.sleep(1)
        page_number += 1

    return ''.join(page_texts)

def get_daily_update(driver, url):
    '''Collect the newly published titles listed on the daily-update page at ``url``.

    Returns a dict of the form ``{heading: {title_name: title_href}}``, where
    ``heading`` is the text of the page's ``h5`` and each title comes from an
    ``h6`` entry (name = text after the last '-', stripped). Returns None when
    nothing was published or when no ``h6`` entries are present.
    '''
    driver.get(url)
    time.sleep(1)

    # Guard clause: nothing published on this date.
    if not verify_daily_update(driver):
        return None

    panel = driver.find_element(By.CLASS_NAME, 'p_content')
    heading = panel.find_element(By.TAG_NAME, 'h5').text

    links_by_name = {}
    for entry in panel.find_elements(By.TAG_NAME, 'h6'):
        link = entry.find_element(By.TAG_NAME, 'a').get_attribute('href')
        label = entry.text.split('-')[-1].strip()
        links_by_name[label] = link

    if not links_by_name:
        return None
    return {f'{heading}': links_by_name}
        
def create_dirs(path):
    '''Create directory ``path`` and any missing parents.

    Does nothing if the directory already exists. Using ``exist_ok=True``
    avoids the check-then-create race of a separate ``os.path.exists`` test.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    '''
    os.makedirs(path, exist_ok=True)

In [None]:
# Start the Chrome WebDriver session that the scraping functions receive as
# `driver`; the Service is built from whatever ChromeDriverManager().install()
# returns (presumably the path of the downloaded chromedriver binary).
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [None]:
# For every configured region, fetch the daily-update listing and save the
# text of each newly published title to
# ./New_Content/<region>/<heading>/<year>/<title>.txt
for Country_Key, daily_update_url in URL_DAILY_UPDATE.items():
    New_Titles = get_daily_update(driver, daily_update_url)
    # get_daily_update returns None (or nothing useful) when no titles were
    # published, so there is nothing to save for this region.
    if not New_Titles:
        continue

    for legislation_name, titles in New_Titles.items():
        output_dir = f'./New_Content/{Country_Key}/{legislation_name}/{current_year}'
        for title_name, title_url in titles.items():
            # Now that we have the title URL, we can extract the title data.
            print(f'{Country_Key} - {legislation_name} - {title_name} - {title_url}')

            Title_Data_Content = extract_content(driver, title_url)

            create_dirs(path=output_dir)
            # A path separator inside the title would be read as a
            # sub-directory and make open() fail; replace it in the file name.
            safe_title_name = title_name.replace('/', '_').replace('\\', '_')
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding; the with-block closes the file on exit.
            with open(f'{output_dir}/{safe_title_name}.txt', 'w', encoding='utf-8') as f:
                f.write(Title_Data_Content)

In [None]:
# Close the browser session once scraping is finished.
driver.quit()