In [None]:
import pprint
from selenium.common.exceptions import WebDriverException

In [None]:
%run Model_setup.ipynb
%run Create_list_of_URLs.ipynb

In [None]:
def crawl_url(url):
    """
    Crawl a URL, extract elements, and pass them to an ML model.

    The page is opened in a Firefox WebDriver, its HTML is parsed with
    BeautifulSoup, and the text of the interactive elements and buttons is
    handed to the dialog / button ML models. The WebDriver is always shut
    down before returning, including when crawling fails part-way.

    Args:
        url (str): The URL of the web page.

    Returns:
        tuple: ``(results, cookies, status)`` where ``status`` is one of

        * ``"found"`` - ``results`` is ``(dialog_result, button_result)`` and
          ``cookies`` is the value of ``driver.get_cookies()``.
        * ``"not found"`` - ``results`` is ``()``; ``cookies`` as above.
        * ``"error"`` - a WebDriver error occurred; ``results`` is ``()``
          and ``cookies`` is ``[]``.
    """
    # Imported locally so the ``except`` clause below never depends on a name
    # that is only (possibly) provided by one of the %run notebooks.
    from selenium.common.exceptions import TimeoutException

    # Tags whose text is considered when looking for a cookie dialog.
    active_tags = ['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form']

    try:
        # Set up Firefox WebDriver
        driver = setup_firefox_driver()
        try:
            # Visit the URL and give the page time to load
            driver.get(url)
            sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            active_elements = soup.find_all(active_tags)

            # Wait until we passed the wayback redirect: keep re-reading the
            # page while the redirect notice is still present in its elements.
            while any("Got an HTTP 302 response at crawl time" in str(s) for s in active_elements):
                sleep(5)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                active_elements = soup.find_all(active_tags)

            # Extra settling time before taking the snapshot that is analysed
            sleep(15)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find active elements and candidate dialog containers
            active_elements = soup.find_all(active_tags)
            # Each container is paired with 0 for search_frames
            # (presumably a starting depth/counter - confirm against search_frames).
            iframes = [(x, 0) for x in soup.find_all(['div', 'iframe'])]

            # Find button elements
            # NOTE(review): 'input[type="submit"]' is a CSS selector, but it is
            # passed here as a tag name, so it most likely matches nothing -
            # confirm whether submit inputs were meant to be included.
            button_elements = soup.find_all(['button', 'input[type="submit"]', 'a'])

            # Extract text content from active elements and buttons
            text_content = extract_text_from_elements(active_elements)
            button_text = extract_text_from_elements(button_elements)

            # Search within iframes (return value is not used; presumably the
            # two lists are extended in place - confirm against search_frames)
            search_frames(iframes, text_content, button_text)

            # Get cookies from the page
            list_cookies = driver.get_cookies()
        finally:
            # Always release the browser, even if crawling failed part-way;
            # a failure of quit() itself is still handled by the outer except.
            driver.quit()

        # Put the text that has the word cookie first as it is more likely to be a cookie dialog
        text_content_containing_cookies = [text for text in text_content if "cookie" in text]
        text_content = text_content_containing_cookies + text_content

        # Pass text content to the ML model for dialog detection
        dialog_result = pass_to_ml_model_dialog(text_content)

        if dialog_result != "not found":
            return (dialog_result, pass_to_ml_model_buttons(button_text)), list_cookies, "found"
        return (), list_cookies, "not found"

    except (WebDriverException, TimeoutException):
        return (), [], "error"


In [None]:
def collect_website_data(websites_to_crawl, dictionary_of_urls):
    """
    Crawl the URLs of each website and gather dialog results and cookies.

    For every website the URLs are crawled in the order given. Whenever a
    dialog is found, the previously remembered snapshot is stored if its
    dialog result differs from the new one, and the new snapshot becomes the
    remembered one. Crawling of a website stops once the miss counter
    exceeds 3; the counter is reset by a found dialog and by a crawl error.

    Args:
        websites_to_crawl (iterable): Keys of ``dictionary_of_urls`` to process.
        dictionary_of_urls (dict): Maps each website to an iterable of
            ``(url, date)`` pairs.

    Returns:
        dict: Maps each website either to a dict keyed by URL (values are
        snapshot dicts with the keys "date", "results dialog",
        "results buttons" and "cookies", or the string "error"), or to the
        string "no dialog found" when no dialog was found for that website.
    """
    collected_data = {}

    for web in websites_to_crawl:
        site_results = {}
        collected_data[web] = site_results
        misses = 0            # misses since the last found dialog / error
        pending_entry = {}    # most recent snapshot with a dialog, not yet stored
        pending_url = ""

        for url_to_visit, date in tqdm(dictionary_of_urls[web]):
            # Give up on this website after too many misses.
            if misses > 3:
                break

            result_data, cookies, found = crawl_url(url_to_visit)
            misses = 0 if found == "found" else misses + 1
            print("I did {} cookie dialog and the val is {}".format(found, misses))

            if misses == 0:
                # Keep the previous snapshot only if the dialog changed since then.
                if pending_entry and not (pending_entry["results dialog"] == result_data[0]):
                    site_results[pending_url] = pending_entry
                pending_entry = {
                    "date": date,
                    "results dialog": result_data[0],
                    "results buttons": result_data[1],
                    "cookies": cookies,
                }
                pending_url = url_to_visit

            if found != "error":
                print(f"Successfully cralwed URL: {url_to_visit}")
                continue

            # A crawl error is recorded for the URL and resets the miss counter.
            print(f"Error while crawling URL: {url_to_visit} - {result_data}")
            misses = 0
            site_results[url_to_visit] = "error"

        if pending_url:
            site_results[pending_url] = pending_entry
        else:
            # Replaces the whole per-URL dict, including any "error" entries.
            collected_data[web] = "no dialog found"

    return collected_data

In [None]:
def main():
    """
    Load the URL dictionary, crawl every website in it, and save the results.

    Reads "dictionary_of_urls_1000.json.json", reverses each website's URL
    list in place, runs ``collect_website_data`` over all websites, and
    writes the outcome to "collected_data.json" via ``createJSON``.
    """
    # Read the dictionary_of_urls from the JSON file
    with open("dictionary_of_urls_1000.json.json", "r") as json_file:
        loaded_dictionary = json.load(json_file)

    # Collect the website keys and flip the order of each website's URL list
    websites_to_visit = []
    for web, urls in loaded_dictionary.items():
        websites_to_visit.append(web)
        urls.reverse()

    # Crawl everything and save the collected data to a JSON file
    createJSON(
        "collected_data.json",
        collect_website_data(websites_to_visit, loaded_dictionary),
    )