In [None]:
import pprint
from selenium.common.exceptions import WebDriverException

In [None]:
%run text_and_button_classifier.ipynb
%run Create_list_of_URLs.ipynb

In [None]:
def crawl_url(url, *, max_redirect_waits=12):
    """
    Crawl a web page and classify its text and button content with the ML models.

    The page is loaded in a Firefox WebDriver, its HTML is parsed, the text of
    the "active" elements and of the button-like elements is extracted (including
    a search through frames), and the collected text is passed to the classifiers.

    Args:
        url (str): The URL of the web page.
        max_redirect_waits (int | None): Maximum number of 5-second waits while
            the page still shows the Wayback Machine "HTTP 302" notice. When the
            limit is exceeded the crawl is reported as an error instead of
            looping forever. Pass None to wait indefinitely.

    Returns:
        tuple: A pair ``(result, status)`` where ``status`` is one of
            - "found":     ``result`` is ``(dialog_result, button_result)``
            - "not found": ``result`` is ``()``
            - "error":     ``result`` is ``()`` (WebDriver failure or the
                           redirect notice never went away)
    """
    active_tags = ['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form']
    redirect_marker = "Got an HTTP 302 response at crawl time"

    driver = None
    try:
        # Set up Firefox WebDriver
        driver = setup_firefox_driver()

        # Visit the URL and give the page time to start rendering
        driver.get(url)
        sleep(5)

        # Parse the HTML content
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        active_elements = soup.find_all(active_tags)

        # Handle Wayback Machine redirects, if encountered: keep re-reading the
        # page until the redirect notice disappears, but only a bounded number
        # of times so a stuck page cannot hang the whole crawl.
        waits = 0
        while any(redirect_marker in str(element) for element in active_elements):
            if max_redirect_waits is not None and waits >= max_redirect_waits:
                return (), "error"
            waits += 1
            sleep(5)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            active_elements = soup.find_all(active_tags)

        sleep(15)

        # Refresh HTML elements post-load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        active_elements = soup.find_all(active_tags)

        # Frame candidates as (element, depth) tuples for frame navigation
        iframes = [(frame, 0) for frame in soup.find_all(['div', 'iframe'])]

        # Find button elements. A CSS selector is required here: find_all()
        # matches tag names only, so 'input[type="submit"]' would never match.
        button_elements = soup.select('button, input[type="submit"], a')

        # Extract text content from active and button elements
        text_content = extract_text_from_elements(active_elements)
        button_text = extract_text_from_elements(button_elements)

        # Additional text search within iframes (presumably extends both lists
        # in place, since its return value is not used -- confirm in helper)
        search_frames(iframes, text_content, button_text)

    except WebDriverException:
        # Handle driver-related or page load errors. Selenium's
        # TimeoutException is a subclass of WebDriverException, so it is
        # covered by this clause as well.
        return (), "error"

    finally:
        # Always release the browser, also on errors and early returns
        if driver is not None:
            try:
                driver.quit()
            except WebDriverException:
                # Cleanup is best effort; content already gathered stays valid
                pass

    # Prioritize elements likely containing the word "cookie" for efficient processing
    text_content_containing_cookies = [text for text in text_content if "cookie" in text]
    text_content = text_content_containing_cookies + text_content

    # Classify the gathered content with ML model
    dialog_result = pass_to_ml_model_dialog(text_content)

    if dialog_result != "not found":
        return (dialog_result, pass_to_ml_model_buttons(button_text)), "found"
    return (), "not found"

In [None]:
def collect_website_data(websites_to_crawl, dictionary_of_urls):
    """
    Crawl the listed websites and gather the dialog results per URL.

    For every website its URLs are crawled in the given order. Consecutive
    crawls that yield the same dialog result are collapsed: only the most
    recently crawled URL of such a run is kept. Crawling of a website stops
    once more than three crawls in a row end with "not found" (an "error"
    resets that streak and is recorded under its URL).

    Args:
        websites_to_crawl (list): Website domains to crawl.
        dictionary_of_urls (dict): Maps each website domain to a list of
                                   (url, date) tuples.

    Returns:
        dict: Per website either a dict mapping URLs to a result entry
              ({"date", "results dialog", "results buttons"}) or to "error",
              or the string "no dialog found" when no crawl found a dialog.
    """
    collected_data = {}

    for web in websites_to_crawl:
        site_results = {}
        collected_data[web] = site_results

        pending = {}        # most recent successful result, not yet stored
        pending_url = ""    # URL that `pending` belongs to
        misses = 0          # consecutive crawls without a dialog

        for url_to_visit, date in tqdm(dictionary_of_urls[web]):
            # Give up on this website after too many misses in a row
            if misses > 3:
                break

            outcome, status = crawl_url(url_to_visit)

            if status == "found":
                misses = 0
                # A different dialog than the pending one closes the previous
                # run, so the pending entry is written out before replacing it
                if pending and pending["results dialog"] != outcome[0]:
                    site_results[pending_url] = pending
                pending = {
                    "date"   : date,
                    "results dialog": outcome[0],
                    "results buttons": outcome[1]
                }
                pending_url = url_to_visit
            elif status == "error":
                # Errors are logged and do not count towards the miss streak
                misses = 0
                site_results[url_to_visit] = "error"
            else:
                misses += 1

        # Write out the last pending entry, or mark the website as dialog-free
        if pending_url:
            site_results[pending_url] = pending
        else:
            collected_data[web] = "no dialog found"

    return collected_data

In [None]:
def main():
    """
    Load the URL dictionary, crawl every website in it, and save the results.

    Reads "dictionary_of_urls_1000.json.json", reverses each website's URL
    list in place, runs collect_website_data over all websites, and writes
    the outcome via createJSON to "collected_data.json".
    """
    # Load the mapping of website -> list of URL entries
    with open("dictionary_of_urls_1000.json.json", "r") as source:
        urls_by_site = json.load(source)

    # Crawl order is the reverse of the stored order for every website
    sites = list(urls_by_site)
    for site in sites:
        urls_by_site[site].reverse()

    # Crawl everything and persist the collected results
    results = collect_website_data(sites, urls_by_site)
    createJSON("collected_data.json", results)