In [19]:
import os
import json
from time import sleep, time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup, ResultSet
from simpletransformers.classification import ClassificationModel
# Force TOKENIZERS_PARALLELISM to "false" for this process so the tokenizers
# library does not emit its parallelism warnings.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [2]:
def setup_firefox_driver():
    """
    Builds and starts a headless Selenium Firefox driver.

    Returns:
        The ``webdriver.Firefox`` instance created with the "--headless"
        argument and the "privacy.clearOnShutdown.cache" preference enabled.
    """
    options = FirefoxOptions()
    # Preference and argument are independent settings on the options object.
    options.set_preference("privacy.clearOnShutdown.cache", True)
    options.add_argument("--headless")
    return webdriver.Firefox(options=options)

In [3]:
def extract_text_from_element(element):
    """
    Returns the element's text with leading/trailing whitespace removed.

    Args:
        element: Any object exposing a ``get_text()`` method that returns a string.

    Returns:
        str: The stripped result of ``element.get_text()``.
    """
    raw_text = element.get_text()
    return raw_text.strip()

def extract_text_from_elements(elements):
    """
    Returns the stripped text of every element, in input order.

    Args:
        elements: An iterable of objects accepted by extract_text_from_element.

    Returns:
        list: One stripped string per element.
    """
    return list(map(extract_text_from_element, elements))

In [1]:
def pass_to_ml_model_dialog(text_content):
    """
    Passes text content to a machine learning model to detect dialogue elements.

    Only texts longer than 125 characters are classified. The model is loaded
    lazily: when no text passes the length filter (including empty input) the
    function returns immediately without loading it.

    Args:
        text_content (list): The list of text content to analyze.

    Returns:
        str: The first candidate text for which the model predicts 'True', or
        "not found" if there is none or the 600-second budget is exhausted.
    """
    ML_dir_cookie_dialog = "./models/dialogue_model0"

    # Filter first so the expensive model load is skipped when there is
    # nothing to classify.
    candidates = [ele for ele in text_content if len(ele) > 125]
    if not candidates:
        return "not found"

    model_dialog = ClassificationModel("xlmroberta", ML_dir_cookie_dialog, args={"silent": True}, use_cuda=False)

    # The time budget starts after the model is loaded, so it only covers
    # the prediction loop.
    start_time = int(time())
    for ele in candidates:
        if start_time + 600 < int(time()):
            return "not found"
        # predict() is called with a one-item batch; [0][0] is the label
        # predicted for that item, compared against the string 'True'.
        if model_dialog.predict([ele])[0][0] == 'True':
            return ele
    return "not found"

In [None]:
def pass_to_ml_model_dialog_text(ele):
    """
    Passes a single element to the ML model for dialog detection.

    Texts of 125 characters or fewer are rejected up front, before the model
    is loaded, so short inputs never pay the model-loading cost.

    Args:
        ele (str): The text content to analyze.

    Returns:
        str: ``ele`` if it is longer than 125 characters and the model
        predicts 'True' for it, otherwise "not found".
    """
    # Cheap length check first: the model is only needed for long texts.
    if len(ele) <= 125:
        return "not found"

    ML_dir_cookie_dialog = "./models/dialogue_model0"
    model_dialog = ClassificationModel("xlmroberta", ML_dir_cookie_dialog, args={"silent": True}, use_cuda=False)
    # predict() is called with a one-item batch; [0][0] is the label
    # predicted for that item, compared against the string 'True'.
    if model_dialog.predict([ele])[0][0] == 'True':
        return ele
    return "not found"

In [None]:
def pass_to_ml_model_buttons(text_content):
    """
    Passes text content to a machine learning model to classify button elements.

    Each distinct entry whose length is strictly between 1 and 35 becomes a
    candidate; entries that are lists (or ResultSets) contribute their items
    individually. If no candidate remains, the empty classification is
    returned without loading the model or calling predict on an empty batch.

    Args:
        text_content (list): The list of text content to analyze.

    Returns:
        dict: A dictionary containing classified buttons under "ACCEPT" and "DECLINE".
    """
    ML_dir_buttons = "./models/buttons_model0"

    buttons_classification = {"ACCEPT": [], "DECLINE": []}
    checked = []
    to_verify = []

    for ele in text_content:
        # Each distinct entry is considered once. A list is kept for the
        # membership test because entries may be unhashable (lists).
        if ele in checked:
            continue
        checked.append(ele)

        if not 1 < len(ele) < 35:
            continue
        if isinstance(ele, list) or isinstance(ele, ResultSet):
            # Flatten one level: every item of the container is a candidate.
            to_verify.extend(ele)
        else:
            to_verify.append(ele)

    # Nothing to classify: skip the expensive model load entirely.
    if not to_verify:
        return buttons_classification

    model_buttons = ClassificationModel("xlmroberta", ML_dir_buttons, args={"silent": True}, use_cuda=False)

    # Make predictions on button texts; index 0 of the result holds the
    # predicted labels, aligned with to_verify.
    predicted_labels = model_buttons.predict(to_verify)[0]

    for candidate, label in zip(to_verify, predicted_labels):
        if label == 'ACCEPT':
            buttons_classification["ACCEPT"].append(candidate)
        elif label == 'DECLINE':
            buttons_classification["DECLINE"].append(candidate)

    return buttons_classification

In [5]:
def search_frames(iframes, text_content, button_elements, max_depth=20, time_budget=360):
    """
    Searches through iframe elements to extract text and button elements.

    The ``iframes`` list is used as a FIFO work queue and is consumed in
    place: entries are popped from the front and nested nodes are appended to
    the back, so the list is empty when the function returns.

    Args:
        iframes (list): Queue of ``(node, depth)`` tuples to search through.
            Each node must provide ``find_all``.
        text_content (list): The list to store extracted text content.
        button_elements (list): The list to store extracted button elements.
        max_depth (int): Nested nodes are only queued while the current
            entry's depth is below this value.
        time_budget (int): Seconds after which no further nested nodes are
            queued; entries already in the queue are still processed.
    """
    start_time = int(time())
    while iframes:
        try:
            # Get the first iframe to process
            iframe, depth = iframes.pop(0)

            # Extract links, buttons, and text from the iframe
            iframe_elements = iframe.find_all(['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form'])
            buttons = iframe.find_all(['button', 'input', 'span', 'a'])
            text_content.extend(extract_text_from_elements(iframe_elements))
            button_elements.extend(extract_text_from_elements(buttons))

            # Queue nested nodes only while within the depth and time limits;
            # the lookup itself is skipped once either limit is exceeded.
            # NOTE(review): if these nodes are BeautifulSoup tags, find_all
            # matches all descendants, so deeper nodes are queued (and their
            # text collected) more than once - confirm callers expect that.
            if depth < max_depth and (start_time + time_budget) > int(time()):
                iframes.extend(
                    (nested_iframe, depth + 1)
                    for nested_iframe in iframe.find_all(['div', 'iframe'])
                )
        except Exception:
            # Deliberate best-effort: a malformed queue entry or a node that
            # cannot be searched is skipped so the remaining entries are
            # still processed. The entry was already popped, so the loop
            # always makes progress.
            continue
