In [19]:
import datetime
from time import sleep
from time import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from tranco import Tranco
from bs4 import BeautifulSoup
from bs4 import Doctype
from bs4.element import ResultSet
import json
import waybackpy

from simpletransformers.classification import ClassificationModel
from simpletransformers.config.model_args import ClassificationArgs
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

import os
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this silences the tokenizers parallelism/fork warning
# emitted when the transformer models are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [2]:
def setup_firefox_driver():
    """
    Builds a headless Selenium Firefox driver.

    The browser is started with the ``--headless`` argument and with the
    ``privacy.clearOnShutdown.cache`` preference set to ``True``.

    Returns:
        webdriver: The newly created Firefox webdriver instance.
    """
    options = FirefoxOptions()
    # Run without a visible window.
    options.add_argument("--headless")
    options.set_preference("privacy.clearOnShutdown.cache", True)
    return webdriver.Firefox(options=options)


In [3]:
def extract_text_from_element(element):
    """
    Returns the whitespace-trimmed text of a single element.
    Args:
        element (bs4.element.Tag): The HTML element; it must provide ``get_text()``.
    Returns:
        str: The element's text with leading and trailing whitespace removed.
    """
    raw_text = element.get_text()
    return raw_text.strip()

def extract_text_from_elements(elements):
    """
    Returns the trimmed text of every element in a collection, in order.
    Args:
        elements (list): The HTML elements to read.
    Returns:
        list: One string per element, as produced by ``extract_text_from_element``.
    """
    texts = []
    for item in elements:
        texts.append(extract_text_from_element(item))
    return texts

In [1]:
def pass_to_ml_model_dialog(text_content, min_length=125, timeout_seconds=600):
    """
    Returns the first text that the dialog classifier labels 'True'.

    Only texts longer than ``min_length`` characters are classified. The
    classifier is an "xlmroberta" ClassificationModel loaded from
    ``./models/dialogue_model0`` (CPU only, silent mode).

    Args:
        text_content (iterable): Candidate texts; each item must support ``len()``.
        min_length (int): Texts with ``len(text) <= min_length`` are ignored.
        timeout_seconds (int): Budget for the classification loop, measured
            from just after the model has been loaded.
    Returns:
        The first candidate predicted as 'True', or the string "not found"
        when there is no such candidate or the time budget is exceeded.
    """
    # Filter first: loading the model is expensive, so skip it entirely when
    # no text is long enough to be classified.
    candidates = [ele for ele in text_content if len(ele) > min_length]
    if not candidates:
        return "not found"

    ML_dir_cookie_dialog = "./models/dialogue_model0"
    model_dialog = ClassificationModel("xlmroberta", ML_dir_cookie_dialog, args={"silent": True}, use_cuda=False)

    # The clock starts after the model load, so loading does not eat the budget.
    deadline = int(time()) + timeout_seconds
    for ele in candidates:
        if deadline < int(time()):
            return "not found"
        # predict() returns (predictions, raw_outputs); take the single label.
        if model_dialog.predict([ele])[0][0] == 'True':
            return ele
    return "not found"


def pass_to_ml_model_dialog_text(ele, min_length=125):
    """
    Classifies a single text with the dialog classifier.

    The classifier is an "xlmroberta" ClassificationModel loaded from
    ``./models/dialogue_model0`` (CPU only, silent mode).

    Args:
        ele (str): The text to classify.
        min_length (int): Texts with ``len(ele) <= min_length`` are rejected
            without being classified.
    Returns:
        ``ele`` when it is long enough and the model predicts 'True',
        otherwise the string "not found".
    """
    # Check the length before loading the model: short texts are never
    # classified, so loading the model for them would be wasted work.
    if len(ele) <= min_length:
        return "not found"

    ML_dir_cookie_dialog = "./models/dialogue_model0"
    model_dialog = ClassificationModel("xlmroberta", ML_dir_cookie_dialog, args={"silent": True}, use_cuda=False)
    # predict() returns (predictions, raw_outputs); take the single label.
    if model_dialog.predict([ele])[0][0] == 'True':
        return ele
    return "not found"

def pass_to_ml_model_buttons(text_content):
    """
    Classifies candidate button labels as ACCEPT or DECLINE.

    The classifier is an "xlmroberta" ClassificationModel loaded from
    ``./models/buttons_model0`` (CPU only, silent mode). It is only loaded
    when at least one candidate survives filtering.

    Args:
        text_content (iterable): Candidate entries. An entry is considered
            only the first time it is seen and only if ``1 < len(entry) < 35``.
            An entry that is a list (or bs4 ResultSet) contributes each of
            its items as a separate candidate.
    Returns:
        dict: ``{"ACCEPT": [...], "DECLINE": [...]}`` holding the candidates
        predicted with the matching label; other labels are dropped.
    """
    ML_dir_buttons = "./models/buttons_model0"
    buttons_clas = {"ACCEPT": [], "DECLINE": []}
    checked = []
    to_verify = []
    for ele in text_content:
        # A list is used for de-duplication because entries may be unhashable lists.
        if ele in checked:
            continue
        checked.append(ele)
        if 1 < len(ele) < 35:
            if isinstance(ele, (list, ResultSet)):
                to_verify.extend(ele)
            else:
                to_verify.append(ele)

    # Nothing to classify: skip the expensive model load and avoid calling
    # predict() with an empty batch.
    if not to_verify:
        return buttons_clas

    model_buttons = ClassificationModel("xlmroberta", ML_dir_buttons, args={"silent": True}, use_cuda=False)
    # predict() returns (predictions, raw_outputs); only the labels are needed.
    predictions = model_buttons.predict(to_verify)[0]

    for candidate, pred in zip(to_verify, predictions):
        if pred == 'ACCEPT':
            buttons_clas["ACCEPT"].append(candidate)
        elif pred == 'DECLINE':
            buttons_clas["DECLINE"].append(candidate)

    return buttons_clas

In [5]:
def search_frames(iframes, text_content, button_elements, max_depth=20, timeout_seconds=360):
    """
    Walks a queue of frames/containers and collects their text in place.

    For every ``(node, depth)`` entry taken from the front of ``iframes``:
    the text of its 'a', 'button', 'input', 'yt-formatted-string', 'p',
    'span' and 'form' elements is appended to ``text_content``, and the text
    of its 'button', 'input', 'span' and 'a' elements is appended to
    ``button_elements``. Its 'div' and 'iframe' elements are then queued at
    ``depth + 1`` while the depth and time limits allow.

    Args:
        iframes (list): Work queue of ``(node, depth)`` tuples; it is consumed
            in place and is empty when the function returns.
        text_content (list): Extended in place with the collected text.
        button_elements (list): Extended in place with the collected button text.
        max_depth (int): Nodes at this depth or deeper do not queue children.
        timeout_seconds (int): After this many seconds no new nodes are
            queued; nodes already queued are still processed.
    Returns:
        None. Results are delivered through the mutated list arguments.
    """
    # NOTE(review): find_all presumably returns all descendants, so nested nodes
    # may be visited more than once and their text collected repeatedly — confirm
    # whether that is intended.
    start_time = int(time())
    while iframes:
        try:
            iframe, depth = iframes.pop(0)
            iframe_elements = iframe.find_all(['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form'])
            buttons = iframe.find_all(['button', 'input', 'span', 'a'])
            text_content.extend(extract_text_from_elements(iframe_elements))
            button_elements.extend(extract_text_from_elements(buttons))
            # Only look up nested containers when they can still be queued.
            if depth < max_depth and (start_time + timeout_seconds) > int(time()):
                for nested_iframe in iframe.find_all(['div', 'iframe']):
                    iframes.append((nested_iframe, depth + 1))
        except Exception:
            # Deliberate best-effort: a malformed entry or a node that cannot
            # be parsed is skipped so the rest of the queue is still processed.
            continue
