In [19]:
import datetime
from time import sleep
from time import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tranco import Tranco
from bs4 import BeautifulSoup
from bs4 import Doctype
from bs4.element import ResultSet
import json
import waybackpy

from simpletransformers.classification import ClassificationModel
from simpletransformers.config.model_args import ClassificationArgs
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [2]:
def setup_firefox_driver():
    """
    Create a Selenium Firefox driver that runs without a visible window.

    Returns:
        webdriver.Firefox: A freshly started driver whose options carry the
        ``--headless`` argument.
    """
    headless_options = FirefoxOptions()
    headless_options.add_argument("--headless")
    return webdriver.Firefox(options=headless_options)


In [3]:
def extract_text_from_element(element):
    """
    Return the text of a single element with surrounding whitespace removed.

    Args:
        element (bs4.element.Tag): The HTML element; only its ``get_text()``
            method is used.

    Returns:
        str: The stripped text content of the element.
    """
    raw_text = element.get_text()
    return raw_text.strip()

def extract_text_from_elements(elements):
    """
    Return the stripped text of every element, in input order.

    Args:
        elements (list): The HTML elements to read.

    Returns:
        list: One string per element, as produced by
        ``extract_text_from_element``.
    """
    texts = []
    for element in elements:
        texts.append(extract_text_from_element(element))
    return texts




In [29]:
def pass_to_ml_model_dialog(text_content, model_dir="./models/dialogue_model0", min_length=125):
    """
    Return the first text that the dialog classifier labels as a cookie dialog.

    Only texts longer than ``min_length`` characters are sent to the model.
    The classifier is loaded lazily, on the first text that passes the length
    filter, so pages without any candidate never pay the model-loading cost.

    Args:
        text_content (list): Candidate strings, checked in order.
        model_dir (str): Directory of the saved "xlmroberta" classification
            model.
        min_length (int): Texts with ``len(text) <= min_length`` are skipped.

    Returns:
        str: The first text whose predicted label equals the string 'True',
        or the sentinel "not found" when no text qualifies.
    """
    model_dialog = None
    for ele in text_content:
        if len(ele) <= min_length:
            continue
        if model_dialog is None:
            model_dialog = ClassificationModel("xlmroberta", model_dir, args={"silent": True}, use_cuda=False)
        # predict() returns (predictions, raw_outputs); one input -> one label.
        if model_dialog.predict([ele])[0][0] == 'True':
            return ele
    return "not found"


def pass_to_ml_model_dialog_text(ele, model_dir="./models/dialogue_model0", min_length=125):
    """
    Classify a single text with the dialog model.

    The length check runs before the model is loaded, so short texts return
    immediately instead of paying for a model load that is never used.

    Args:
        ele (str): The text to classify.
        model_dir (str): Directory of the saved "xlmroberta" classification
            model.
        min_length (int): Texts with ``len(ele) <= min_length`` are rejected
            without consulting the model.

    Returns:
        str: ``ele`` itself when the predicted label equals the string 'True',
        otherwise the sentinel "not found".
    """
    if len(ele) <= min_length:
        return "not found"

    model_dialog = ClassificationModel("xlmroberta", model_dir, args={"silent": True}, use_cuda=False)
    # predict() returns (predictions, raw_outputs); one input -> one label.
    if model_dialog.predict([ele])[0][0] == 'True':
        return ele
    return "not found"

# def pass_to_ml_model_buttons(text_content):
    
#     ML_dir_buttons = "./models/buttons_model0"
    
#     buttons_clas ={"ACCEPT":[], "DECLINE": []}
#     checked = []

#     model_buttons = ClassificationModel("xlmroberta", ML_dir_buttons, args={"silent": True}, use_cuda=False)
#     for ele in text_content:
#         if not ele in checked:
#             if len(ele) > 1 and len(ele) < 35:
#                 if isinstance(ele, list) or isinstance(ele, ResultSet):
#                     for e in ele:
#                         predict = model_buttons.predict([str(e)])[0][0]
#                         if predict == 'ACCEPT':
#                             buttons_clas["ACCEPT"].append(e)
#                         elif predict == 'DECLINE':
#                             buttons_clas["DECLINE"].append(e)
#                 else:
#                     predict = model_buttons.predict([str(ele)])[0][0]
#                     if predict == 'ACCEPT':
#                         buttons_clas["ACCEPT"].append(ele)
#                     elif predict == 'DECLINE':
#                         buttons_clas["DECLINE"].append(ele)
#         checked.append(ele)
#     return buttons_clas

def pass_to_ml_model_buttons(text_content, model_dir="./models/buttons_model0"):
    """
    Classify button labels as ACCEPT or DECLINE with the buttons model.

    Entries are de-duplicated (first occurrence wins) and kept only when
    ``1 < len(entry) < 35``. For a string that is its character count; for a
    list/ResultSet entry it is the number of items, and every item of such an
    entry is classified. All surviving candidates are sent to the model in a
    single ``predict`` call.

    When nothing survives the filter the function returns immediately, so the
    model is neither loaded nor asked to predict on an empty batch.

    Args:
        text_content (list): Candidate button texts.
        model_dir (str): Directory of the saved "xlmroberta" classification
            model.

    Returns:
        dict: ``{"ACCEPT": [...], "DECLINE": [...]}`` holding the candidates
        that received the respective label, in candidate order. Candidates
        with any other label are dropped.
    """
    buttons_clas = {"ACCEPT": [], "DECLINE": []}
    checked = []
    to_verify = []

    for ele in text_content:
        if ele in checked:
            continue
        checked.append(ele)
        if 1 < len(ele) < 35:
            if isinstance(ele, (list, ResultSet)):
                to_verify.extend(ele)
            else:
                to_verify.append(ele)

    if not to_verify:
        return buttons_clas

    model_buttons = ClassificationModel("xlmroberta", model_dir, args={"silent": True}, use_cuda=False)
    # predict() returns (predictions, raw_outputs); predictions align with to_verify.
    predictions = model_buttons.predict(to_verify)[0]

    for candidate, label in zip(to_verify, predictions):
        if label == 'ACCEPT':
            buttons_clas["ACCEPT"].append(candidate)
        elif label == 'DECLINE':
            buttons_clas["DECLINE"].append(candidate)

    return buttons_clas


In [5]:
def search_frames(iframes, text_content, button_elements, depth = 0):
    """
    Append the text found inside each container to the two result lists.

    For every item in ``iframes`` the texts of its descendant elements are
    appended in place: general "active" tags go to ``text_content`` and
    button-like tags go to ``button_elements``.

    ``find_all`` already searches all descendants, so one pass per container
    sees every nested element. The earlier per-level recursion revisited the
    children of every nested container and therefore only re-appended texts
    that this pass had already collected (a combinatorial blow-up on deeply
    nested pages), so it has been dropped.

    Args:
        iframes (iterable): Containers to scan, typically the result of
            ``soup.find_all(['div', 'iframe'])``. Items without a
            ``find_all`` method (e.g. text nodes) are skipped.
        text_content (list): Extended in place with stripped element texts.
        button_elements (list): Extended in place with stripped button texts.
        depth (int): Unused; kept so existing calls that pass it keep working.

    Returns:
        None
    """
    text_tags = ['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form']
    button_tags = ['button', 'input', 'span', 'a']

    for iframe in iframes:
        find_all = getattr(iframe, "find_all", None)
        if find_all is None:
            continue
        try:
            text_content.extend(extract_text_from_elements(find_all(text_tags)))
            button_elements.extend(extract_text_from_elements(find_all(button_tags)))
        except (AttributeError, TypeError):
            # Best effort: a malformed node must not abort the whole scan.
            continue

In [1]:
def main(url, wait_seconds=5):
    """
    Load a page, collect its texts and classify the cookie dialog and buttons.

    The page is rendered in headless Firefox, its HTML is parsed with
    BeautifulSoup, and the collected texts are passed to the dialog model.
    Only when a dialog text is found are the button texts classified as well.

    Args:
        url (str): The URL of the web page.
        wait_seconds (float): Seconds to wait after navigation before the
            page source is read, so that late-loading banners can render.

    Returns:
        tuple | str: ``(dialog_text, {"ACCEPT": [...], "DECLINE": [...]})``
        when a dialog is detected, otherwise the sentinel "not found".
    """
    driver = setup_firefox_driver()
    try:
        driver.get(url)
        sleep(wait_seconds)
        html = driver.page_source
    finally:
        # Release the browser even when navigation fails; parsing below works
        # on the HTML string and no longer needs the driver.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    iframes = soup.find_all(['div', 'iframe'])
    active_elements = soup.find_all(['a', 'button', 'input', 'yt-formatted-string', 'p', 'span', 'form'])

    # find_all() compares its arguments against tag names, so the CSS-style
    # 'input[type="submit"]' never matched anything there; select() understands
    # the attribute selector and returns matches in document order.
    # NOTE(review): <input> elements have no inner text, so they contribute
    # empty strings here; their label is presumably in the `value` attribute.
    button_elements = soup.select('button, input[type="submit"], a')

    # Extract text content from the active elements and the button candidates.
    text_content = extract_text_from_elements(active_elements)
    button_text = extract_text_from_elements(button_elements)

    # Add the texts found inside every <div>/<iframe> container.
    search_frames(iframes, text_content, button_text)

    dialog_result = pass_to_ml_model_dialog(text_content)
    if dialog_result != "not found":
        return (dialog_result, pass_to_ml_model_buttons(button_text))
    return "not found"


In [26]:
# Test with a sample URL: run the full pipeline once and print the elapsed
# wall-clock time in seconds.
url = 'https://youtube.com'
start = time()
butlist = main(url)
print(time()-start)




45.47003793716431


In [27]:
# Display the value returned by main(url) in the timing cell.
butlist


('Non-personalized content and ads are influenced by things like the content you’re currently viewing and your location (ad serving is based on general location). Personalized content and ads can also include things like video recommendations, a customized YouTube homepage, and tailored ads based on past activity, like the videos you watch and the things you search for on YouTube. We also use cookies and data to tailor the experience to be age-appropriate, if relevant.',
 {'ACCEPT': ['respect 😱🤯🔥', 'Cancel', 'Confirm', 'Live', 'en', 'Accept all'],
  'DECLINE': ['How Ridiculous',
   'Indiana Jones - Destined To Flop',
   'Do ONLY Dutch People Do This?',
   'Reject all']})

In [30]:
# Pages to crawl: domain -> list of (web.archive.org snapshot URL, (year, month)).
# NOTE(review): the (year, month) pair presumably mirrors the snapshot timestamp
# embedded in the URL - confirm against the code that builds this mapping.
to_visit = {'amazon.fr': [
                       ('https://web.archive.org/web/20201029205242/https://www.amazon.fr/', (2020, 10)), 
                       ('https://web.archive.org/web/20201129165445/https://www.amazon.fr/', (2020, 11)), 
                       ('https://web.archive.org/web/20201229171608/https://www.amazon.fr/', (2020, 12))]}

In [31]:
# Domains to crawl, in the insertion order of `to_visit`.
webs = list(to_visit)

In [32]:
# For each domain, try its snapshots in order and stop at the first one for
# which a cookie dialog is detected.
for web in webs:
    for web_visit in to_visit[web]:
        results_web = main(web_visit[0])
        if results_web == "not found":
            continue
        print(web_visit)
        print(results_web)
        break



('https://web.archive.org/web/20201029205242/https://www.amazon.fr/', (2020, 10))
("Choisir vos préférences en matière de cookiesNous utilisons des cookies et des outils similaires pour faciliter vos achats, fournir nos services, pour comprendre comment les clients utilisent nos services afin de pouvoir apporter des améliorations, et pour présenter des annonces. Des tiers approuvés ont également recours à ces outils dans le cadre de notre affichage d’annonces.Désolé, un problème s'est produit lors de l'enregistrement de vos préférences en matière de cookies. Veuillez réessayer.Accepter les cookiesPersonnaliser les cookies", {'ACCEPT': ['Accepter les cookies', 'Compte', 'Tout voir', 'La nouvelle saison est là', 'Inde'], 'DECLINE': ['Voir tout', 'Voir plus', 'Denim', 'Voir tous les produits']})


In [None]:
from ipynb.fs.full.Crawling import *

### The code below is not used at the moment; it is mainly try-out code:


In [120]:
# Try-out: look at the <iframe> elements nested in the last <div> of the BBC
# front page and print the type of the first one.
driver = setup_firefox_driver()
try:
    driver.get("https://bbc.com")

    sleep(3)

    html = driver.page_source

    soup = BeautifulSoup(html, 'html.parser')

    iframes = soup.find_all(['div'])

    # Guard against pages without any <div>, or without an <iframe> in the
    # last <div>, instead of failing with an IndexError.
    iframe = iframes[-1].find_all('iframe') if iframes else []

    if iframe:
        print(type(iframe[0]))
    else:
        print("no iframe found inside the last <div>")

    # Earlier experiments, kept for reference (they need the live driver):

    #driver.switch_to.frame(iframe[0])

    #driver.switch_to.frame("sp_message_iframe_783538")

    #html = driver.page_source

    #soup = BeautifulSoup(html, 'html.parser')

    #print(iframe_soup.find('#document'))
    #print(iframes)

    #print(iframes[-1].find_all(['iframe'][0].find('#document')))
finally:
    # Always release the browser, even when one of the steps above raises.
    driver.quit()





<class 'bs4.element.Tag'>


In [15]:
def search_dialog(iframes, text_content):
    """
    Scan containers one by one and return the first text the dialog model accepts.

    For each container, the texts carried over from the previous step plus the
    container's own element texts are classified one at a time with
    ``pass_to_ml_model_dialog_text``. The carried-over texts are then replaced
    by this container's texts plus the texts of its nested <div>/<iframe>
    elements. The caller's ``text_content`` list is never mutated.

    Errors raised while handling one container are ignored so the scan can
    continue, but KeyboardInterrupt/SystemExit now propagate so a long run
    can be aborted.

    Args:
        iframes (iterable): Containers (bs4 tags) to scan in order.
        text_content (list): Texts to check together with the first container.

    Returns:
        str | None: The first text classified as a dialog, or None when no
        text qualifies. NOTE(review): other helpers in this notebook return
        the sentinel "not found" instead of None.
    """
    active_tags = ['a', 'button', 'input', 'yt-formatted-string', 'p']
    counter = 0
    for iframe in iframes:
        try:
            iframe_elements = iframe.find_all(active_tags)
            elements_text = extract_text_from_elements(iframe_elements)
            for element in text_content + elements_text:
                if element:
                    # Progress output: number of texts sent to the model so far.
                    print(counter)
                    counter += 1
                    check = pass_to_ml_model_dialog_text(element)
                    if check != "not found":
                        return check
            # Start the next round from this container's own texts...
            text_content = list(elements_text)
            # ...plus those of every nested container.
            for nested_iframe in iframe.find_all(['div', 'iframe']):
                #search_dialog(nested_iframe, text_content)
                nested_elements = nested_iframe.find_all(active_tags)
                text_content.extend(extract_text_from_elements(nested_elements))
        except Exception:
            # Best effort: skip containers that cannot be processed.
            pass
    return None

In [None]:
def search_dialog(soup):
    """
    Return the first element text in ``soup`` that the dialog model accepts.

    ``find_all`` searches all descendants, so a single query already covers
    elements inside nested <div>/<iframe> tags and no recursion is needed.
    The previous draft recursed through ``search_frames`` with the wrong
    number of arguments; the resulting TypeError was swallowed and the
    function returned None instead of the "not found" sentinel.

    The texts are classified with ``pass_to_ml_model_dialog``, which checks
    them in order and loads the model once rather than once per text.

    Args:
        soup: A BeautifulSoup document or tag.

    Returns:
        str: The first text classified as a dialog, or "not found" when there
        is none or when ``soup`` does not support ``find_all``.
    """
    try:
        elements = soup.find_all(['a', 'button', 'input', 'yt-formatted-string', 'p'])
    except AttributeError:
        # Not a tag (e.g. a text node): nothing to search.
        return "not found"
    return pass_to_ml_model_dialog(extract_text_from_elements(elements))

In [None]:
def _collect_iframe_text(iframes, driver):
    """Collect element texts from each frame and, recursively, its nested frames."""
    collected = []
    for iframe in iframes:
        # switch_to.frame() needs a name or id; tags with neither are skipped.
        frame_ref = iframe.get('name') or iframe.get('id')
        if not frame_ref:
            continue
        driver.switch_to.frame(frame_ref)
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            iframe_elements = soup.find_all(['a', 'button', 'input', 'iframe'])
            collected.extend(extract_text_from_elements(iframe_elements))
            collected.extend(_collect_iframe_text(soup.find_all('iframe'), driver))
        finally:
            # Step back exactly one level so sibling frames resolve correctly.
            driver.switch_to.parent_frame()
    return collected


def extract_text_from_iframes(iframes, driver):
    """
    Extracts the text content from a list of iframes and their child iframes.

    Each iframe is entered through the Selenium driver (by its ``name``, or
    its ``id`` when it has no name), its page source is parsed, and the texts
    of its <a>/<button>/<input>/<iframe> elements are collected. Nested
    iframes are processed the same way and their texts are included in the
    result. Elements without a name or id are skipped.

    Args:
        iframes (list): The iframe elements (bs4 tags) found in the current
            browsing context.
        driver: The Selenium webdriver that has the page loaded. It is quit
            before this function returns, also when an error occurs.

    Returns:
        list: The text content from iframes and their child iframes.
    """
    try:
        return _collect_iframe_text(iframes, driver)
    finally:
        driver.quit()



In [None]:
def search_frames(iframes, text_content, button_text):
    """
    Append the text of the active elements inside each container.

    ``find_all`` searches all descendants, so one pass per container already
    reaches every nested element. The earlier recursion over the children of
    each nested container only re-appended the same texts and has been
    removed. The bare ``except`` that also swallowed KeyboardInterrupt is
    narrowed to the errors a malformed node can raise.

    Args:
        iframes (iterable): Containers (bs4 tags) to scan. Items without a
            ``find_all`` method are skipped.
        text_content (list): Extended in place with stripped element texts.
        button_text (list): Not used by this draft; kept for call
            compatibility.

    Returns:
        None
    """
    active_tags = ['a', 'button', 'input', 'yt-formatted-string', 'p']
    for iframe in iframes:
        find_all = getattr(iframe, "find_all", None)
        if find_all is None:
            continue
        try:
            text_content.extend(extract_text_from_elements(find_all(active_tags)))
        except (AttributeError, TypeError):
            # Best effort: a malformed node must not abort the whole scan.
            continue

In [None]:
# Test with a sample URL: print whatever main() returns for the BBC front page.
url = 'https://bbc.com'
print(main(url))

In [None]:
# Test with a sample URL (repeat of the previous cell): print the result of
# main() for the BBC front page.
url = 'https://bbc.com'
print(main(url))

In [None]:
def search_frames(iframes, text_content):
    """
    Append the text of every <button> found inside each container.

    ``find_all`` searches all descendants, so the buttons of nested <div>
    elements are already part of the first query. The second loop over nested
    <div>s only re-appended the same texts and has been removed. The bare
    ``except`` that also swallowed KeyboardInterrupt is narrowed to the errors
    a malformed node can raise.

    Args:
        iframes (iterable): Containers (bs4 tags) to scan. Items without a
            ``find_all`` method are skipped.
        text_content (list): Extended in place with stripped button texts.

    Returns:
        None
    """
    for iframe in iframes:
        find_all = getattr(iframe, "find_all", None)
        if find_all is None:
            continue
        try:
            text_content.extend(extract_text_from_elements(find_all(['button'])))
        except (AttributeError, TypeError):
            # Best effort: a malformed node must not abort the whole scan.
            continue
def main(url, wait_seconds=10):
    """
    Load a page, collect the texts of its buttons and pass them to an ML model.

    Args:
        url (str): The URL of the web page.
        wait_seconds (float): Seconds to wait after navigation before the
            page source is read.

    Returns:
        Whatever ``pass_to_ml_model`` returns for the collected button texts.
    """
    driver = setup_firefox_driver()
    try:
        driver.get(url)
        sleep(wait_seconds)
        html = driver.page_source
    finally:
        # Release the browser even when navigation fails; parsing below works
        # on the HTML string and no longer needs the driver.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    iframes = soup.find_all(['div', 'iframe'])
    active_elements = soup.find_all(['button'])

    # Extract text content from the buttons and from every container.
    text_content = extract_text_from_elements(active_elements)
    #text_content.extend(extract_text_from_iframes(iframes, driver))

    search_frames(iframes, text_content)

    #print(active_elements)
    # Pass the text content to the ML model
    print(len(text_content))
    # NOTE(review): pass_to_ml_model is not defined in the cells shown here;
    # presumably it comes from the star import or an earlier draft - confirm.
    return pass_to_ml_model(text_content)