In [7]:
import time
import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import chromedriver_autoinstaller

def setup_driver(extension_path):
    # Ensure Chromedriver is installed
    chromedriver_autoinstaller.install()
    options = webdriver.ChromeOptions()
    
    # Load Ghostery Extension
    options.add_extension(extension_path)
    
    service = Service()  # Automatically finds the installed chromedriver
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def load_and_scroll_page(driver, url):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    # Accept cookies if presented on the website
    try:
        cookie_btn = driver.find_element(By.XPATH, '//button[text()="Accept Cookies"]')
        if cookie_btn:
            cookie_btn.click()
            time.sleep(2)
    except:
        pass

    # Scroll down the page
    for _ in range(5):
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)  # Sleep to give time for content to load

def extract_ghostery_data(driver):
    # Switch to Ghostery extension popup - this may vary; needs detailed inspection
    try:
        # Open Ghostery extension
        driver.execute_script("chrome.runtime.sendMessage('mlomiejdfkolichcflejclcbmpeaniij', { action: 'open' });")
        time.sleep(2)
        
        # Inspect the popup for data extraction
        driver.switch_to.window(driver.window_handles[-1])
        
        # Extract data from Ghostery popup
        ghostery_data = driver.execute_script("return document.body.innerText")
        return ghostery_data
    except:
        return None
    finally:
        # Switch back to main window
        driver.switch_to.window(driver.window_handles[0])

def main(urls, extension_path):
    driver = setup_driver(extension_path)
    all_ghostery_data = {}
    
    for url in urls:
        try:
            load_and_scroll_page(driver, url)
            ghostery_data = extract_ghostery_data(driver)
            if ghostery_data:
                all_ghostery_data[url] = ghostery_data
        except Exception as e:
            print(f"Failed to process {url}: {str(e)}")
    
    driver.quit()
    return all_ghostery_data

if __name__ == "__main__":
    urls = ["Golem.de", "Heise.de"]
    extension_path = "C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/data/MLOMIEJDFKOLICHCFLEJCLCBMPEANIIJ_8_12_10_0 (1).crx"
    
    data = main(urls, extension_path)
    print(json.dumps(data, indent=4))

Failed to process Golem.de: Message: invalid argument
  (Session info: chrome=124.0.6367.158)
Stacktrace:
	GetHandleVerifier [0x00F0C113+48259]
	(No symbol) [0x00E9CA41]
	(No symbol) [0x00D908A3]
	(No symbol) [0x00D7C89F]
	(No symbol) [0x00D7B4DD]
	(No symbol) [0x00D7B8BB]
	(No symbol) [0x00D9315A]
	(No symbol) [0x00E0A297]
	(No symbol) [0x00DF0D9C]
	(No symbol) [0x00E09B9C]
	(No symbol) [0x00DF0B36]
	(No symbol) [0x00DC570D]
	(No symbol) [0x00DC62CD]
	GetHandleVerifier [0x011C65A3+2908435]
	GetHandleVerifier [0x01203BBB+3159851]
	GetHandleVerifier [0x00FA50CB+674875]
	GetHandleVerifier [0x00FAB28C+699900]
	(No symbol) [0x00EA6244]
	(No symbol) [0x00EA2298]
	(No symbol) [0x00EA242C]
	(No symbol) [0x00E94BB0]
	BaseThreadInitThunk [0x756D7BA9+25]
	RtlInitializeExceptionChain [0x7716BE3B+107]
	RtlClearBits [0x7716BDBF+191]

Failed to process Heise.de: Message: invalid argument
  (Session info: chrome=124.0.6367.158)
Stacktrace:
	GetHandleVerifier [0x00F0C113+48259]
	(No symbol) [0x00E9CA4

In [27]:
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
import pyautogui

def setup_driver(extension_crx_path):
    # Ensure Chromedriver is installed
    chromedriver_autoinstaller.install()
    options = webdriver.ChromeOptions()

    # Add the CRX extension
    options.add_extension(extension_crx_path)
    
    # Enable verbose logging
    options.add_argument("--log-level=3")
    
    # Initialize the Chrome driver with the extension
    service = Service()  # Automatically finds the installed Chromedriver
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def load_and_scroll_page(driver, url):
    driver.get(url)
    
    # Wait for the page to load and display the initial content
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    
    # Accept cookies if presented on the website
    try:
        cookie_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept Cookies"]')))
        if cookie_btn:
            cookie_btn.click()
    except Exception as e:
        print(f"No cookie button found or failed to click: {e}")

    # Scroll down the page
    for _ in range(5):
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)  # Sleep to give time for content to load
        
def handle_ghostery_onboarding(driver):
    # Ghostery automatically opens a new tab on installation.
    # Wait for the new tab and switch to it
    WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[-1])
    
    # Click on 'Get Ready' button
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "/html/body/ui-onboarding/ui-onboarding-layout/ui-onboarding-main-view/ui-onboarding-card/div[2]/div[2]/ui-button[1]/a"))).click()

    # Switch back to the original page
    driver.switch_to.window(driver.window_handles[0])



        
def extract_ghostery_data(driver):
    # Open Ghostery extension popup
    driver.get("chrome-extension://mlomiejdfkolichcflejclcbmpeaniij/popup.html")
    time.sleep(2)  # Wait for the popup to load 
    try:
        detailed_view_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div/div/div/header/div[1]/div/div[2]/span")))
        detailed_view_btn.click()
        time.sleep(2)  # Wait for the detailed view to load

        # Extract all content from the detailed view
        ghostery_data = driver.execute_script("return document.body.innerHTML")
        return ghostery_data
    except Exception as e:
        print(f"Error extracting Ghostery data: {e}")
        return None
    finally:
        driver.switch_to.window(driver.window_handles[0])

def main(urls, extension_crx_path):
    driver = setup_driver(extension_crx_path)
    handle_ghostery_onboarding(driver)
    all_ghostery_data = {}
    
    for url in urls:
        try:
            load_and_scroll_page(driver, url)
            ghostery_data = extract_ghostery_data(driver)
            if ghostery_data:
                all_ghostery_data[url] = ghostery_data
        except Exception as e:
            print(f"Failed to process {url}: {str(e)}")
    
    driver.quit()
    return all_ghostery_data
if __name__ == "__main__":
    urls = ["https://www.nu.nl/"]
    extension_crx_path = "C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/data/MLOMIEJDFKOLICHCFLEJCLCBMPEANIIJ_8_12_10_0 (1).crx"
    
    data = main(urls, extension_crx_path)
    print(json.dumps(data, indent=4))

No cookie button found or failed to click: Message: 
Stacktrace:
	GetHandleVerifier [0x00F0C113+48259]
	(No symbol) [0x00E9CA41]
	(No symbol) [0x00D90A17]
	(No symbol) [0x00DD0BED]
	(No symbol) [0x00DD0C9B]
	(No symbol) [0x00E0BC12]
	(No symbol) [0x00DF0DE4]
	(No symbol) [0x00E09B9C]
	(No symbol) [0x00DF0B36]
	(No symbol) [0x00DC570D]
	(No symbol) [0x00DC62CD]
	GetHandleVerifier [0x011C65A3+2908435]
	GetHandleVerifier [0x01203BBB+3159851]
	GetHandleVerifier [0x00FA50CB+674875]
	GetHandleVerifier [0x00FAB28C+699900]
	(No symbol) [0x00EA6244]
	(No symbol) [0x00EA2298]
	(No symbol) [0x00EA242C]
	(No symbol) [0x00E94BB0]
	BaseThreadInitThunk [0x756D7BA9+25]
	RtlInitializeExceptionChain [0x7716BE3B+107]
	RtlClearBits [0x7716BDBF+191]

Error extracting Ghostery data: Message: 
Stacktrace:
	GetHandleVerifier [0x00F0C113+48259]
	(No symbol) [0x00E9CA41]
	(No symbol) [0x00D90A17]
	(No symbol) [0x00DD0BED]
	(No symbol) [0x00DD0C9B]
	(No symbol) [0x00E0BC12]
	(No symbol) [0x00DF0DE4]
	(No symbol)

In [19]:

import pyautogui
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
import cv2

def setup_driver(extension_crx_path):
    # Ensure Chromedriver is installed
    chromedriver_autoinstaller.install()
    options = webdriver.ChromeOptions()

    # Add the CRX extension
    options.add_extension(extension_crx_path)
    
    # Enable verbose logging
    options.add_argument("--log-level=3")
    
    # Initialize the Chrome driver with the extension
    service = Service()  # Automatically finds the installed Chromedriver
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def pin_extension():
    
    puz_location = pyautogui.locateOnScreen("C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/data/Puzzel.png")
    if puz_location is not None:
        pyautogui.click(puz_location)
    else:
        print("puzzle not found")
    
    pin_location = pyautogui.locateOnScreen("C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/data/Pin.png", confidence=0.6)
    if pin_location is not None:
        pyautogui.click(pin_location)
    else:
        print("Pin not found")
        
    # click on random page location to close the extension
    pyautogui.click(100, 100)
    

def load_and_scroll_page(driver, url):
    driver.get(url)
    
    # Wait for the page to load and display the initial content
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    
    # Accept cookies if presented on the website
    try:
        cookie_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept Cookies"]')))
        if cookie_btn:
            cookie_btn.click()
    except Exception as e:
        print(f"No cookie button found or failed to click: {e}")

    # Scroll down the page
    for _ in range(5):
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)  # Sleep to give time for content to load

def handle_ghostery_onboarding(driver):
    # Ghostery automatically opens a new tab on installation.
    # Wait for the new tab and switch to it
    WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[-1])
    
    # Click on 'Get Ready' button
    get_ready_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "/html/body/ui-onboarding/ui-onboarding-layout/ui-onboarding-main-view/ui-onboarding-card/div[2]/div[2]/ui-button[1]/a")))
    get_ready_btn.click()
    
    # now exit the tab
    driver.close()
    
    # Switch back to the original tab
    driver.switch_to.window(driver.window_handles[0])

def open_ghostery_extension():
    pyautogui.click(x=1087, y=84)
    
def extract_ghostery_data(driver):
    # Interact with the detailed view in the Ghostery popup
    try:
        # Move pyautogui to handle the popup interactions
        # Adjust coordinates as necessary
        detailed_view_position = (100, 200)  # This is an example position
        
        pyautogui.click(detailed_view_position[0], detailed_view_position[1])
        time.sleep(2)  # Wait for the detailed view to load
        
        # Extract all content from the detailed view, this is not on the page source, but in the pop-up window, therefore I want to extract the html code of the popup window:
        ghostery_data = driver.execute_script("return document.body.innerText")
        return ghostery_data
    except Exception as e:
        print(f"Error extracting Ghostery data: {e}")
        return None

def main(urls, extension_crx_path):
    driver = setup_driver(extension_crx_path)
    handle_ghostery_onboarding(driver)  # Handle onboarding for Ghostery extension

    # Pin the extension before starting to interact with it
    pin_extension()

    all_ghostery_data = {}

    for url in urls:
        try:
            load_and_scroll_page(driver, url)
            time.sleep(2)  # Wait for the page to load
            open_ghostery_extension()  # Ensure the extension popup opens
            time.sleep(2)  # Wait for the popup to load
            # make it possible to extract the data from the popup window manually, so that I can extract the html code of the popup window
            # so don't close the popup window
            
            ghostery_data = extract_ghostery_data(driver)
            if ghostery_data:
                all_ghostery_data[url] = ghostery_data
        except Exception as e:
            print(f"Failed to process {url}: {e}")
    
    # driver.quit()
    return all_ghostery_data

# Example usage
if __name__ == "__main__":
    urls = ["https://www.trauer.kreiszeitung.de"]
    extension_crx_path = "C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/data/MLOMIEJDFKOLICHCFLEJCLCBMPEANIIJ_8_12_10_0 (1).crx"  # Update with actual path
    
    data = main(urls, extension_crx_path)
    print(json.dumps(data, indent=4))

ImageNotFoundException: 

In [12]:
while True:
    print(pyautogui.position())
    time.sleep(3)

Point(x=605, y=539)
Point(x=1087, y=84)
Point(x=1087, y=84)
Point(x=1087, y=84)


KeyboardInterrupt: 

In [18]:
from html.parser import HTMLParser

class TrackerParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.trackers = []
        self.current_tracker = {}
        self.in_trk_name = False
        self.in_trk_stats = False

    def handle_starttag(self, tag, attrs):
        if tag == "div":
            attrs_dict = dict(attrs)
            if "class" in attrs_dict:
                class_value = attrs_dict["class"]
                if "trk-header" in class_value:
                    self.current_tracker = {}
                elif "warning slow" in class_value:
                    self.current_tracker["slow"] = True
                elif "trk-name" in class_value:
                    self.in_trk_name = True
                elif "trk-common-stats-container" in class_value:
                    self.in_trk_stats = True

    def handle_endtag(self, tag):
        if tag == "div":
            if self.current_tracker:
                self.trackers.append(self.current_tracker)
                self.current_tracker = {}

    def handle_data(self, data):
        if self.in_trk_name:
            self.current_tracker["name"] = data.strip()
            self.in_trk_name = False
        elif self.in_trk_stats:
            self.current_tracker["stats"] = data.strip()
            self.in_trk_stats = False

# Read the HTML content from the file
with open("paste.txt", "r", encoding="utf-8") as file:
    html_content = file.read()

# Create an instance of the parser and feed the HTML content
parser = TrackerParser()
parser.feed(html_content)

# Print the extracted trackers
for tracker in parser.trackers:
    print("Tracker Name:", tracker.get("name", ""))
    print("Slow Tracker:", tracker.get("slow", False))
    print("Additional Info:", tracker.get("stats", ""))
    print()

FileNotFoundError: [Errno 2] No such file or directory: 'paste.txt'

In [17]:
from bs4 import BeautifulSoup

# Function to extract track names from the HTML content
def extract_track_names(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    track_names = []
    start_collecting = False

    # Find all elements in the order they appear in the HTML
    for element in soup.find_all(['div']):
        if 'cat-name' in element.get('class', []):
            # Stop collecting when a new category starts
            if start_collecting:
                break
            # Start collecting trackers when the first 'cat-name' is found
            start_collecting = True
        elif 'trk-name' in element.get('class', []) and start_collecting:
            track_name = element.get_text(strip=True)
            if track_name:
                track_names.append(track_name)

    return track_names

# Load HTML content from a file
with open('C:/Users/megar/OneDrive/Documenten/Business Analytics Management/Scriptie/Output/Website info/juraforum.de.txt', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Extract track names
track_names = extract_track_names(html_content)

# Print the extracted track names
for name in track_names:
    print(name)
# Print the number of trackers
print(len(track_names))


Adform
Adglue
Adition
AdScale
AppNexus
Bidswitch
Criteo
Dotomi
DoubleClick
Google Adsense
Google AdServices
Google Syndication
Improve Digital
Index Exchange
Just Premium
Lotame
MarketGid
Platform161
PubMatic
Roq.ad
Rubicon
Semasio
Smaato
SMART AdServer
sovrn
Teads
TradeDesk
TripleLift
xplosion
Yieldlab
30


In [34]:
import requests
from bs4 import BeautifulSoup

# The URL of the webpage you want to scrape
url = "https://www.ghostery.com/whotracksme/websites/750g.com"

# Make an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the HTML content of the webpage
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    print(soup)
    # Print the prettified HTML content (optional)
    # print(soup.prettify())
    # Extract advertisement information
    ads = soup.find_all('a', {'data-category': "Advertising"})
    print(ads)
    advertisements = []

    for ad in ads:
        ad_info = {}
        name_div = ad.find('div', {'class': 'ds-body-m ds-color-white ds-text-ellipsis'})
        percentage_div = ad.find('div', {'class': 'ds-color-gray-400 ds-uppercase ds-label-xs ds-text-ellipsis'})

        if name_div and percentage_div:
            ad_info['name'] = name_div.text
            percentage_text = percentage_div.text.split('%')[0].strip()
            ad_info['percentage'] = percentage_text + '%'

            details = percentage_text.split('•')
            if len(details) >= 3:
                ad_info['company'] = details[1].strip()
                ad_info['category'] = details[2].strip()
            advertisements.append(ad_info)

    # Print the extracted advertisement information
    for ad in advertisements:
        print(f"Name: {ad['name']}, Percentage: {ad['percentage']}")

    # Alternatively, you can store the information in a structured format like a list of dictionaries.
    print(advertisements)
else:
    print(f"Failed to retrieve webpage. Status code: {response.status_code}")


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<title>750g.com | WhoTracks.Me | Ghostery</title>
<link href="https://www.ghostery.com/assets/favicon-137e9f14dacc1b870b52d2e10dcafac9e3c021449f271018d966e51b08732d0b.ico" rel="icon" type="image/x-icon"/>
<link href="https://www.ghostery.com/assets/webclip-3c3ee3efd06e37df0a0ad69e81513e98cae04a6c92701271f4e183e4da3ca362.svg" rel="apple-touch-icon" sizes="180x180" type="image/jpg"/>
<link href="https://www.ghostery.com/whotracksme/websites/750g.com" rel="canonical"/>
<link href="https://www.ghostery.com/assets/ghostery-mobile-5b87b0a50ae994a6526277563ed73ab9ee96bb34506cf30b11b19bfc0d365f0f.png" rel="image_src"/>
<meta content="Ghostery" property="og:site_name"/>
<meta content="750g.com | WhoTracks.Me" property="og:title"/>
<meta content="website" property="og:type"/>
<meta content="https://www.ghostery.com/whotracksme/websites/750g.com" property="og:url"/