In [188]:
import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Helper Functions

In [189]:
#ensure tor is connected before attempting to scrape
def check_connection(driver):
    print("Checking Tor network connection status...")
    driver.get("about:preferences#connection")
    time.sleep(5)  # Allow page to load
    
    try:
        # Check for the "Connected" message
        connected = driver.find_element(By.XPATH, "//*[contains(text(), 'Connected')]")
        if connected:
            print("Tor Browser successfully connected to the network.")
            return True
    except:
        print("Tor Browser is not connected. Trying with bridges...")
        return False

#scrolling down website
def scroll_down(driver):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

## 1) Tor-Selenium Setup

In [190]:
tor_browser = r"/home/admin/.local/share/torbrowser/tbb/x86_64/tor-browser/Browser/start-tor-browser"
gecko_driver_path = r"/usr/bin/geckodriver"

options = Options()
options.binary_location = tor_browser
# Set True for faster scrape
# options.headless = True

profile_path = r"/home/admin/.local/share/torbrowser/tbb/x86_64/tor-browser/Browser/TorBrowser/Data/Browser/profile.default"
options.profile = profile_path
options.set_preference('extensions.torlauncher.prompt_at_startup', False)
options.set_preference('torbrowser.settings.enabled', True)
options.set_preference('torbrowser.settings.firewall.enabled', True)
options.set_preference('torbrowser.settings.proxy.enabled', False)
options.set_preference('torbrowser.settings.quickstart.enabled', True)
options.set_preference('torbrowser.settings.dom.security.https_only_mode', False)
#options.set_preference('torbrowser.settings.dom.security.https_only_mode_ever_enabled', False)

service = Service(gecko_driver_path)

driver = webdriver.Firefox(service=service, options=options)

print("Waiting for Tor Browser to connect to the network...")
connected = check_connection(driver)

if not connected:
    driver.quit()
    options.set_preference('torbrowser.settings.bridges.builtin_type', '')
    options.set_preference('torbrowser.settings.bridges.enabled', True)
    options.set_preference('torbrowser.settings.bridges.source', -1)
    driver = webdriver.Firefox(service=service, options=options)

if connected:
    driver.get("http://check.torproject.org")
    time.sleep(5)
    tor_check = driver.find_element(By.TAG_NAME, "h1")
    print(tor_check.text)


Waiting for Tor Browser to connect to the network...
Checking Tor network connection status...
Tor Browser successfully connected to the network.
Congratulations. This browser is configured to use Tor.


## 2) Start scrape

In [191]:
if connected:
    driver.get('http://ransomxifxwc5eteopdobynonjctkxxvap77yqifu2emfbecgbqdw6qd.onion/')
    #authenticity check, modify
    time.sleep(15)

    #list of companies to scrape
    date_threshold = datetime.strptime("2024-09-01","%Y-%m-%d")

    prev_div_count = 0

    # Store list of company data
    company_data = []
    # Store list of all divs in site
    data_divs = []

    removed_div_count = 0
    scrape_div_count = 0
    
    while True:
        #class of div to scrape
        curr_data_divs = driver.find_elements(By.XPATH, "//div[@class='card-body']")

        if len(curr_data_divs) == prev_div_count:
            print("All divs loaded")
            break
        else:
            prev_div_count = len(curr_data_divs)
            print(f"Loaded {prev_div_count} divs so far...")
            data_divs.extend(curr_data_divs)
            
        scroll_down(driver)
        #waiting for more divs to load
        time.sleep(3)

        for div in data_divs:

            # Check date and company name first
            company_name = div.find_element(By.XPATH, ".//div[contains(@class, 'card-title')]").text
            date_compromised_text = div.find_element(By.XPATH, "../div[@class='card-footer']").text
            date_compromised = datetime.strptime(date_compromised_text, "%Y-%m-%d %H:%M:%S")

            # Get all other information if condition
            if date_compromised < date_threshold:
                # get anchor href
                anchor = div.find_element(By.XPATH, "./ancestor::a").get_attribute('href')  # Locate the <a> wrapping this div
            
                # get leak status
                status = div.find_element(By.XPATH, ".//span[contains(@class, 'post-status-timer') or contains(@class, 'post-status-published')]").text            
                
                # get visits, size of data, last view time
                visit_size_last = div.find_element(By.XPATH, "(.//p)[2]").get_attribute("innerHTML")  # Extract content before <br> tags
                soup = BeautifulSoup(visit_size_last, 'html.parser')
                visit_size_last = soup.get_text(separator="|").split('|')  # Use a separator that allows easy splitting

                visits = visit_size_last[0].strip() if len(visit_size_last) > 0 else None
                size = visit_size_last[1].strip() if len(visit_size_last) > 1 else None
                last = visit_size_last[2].strip() if len(visit_size_last) > 2 else None
            
                company_data.append({
                    'anchor': anchor,
                    'name': company_name,
                    'status': status,
                    'visits': visits,
                    'data_size': size,
                    'last_view': last,
                    'date_compromised': date_compromised,
                    'post_content': None
                })
                print(f"Adding {company_name} with status {status} to the company data list.")
                scrape_div_count += 1
            else:
                print(f"Skipping {company_name} with date compromised: {date_compromised_text} (after threshold date)")
                removed_div_count += 1
    print(f"Divs included: {scrape_div_count}, Divs not included {removed_div_count}")


Loaded 229 divs so far...
Skipping www.southeasternretina.com with date compromised: 2024-09-12 20:34:57 (after threshold date)
Skipping mechdyne.com with date compromised: 2024-09-12 08:12:25 (after threshold date)
Skipping thornton-inc.com with date compromised: 2024-09-12 08:10:02 (after threshold date)
Skipping allamericanpoly.com with date compromised: 2024-09-10 17:10:58 (after threshold date)
Skipping www.vinatiorganics.com with date compromised: 2024-09-10 08:45:34 (after threshold date)
Skipping americagraphics.com with date compromised: 2024-09-06 20:26:02 (after threshold date)
Skipping www.avf-biomedical.com with date compromised: 2024-09-09 15:04:52 (after threshold date)
Skipping www.bsg.com.au with date compromised: 2024-09-09 15:02:21 (after threshold date)
Skipping www.dpe.go.th with date compromised: 2024-09-09 14:36:48 (after threshold date)
Skipping www.iiitd.ac.in with date compromised: 2024-09-09 14:34:46 (after threshold date)
Skipping www.unige.it with date comp

## 3) Get more info

In [192]:
for company in company_data:
    
    company_name = company['name']

    print(f"Clicking into company: {company_name}")
    try:
        driver.get(company['anchor'])
        
        post_content = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='post-content']"))
        ).text
        company['post_content'] = post_content
        print("content added")
        
    except Exception as e:
        print(f"Could not find the div for company: {company_name}")
        print(str(e))
        continue  # Skip to the next company if the div is not found

    # Wait for page load
    #time.sleep(3)

    # try:
    #     post_content = driver.find_element(By.XPATH, "//div[@class='post-content']").text
    #     company['post_content'] = post_content
        
    #     print("content added")
    #     #print(f"Company Description: {post_content}")
    # except:
    #     print("No post content available for this company.")
        
    driver.back()
    time.sleep(3)
    
print("Scraping completed.")

Clicking into company: www.electriforce.com
content added
Clicking into company: ciot.com
content added
Clicking into company: wilmingtoncc.org
content added
Clicking into company: www.nissan-dubai.com
content added
Clicking into company: grant-associates.uk.com
content added
Clicking into company: www.lfewines.com
content added
Clicking into company: www.gruyeria.ch
content added
Clicking into company: www.fenceauthority.com
content added
Clicking into company: www.johnkellys.com
content added
Clicking into company: www.iph-bet.fr
content added
Clicking into company: www.suvacity.org
content added
Clicking into company: www.primariatm.ro
content added
Clicking into company: www.mineduc.gob.gt
content added
Clicking into company: www.swinburne.edu
content added
Clicking into company: www.iiitd.ac.in
content added
Clicking into company: www.ramoncorripio.com
content added
Clicking into company: www.timortelecom.tl
content added
Clicking into company: www.citebd.org
content added
Clickin

## 4) Export

In [193]:
df = pd.DataFrame(company_data)
print(df)
df.to_csv('company_data_final.csv', index=False)

                                                anchor  \
0    http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
1    http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
2    http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
3    http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
4    http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
..                                                 ...   
194  http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
195  http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
196  http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
197  http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   
198  http://ransomxifxwc5eteopdobynonjctkxxvap77yqi...   

                          name          status          visits  \
0         www.electriforce.com  2D 15h 40m 46s    Visits: 5186   
1                     ciot.com     15h 40m 46s    Visits: 6242   
2             wilmingtoncc.org       PUBLISHED    Visits: 3805   
3         www.nissan-dubai.com       PU

## 5) Clean up

In [194]:
driver.quit()