In [45]:
import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

## 1) Tor-Selenium Setup

In [7]:
torexe = os.popen(r'C:\Users\Daniel\Desktop\Tor Browser\Browser\TorBrowser\Tor\tor.exe')

firefoxBinary = r"C:\Users\Daniel\Desktop\Tor Browser\Browser\firefox.exe"
geckodriverPath = r"C:\Users\Daniel\Desktop\Tor Browser\geckodriver.exe"
proxyIP = "127.0.0.1"
proxyPort = 9150

binary = FirefoxBinary(firefoxBinary)

proxy_settings = {
    "network.proxy.type": 1,
    "network.proxy.socks": proxyIP,
    "network.proxy.socks_port": proxyPort,
    "network.proxy.socks_remote_dns": False,
    "extensions.torlauncher.start_tor": True
}

profile = FirefoxProfile(r'C:\Users\Daniel\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default')
profile.set_preference('extensions.torlauncher.start_tor', True) # bypass tor connection page
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference("network.proxy.socks_remote_dns", True)
profile.update_preferences()

driver = webdriver.Firefox(executable_path=geckodriverPath, 
                           firefox_binary=binary, 
                           firefox_profile=profile)



## 2) Scraping Medusa

Obtain all information about threatened companies posted in Medusa Blog  
- Company Name  
- Company Description  
- Date Published by Medusa  
- Number of clicks  
- Type of data stolen (not sure about how to extract this)

BlackMatter link: http://medusaxko7jxtrojdkxo66j7ck4q5tgktf7uqsqyfry4ebnxlcbkccyd.onion/

Medusa Blog is displayed in a timeline format where content is continually loaded as you scroll down to the end of the webpage. This means we need to scroll all the way down, then extract the information.  

To know when the webpage is fully loaded, either scroll all the way down each iteration until there is no new content loaded after an explicit wait, or search for the absence of HTML element "progress" after scrolling all the way down each iteration.  

Each company information is in the form of the HTML element "Card" and have the class "card"  

In [8]:
# Wait till your tor browser is connected to the tor network before executing this
driver.get('http://medusaxko7jxtrojdkxo66j7ck4q5tgktf7uqsqyfry4ebnxlcbkccyd.onion/')

In [9]:
SCROLL_PAUSE_TIME = 5 # as the Tor browser is slow, might need to increase this accordingly

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

In [10]:
companyNames = []
companyDescription = []
dateVictimised = []
noClicks = []

cards = driver.find_elements_by_xpath("//div[@class='card']")

for card in cards:
    companyNameElement = card.find_element_by_xpath(".//h3[@class='card-title']")
    companyNameText = companyNameElement.text
    companyNames.append(companyNameText)

    companyDescriptionElement = card.find_element_by_xpath(".//p[@class='card-text text-left']")
    companyDescriptionText = companyDescriptionElement.text
    companyDescription.append(companyDescriptionText)

    dateVictimisedElement = card.find_element_by_xpath(".//div[@class='date-updated']/span[@class='text-muted']")
    dateVictimisedText = dateVictimisedElement.text
    dateVictimised.append(dateVictimisedText)

    noClicksElement = card.find_element_by_xpath(".//div[@class='number-view']/span[@class='text-muted']")
    noClicksText = noClicksElement.text
    noClicks.append(noClicksText)

driver.quit()


## 3) Clean data + Extract to CSV/Excel format


Requirements for data:  
- Date of victim company within 2023

In [38]:
df = pd.DataFrame({"Company": companyNames, "Company Description": companyDescription, "Date Victimised": dateVictimised, "Number of Clicks": noClicks})
df['Date Victimised'] = pd.to_datetime(df['Date Victimised'], format='%Y-%m-%d %H:%M:%S')

date_mask = df['Date Victimised'].dt.year < 2023

df = df[~date_mask]

df.reset_index(drop=True)

df

Unnamed: 0,Company,Company Description,Date Victimised,Number of Clicks
0,Steripharma,Steripharma is a Moroccan pharmaceutical labor...,2023-09-11 08:15:25,1583
1,Wave Hill,Wave Hill is a community garden and cultural c...,2023-09-11 08:14:53,1516
2,Jules B,Jules B is a chain of designer clothing stores...,2023-09-04 07:02:59,2842
3,Betton France,Administration of the city of Betton (France)....,2023-09-04 07:02:24,3019
4,Skynet,Skynet (https://www.skynetwisp.com/) provides ...,2023-08-30 08:24:43,3923
...,...,...,...,...
86,European Window,European Window Company is an Australian leade...,2023-02-02 18:24:06,9527
87,EightPixelsSquare,"Founded in December 2012 and based in Derby, U...",2023-01-25 18:01:43,10246
88,Aglobis,Resistance shall not prevent us from fulfillin...,2023-01-20 17:59:08,9610
89,Integerity Tax,"Small business accounting, tax preparation, bo...",2023-01-11 13:11:48,9158


In [39]:
df.to_csv("Medusa_Name.csv")

## 3) Scraping Glassdoor for more company information

More information we need:  
- Industry  
- Geography  
  - Country  

In [42]:
# pip install webdriver-manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [55]:
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()

In [56]:
# companyNames
# use crunchbase, linkedin, opencorporates

companyWebsite_list = []
industry_list = []
location_list = []
number = 0

for company in companyNames: 
    try:
        driver.get('https://www.google.com')
        search = driver.find_element(By.NAME, 'q')
        search.send_keys(company + ' glassdoor "overview"')
        search.send_keys(Keys.RETURN)
        glassdoor_page = driver.find_element_by_tag_name('h3') # clicking the first search result
        glassdoor_page.click()
        
        # if "glassdoor" in driver.current_url:
        #     companyWebsite_list.append(company.text)
            
        #     pagesource = driver.page_source
        #     soup = BeautifulSoup(pagesource, 'html.parser')
        #     labels = soup.find_all('label')
        #     for label in labels:
        #         if (label.text == "Industry:"):
        #             industry = label.next_sibling.text
        #             industry_list.append(industry)
        #             break
        #     else:
        #         industry_list.append(np.nan)
                
        #     for label in labels:
        #         if (label.text == "Headquarters:"):
        #             location = label.next_sibling.text
        #             location_list.append(location)
        #             break
        #     else:
        #         location_list.append(np.nan)
        # else:
        #     companyWebsite_list.append(np.nan)
        #     industry_list.append(np.nan)
        #     location_list.append(np.nan)
        
        if "glassdoor" in driver.current_url:
            links = glassdoor_page.find_elements_by_class("employer-overview__employer-overview-module__employerOverviewLink")
            print(links)
        else:
            pass

    except Exception as e:
        companyWebsite_list.append(np.nan)
        industry_list.append(np.nan)
        location_list.append(np.nan)
        print(e)
    
driver.quit()

'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'WebElement' object has no attribute 'find_elements_by_class'
'NoneType' object has no attribute 'click'
Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=116.0.5845.188)

Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=116.0.5845.188)

Message

In [51]:
print(industry_list)

[nan, nan, nan, nan, nan, nan, nan, nan, nan]
