In [2]:
import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

## 1) Tor-Selenium Setup

In [13]:
# --- Tor Browser + Selenium (geckodriver) configuration ----------------------
# Every path below is derived from one base directory so the notebook can be
# moved to another machine by setting the TOR_BROWSER_DIR environment variable.
torBrowserDir = os.environ.get("TOR_BROWSER_DIR", r"C:\Users\Daniel\Desktop\Tor Browser")

torExePath = os.path.join(torBrowserDir, "Browser", "TorBrowser", "Tor", "tor.exe")
firefoxBinary = os.path.join(torBrowserDir, "Browser", "firefox.exe")
geckodriverPath = os.path.join(torBrowserDir, "geckodriver.exe")
profilePath = os.path.join(torBrowserDir, "Browser", "TorBrowser", "Data", "Browser", "profile.default")

# SOCKS proxy the browser is pointed at.
proxyIP = "127.0.0.1"
proxyPort = 9150

# NOTE(review): the command is handed to the shell unquoted although the path
# contains spaces, so this call probably does not start tor.exe at all; the
# 'extensions.torlauncher.start_tor' preference below is presumably what brings
# Tor up. Left unchanged on purpose -- confirm before quoting or removing it.
torexe = os.popen(torExePath)

binary = FirefoxBinary(firefoxBinary)

# Single source of truth for the profile preferences. Previously this dict was
# never applied and disagreed with the preferences that were actually set
# (remote DNS was False here but True on the profile). Remote DNS is kept True,
# matching what the profile was really configured with, so host names are
# resolved through the SOCKS proxy.
proxy_settings = {
    "extensions.torlauncher.start_tor": True,  # bypass tor connection page
    "network.proxy.type": 1,
    "network.proxy.socks": proxyIP,
    "network.proxy.socks_port": proxyPort,
    "network.proxy.socks_remote_dns": True,
}

profile = FirefoxProfile(profilePath)
for preference_name, preference_value in proxy_settings.items():
    profile.set_preference(preference_name, preference_value)
profile.update_preferences()

driver = webdriver.Firefox(executable_path=geckodriverPath,
                           firefox_binary=binary,
                           firefox_profile=profile)



## 2) Scraping Medusa

Obtain all information about threatened companies posted in Medusa Blog  
- Company Name  
- Company Description  
- Date Published by Medusa  
- Number of clicks  
- Type of data stolen (it is unclear how to extract this automatically; it will probably have to be collected manually)

Medusa Blog link: http://medusaxko7jxtrojdkxo66j7ck4q5tgktf7uqsqyfry4ebnxlcbkccyd.onion/

Medusa Blog is displayed in a timeline format where content is continually loaded as you scroll down to the end of the webpage. This means we need to scroll all the way down, then extract the information.  

To know when the webpage is fully loaded, either scroll all the way down each iteration until there is no new content loaded after an explicit wait, or search for the absence of HTML element "progress" after scrolling all the way down each iteration.  

Each company's information is contained in an HTML `div` element with the class "card".  

In [14]:
# Run this cell only once the Tor browser has finished connecting to the Tor
# network; otherwise the .onion address below cannot be reached.
medusa_blog_url = 'http://medusaxko7jxtrojdkxo66j7ck4q5tgktf7uqsqyfry4ebnxlcbkccyd.onion/'
driver.get(medusa_blog_url)

In [15]:
# Seconds to wait after each scroll so lazily loaded cards can arrive.
# Tor is slow, so raise this value if the loop stops before the page is complete.
SCROLL_PAUSE_TIME = 5

height_script = "return document.body.scrollHeight"

# Height of the page before the first scroll.
last_height = driver.execute_script(height_script)

# Keep jumping to the bottom until a scroll no longer makes the page taller.
page_still_growing = True
while page_still_growing:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Give the timeline time to append new content.
    time.sleep(SCROLL_PAUSE_TIME)

    new_height = driver.execute_script(height_script)
    page_still_growing = new_height != last_height
    last_height = new_height

In [16]:
# One list per scraped field; entry i of every list belongs to the same card.
companyNames, companyDescription, dateVictimised, noClicks = [], [], [], []


def _card_text(card_element, relative_xpath):
    """Return the visible text of the first node under the card matching the XPath."""
    return card_element.find_element_by_xpath(relative_xpath).text


cards = driver.find_elements_by_xpath("//div[@class='card']")

for card in cards:
    # No year filtering happens here: every card is collected and the date
    # restriction is applied later on the DataFrame.
    dateVictimised.append(
        _card_text(card, ".//div[@class='date-updated']/span[@class='text-muted']")
    )
    companyNames.append(_card_text(card, ".//h3[@class='card-title']"))
    companyDescription.append(_card_text(card, ".//p[@class='card-text text-left']"))
    noClicks.append(
        _card_text(card, ".//div[@class='number-view']/span[@class='text-muted']")
    )

# The Tor-backed browser is no longer needed once all cards have been read.
driver.quit()


## 3) Clean data + Extract to CSV/Excel format


Requirements for data:  
- Date of victim company within 2023

In [None]:
# Assemble the scraped lists into one table (one row per card).
df = pd.DataFrame({"Company": companyNames, "Company Description": companyDescription, "Date Victimised": dateVictimised, "Number of Clicks": noClicks})

# Parse the publication timestamp text into real datetimes.
df['Date Victimised'] = pd.to_datetime(df['Date Victimised'], format='%Y-%m-%d %H:%M:%S')

# Rows published before 2023 are dropped.
date_mask = df['Date Victimised'].dt.year < 2023

# reset_index returns a new frame, so its result has to be assigned; the
# original call discarded it and left gaps in the index after filtering.
df = df[~date_mask].reset_index(drop=True)

df

In [8]:
# Save the cleaned Medusa table (index column included) next to the notebook.
df.to_csv(path_or_buf="Medusa_Data.csv")

## 4) Scraping LinkedIn for more company information

More information we need:  
- Industry  
- Geography  
  - Country  

Additional information we can scrape (the more we have, the more insights we can infer):
- Company Website

In [5]:
# pip install webdriver-manager
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [35]:
# Start a regular (non-Tor) Chrome session for the public-web lookups.
# The driver binary downloaded by webdriver-manager is handed over through a
# Service object; passing the path as the first positional argument is the
# deprecated form of this call.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.maximize_window()

In [27]:
import requests

def location_to_country(location, timeout=10, user_agent="medusa-victim-research-notebook"):
    """Resolve a free-text location to a country name using OpenStreetMap Nominatim.

    Parameters
    ----------
    location : str
        Free-text place description, e.g. a company headquarters line.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.
    user_agent : str, optional
        Value sent in the ``User-Agent`` header to identify this client.

    Returns
    -------
    str or float
        The ``country`` field of the first search hit, or ``np.nan`` when the
        input is blank / not a string, when the search returns no hit, or when
        the hit carries no country.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-success HTTP status.
    """
    # Nothing to look up: skip the network round trip entirely.
    if not isinstance(location, str) or not location.strip():
        return np.nan

    base_url = "https://nominatim.openstreetmap.org/search"

    params = {
        "addressdetails": 1,
        "q": location,
        "format": "jsonv2",
        "limit": 1
    }

    response = requests.get(
        base_url,
        params=params,
        headers={"User-Agent": user_agent},
        timeout=timeout,  # without a timeout a stalled request blocks the whole loop
    )
    # Fail loudly on an error status instead of trying to decode an error page as JSON.
    response.raise_for_status()
    data = response.json()

    if not data:
        return np.nan

    # A missing or empty country is reported as NaN, the same as "no hit".
    country = data[0].get("address", {}).get("country")
    return country if country else np.nan

In [None]:
# Using LinkedIn
#
# For every company still present in `df`, search Google for
# "<company> linkedin", open the first result and, if it is a LinkedIn page,
# read the website / industry / headquarters entries of its "About us" box.
# Exactly one value per company is appended to each result list, so the lists
# always line up with the rows of `df`.

companyWebsite_list = []
industry_list = []
headquarter_list = []
country_list = []

# XPath of the value cell inside one "About us" entry.
ABOUT_US_VALUE_XPATH = ".//dd[@class='font-sans text-md text-color-text break-words overflow-hidden']"
SIGN_IN_BUTTON_XPATH = "//button[@class='sign-in-modal__outlet-btn cursor-pointer btn-md btn-primary']"
DISMISS_BUTTON_XPATH = "//button[@class='modal__dismiss btn-tertiary h-[40px] w-[40px] p-0 rounded-full indent-0 sign-in-modal__dismiss absolute right-0 cursor-pointer m-[20px]']"


def _about_us_field(browser, test_id):
    """Return the text of one "About us" entry, or NaN if it cannot be read.

    `test_id` is the value of the entry's ``data-test-id`` attribute. Any lookup
    error is printed and turned into NaN so that one missing field does not
    abort the company.
    """
    try:
        entry = browser.find_element(By.CSS_SELECTOR, f'[data-test-id="{test_id}"]')
        return entry.find_element(By.XPATH, ABOUT_US_VALUE_XPATH).text
    except Exception as e:
        print(e)
        return np.nan


def _dismiss_sign_in_modal(browser):
    """Best-effort: open and immediately dismiss LinkedIn's sign-in modal."""
    try:
        browser.find_element(By.XPATH, SIGN_IN_BUTTON_XPATH).click()
        browser.find_element(By.XPATH, DISMISS_BUTTON_XPATH).click()
    except Exception:
        # The modal is not always shown; its absence is not an error.
        # (`Exception`, not a bare except, so Ctrl-C still interrupts the run.)
        pass


wait = WebDriverWait(driver, 5)

# Iterate over the filtered frame, not the raw `companyNames` list: the result
# lists are later attached to `df` as columns and must have the same length.
for company in df["Company"]:
    # Defaults used whenever a step fails or the result is not a LinkedIn page.
    website_text = industry_text = headquarters_text = country = np.nan

    try:
        driver.get('https://www.google.com')
        search = driver.find_element(By.NAME, 'q')
        search.send_keys(company + ' linkedin')
        search.send_keys(Keys.RETURN)

        # Wait for the results page before grabbing the first result heading.
        linkedin_page = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'h3')))
        time.sleep(2) # to limit rate of requests to linkedin
        linkedin_page.click() # clicking the first search result

        if "linkedin" in driver.current_url:
            wait.until(EC.presence_of_element_located((By.ID, "main-content")))

            # TODO: verify that the LinkedIn page really belongs to `company`;
            # the first search hit is sometimes an unrelated organisation.

            _dismiss_sign_in_modal(driver)

            website_text = _about_us_field(driver, "about-us__website")
            industry_text = _about_us_field(driver, "about-us__industry")
            headquarters_text = _about_us_field(driver, "about-us__headquarters")

            # Only geocode when a headquarters string was actually found; a
            # geocoding failure no longer discards the headquarters text.
            if isinstance(headquarters_text, str):
                try:
                    country = location_to_country(headquarters_text)
                except Exception as e:
                    print(e)

    except Exception as e:
        print(e)

    companyWebsite_list.append(website_text)
    industry_list.append(industry_text)
    headquarter_list.append(headquarters_text)
    country_list.append(country)

driver.quit()

In [None]:
# Attach the LinkedIn-derived lists to the table, one new column per list.
linkedin_columns = {
    "Company Website": companyWebsite_list,
    "Industry": industry_list,
    "Location": headquarter_list,
    "Country": country_list,
}
for column_name, column_values in linkedin_columns.items():
    df[column_name] = column_values
df

In [41]:
# Save the enriched table (Medusa data plus LinkedIn columns) as CSV.
df.to_csv(path_or_buf="Medusa_Linkedin_Data.csv")