In [1]:
import requests
import spacy
import re
import pandas as pd
import csv
import time
from datetime import datetime, timedelta, date
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
nlp = spacy.load("en_core_web_sm")
#THIS PART GETS THE COMPANY NAME AND DOMAIN!
# Load the ransomware group names (lower-cased) from the Excel file. They are
# used further down to discard organization candidates that are really the
# name of a threat actor.
try:
    keywords_df = pd.read_excel("Ransomware_List.xlsx")
    # Blank spreadsheet cells are loaded as NaN (a float). Drop them and keep a
    # plain list of non-empty strings so that substring checks such as
    # `keyword in org.lower()` can neither raise TypeError nor match on ''.
    # This also makes the success path return the same type (list) as the
    # failure path below.
    ransomware_group = [
        name
        for name in keywords_df['Ransomware Group'].dropna().astype(str).str.strip().str.lower()
        if name
    ]
except Exception as e:
    # Best effort: without the keyword file nothing is filtered out.
    print("An error occurred while loading the keywords file:", str(e))
    ransomware_group = []

# Additional keywords to exclude
additional_keywords = ['databreaches', 'the office of inadequate security']

# Define a custom pattern for organization names with hyphenated, dashed, or numbers.
# It accepts the same strings as the earlier
#   \b(?:[A-Za-z0-9]+[-\s]?)+[A-Za-z0-9]+\b
# (alphanumeric runs joined by single '-'/whitespace separators, at least two
# alphanumeric characters overall), but every character can be consumed in only
# one way. The nested quantifiers of the earlier form made the regex engine
# backtrack exponentially on a long alphanumeric token followed by '_' or a
# non-ASCII letter, which could stall the scraper on a single article.
custom_pattern = re.compile(r'\b[A-Za-z0-9](?:[-\s]?[A-Za-z0-9])+\b')

def extract_company_domain(html_content, company_name):
    """Return the host of the first link on the page that mentions the company.

    Args:
        html_content: HTML source of the page to inspect.
        company_name: Name to look for (whole-word, case-insensitive) inside
            the link URLs. May be None.

    Returns:
        The network location (e.g. 'www.example.com') of the first link whose
        URL contains ``company_name``, does not contain any entry of the
        module-level ``additional_keywords`` and has a non-empty host, or None
        when there is no such link or no company name was given.
    """
    # An empty name would turn the pattern into r'\b\b', which matches almost
    # every URL, so treat it the same as "no name".
    if not company_name:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')
    name_pattern = re.compile(r'\b{}\b'.format(re.escape(company_name)), re.IGNORECASE)

    for anchor in soup.find_all('a', href=True):
        link = anchor['href']

        if not name_pattern.search(link):
            continue
        if any(keyword in link.lower() for keyword in additional_keywords):
            continue

        try:
            domain = urlparse(link).netloc
        except ValueError:
            # urlparse rejects some malformed URLs; such a link is unusable.
            continue

        # Relative links ('/tag/acme/') have no host. Skip them instead of
        # returning '' so that a later absolute link can still be found.
        if domain:
            return domain

    return None

def extract_company_info(html_content):
    """Guess the affected organization's name and web domain from an article.

    The page title and the text of link-free 'entry-content' <div>/<p>
    elements are combined and scanned twice: with spaCy NER (ORG entities in
    which every word starts with an upper-case letter) and with the
    module-level ``custom_pattern``. NER candidates come first. The first
    candidate that mentions neither a known ransomware group
    (``ransomware_group``) nor one of the ``additional_keywords`` is used as
    the company name.

    Args:
        html_content: HTML source of the article page.

    Returns:
        A ``(company_name, company_domain)`` tuple. ``company_name`` is None
        when no candidate survives the filtering; ``company_domain`` is
        whatever ``extract_company_domain`` returns for that name.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Page title (empty string when the document has no <title>)
    title = soup.title.text.strip() if soup.title else ""

    # Text of the 'entry-content' <div>/<p> elements, skipping any element
    # that contains a link.
    div_text = ' '.join(
        element.text.strip()
        for element in soup.find_all(['div', 'p'], class_='entry-content')
        if not element.find('a')
    )

    # Concatenate title and body text
    combined_text = f"{title} {div_text}"

    # Process the combined text with spaCy's NER
    doc = nlp(combined_text)

    # ORG entities whose words all start with an upper-case letter
    organizations_ner = [
        ent.text
        for ent in doc.ents
        if ent.label_ == 'ORG' and all(word[0].isupper() for word in ent.text.split())
    ]

    # Candidates matching the custom pattern, with each word capitalized
    organizations_custom_pattern = [
        ' '.join(word.capitalize() for word in org.split())
        for org in custom_pattern.findall(combined_text)
    ]

    # Combine organizations from NER and custom pattern
    organizations = organizations_ner + organizations_custom_pattern

    def mentions_excluded_keyword(org):
        """True when `org` contains a ransomware-group or additional keyword."""
        org_lower = org.lower()
        for keyword in list(ransomware_group) + list(additional_keywords):
            # Blank spreadsheet cells arrive as NaN (a float); `nan in str`
            # raises TypeError and '' is contained in every string, so both
            # are ignored.
            if isinstance(keyword, str) and keyword and keyword in org_lower:
                return True
        return False

    # Exclude organizations containing specific keywords. The previous
    # expression appended the candidates that DID contain an additional
    # keyword instead of removing them.
    filtered_organizations = [org for org in organizations if not mentions_excluded_keyword(org)]

    company_name = filtered_organizations[0] if filtered_organizations else None

    # Extract company domain
    company_domain = extract_company_domain(html_content, company_name)

    return company_name, company_domain

In [3]:
#THIS PART GETS THE THREAT ACTOR/RANSOMWARE USED
def find_ransomware_group(html_content, ransomware_groups):
    """Return the first known ransomware group mentioned in an article.

    The page title and the text of link-free 'entry-content' <div> elements
    are searched case-insensitively, after removing the
    'crp_related crp-text-only' <div> blocks from the document.

    Args:
        html_content: HTML source of the article page.
        ransomware_groups: Iterable of group names. Entries that are not
            non-blank strings (e.g. NaN from empty spreadsheet cells) are
            skipped.

    Returns:
        The matching entry of ``ransomware_groups`` exactly as it was passed
        in, or the generic label 'Ransomware' when none is mentioned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove the 'crp_related crp-text-only' divs and their descendants so
    # their text is not searched.
    for related_block in soup.find_all('div', class_='crp_related crp-text-only'):
        related_block.decompose()

    title = soup.title.text.strip() if soup.title else ""

    # Only 'entry-content' divs that contain no link contribute text.
    div_text = ' '.join(
        div.text.strip()
        for div in soup.find_all('div', class_='entry-content')
        if not div.find('a')
    )

    # Lower-case once for the case-insensitive comparison
    combined_text_lower = f"{title} {div_text}".lower()

    for group in ransomware_groups:
        # A single NaN/blank entry used to raise AttributeError on .lower()
        # (or match everywhere, for ''), breaking detection for every article.
        if not isinstance(group, str) or not group.strip():
            continue

        # Whole-name match. The lookarounds behave like \b for names that
        # start and end with a letter or digit, and additionally work for
        # names with punctuation at either end, where \b can never match
        # next to whitespace.
        pattern = rf'(?<!\w){re.escape(group.strip().lower())}(?!\w)'
        if re.search(pattern, combined_text_lower):
            return group

    # If no match is found, default to 'ransomware'
    return 'Ransomware'

# Load ransomware groups from Excel file
def load_ransomware_groups(file_path):
    """Read the ransomware group names from an Excel workbook.

    Args:
        file_path: Path of a workbook that has a 'Ransomware Group' column.

    Returns:
        A list of group names as strings, in sheet order, with surrounding
        whitespace removed and blank cells dropped. An empty list is returned
        (and the error printed) when the workbook cannot be read or the
        column is missing.
    """
    try:
        df = pd.read_excel(file_path)
        # Blank cells come back as NaN; leaving them in the list makes later
        # string operations such as .lower() fail on a float.
        names = df['Ransomware Group'].dropna().astype(str).str.strip()
        return [name for name in names if name]
    except Exception as e:
        print("Error loading ransomware groups:", str(e))
        return []

# Workbook that lists the known ransomware groups ('Ransomware Group' column).
excel_file_path = 'Ransomware_List.xlsx'

# Read the group names once; the result is an empty list if loading fails.
ransomware_groups = load_ransomware_groups(file_path=excel_file_path)

In [4]:
#THIS PART GETS THE COMPANY'S INDUSTRY
def load_industry_keywords(file_path):
    """Read the industry keyword table from an Excel workbook.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The workbook's contents as a DataFrame, or an empty DataFrame (after
        printing the error) when the workbook cannot be read.
    """
    try:
        industry_table = pd.read_excel(file_path)
    except Exception as load_error:
        print("Error loading industry keywords:", str(load_error))
        return pd.DataFrame()
    else:
        return industry_table

def extract_company_industry(html_content, industry_keywords):
    """Classify an article's industry by keyword lookup.

    The page title and the text of the 'entry-content' <div> elements (after
    removing the 'crp_related crp-text-only' blocks) are searched,
    case-insensitively and on word boundaries, for each row's subcategory and
    industry name.

    Args:
        html_content: HTML source of the article page.
        industry_keywords: DataFrame with 'Industry' and 'Subcategory'
            columns. Rows are tried in order; an empty DataFrame is allowed.

    Returns:
        The 'Industry' value of the first row whose subcategory or industry
        occurs in the text, or 'Unknown' when no row matches.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    title = soup.title.text.strip() if soup.title else ""

    # Remove the 'crp_related crp-text-only' divs and their descendants so
    # their text is not searched.
    for related_block in soup.find_all('div', class_='crp_related crp-text-only'):
        related_block.decompose()

    div_text = ' '.join(div.text.strip() for div in soup.find_all('div', class_='entry-content'))

    # Lower-case once for the case-insensitive comparison
    combined_text_lower = f"{title} {div_text}".lower()

    for _, row in industry_keywords.iterrows():
        industry = row['Industry']
        # Blank cells are read as NaN (a float). A row without an industry
        # name has nothing to return, so it is skipped instead of crashing on
        # .lower().
        if not isinstance(industry, str) or not industry.strip():
            continue

        # Both terms map to the same result; a blank subcategory is ignored so
        # the row can still match on its industry name.
        search_terms = [
            term.lower()
            for term in (row['Subcategory'], industry)
            if isinstance(term, str) and term.strip()
        ]

        # Use word boundaries in regex for whole word matching
        if any(re.search(rf'\b{re.escape(term)}\b', combined_text_lower) for term in search_terms):
            return industry

    # If no match is found, default to 'Unknown'
    return 'Unknown'

# Workbook holding the industry / subcategory keyword table.
excel_file_path = 'Industry_List.xlsx'

# Read the table once; the result is an empty DataFrame if loading fails.
industry_keywords = load_industry_keywords(file_path=excel_file_path)

In [5]:
#THIS PART GETS THE DATE OF BREACH
def extract_dates(html_content):
    """Collect date mentions from an article body.

    Candidates come from spaCy DATE entities followed by a regular expression
    that picks up month names and/or four-digit numbers, both applied to the
    text of the 'entry-content' <div> elements (after removing the
    'crp_related crp-text-only' blocks).

    Args:
        html_content: HTML source of the article page.

    Returns:
        A list of date strings in order of discovery, stripped of surrounding
        whitespace. Only the first candidate mentioning a given four-digit
        year is kept; candidates without a year are kept when they contain a
        month name and have not been seen before (case-insensitive).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove specified div elements
    for related_block in soup.find_all('div', class_='crp_related crp-text-only'):
        related_block.decompose()

    # Extract the text from the remaining div
    div_text = ' '.join(div.text.strip() for div in soup.find_all('div', class_='entry-content'))

    # Process the text with NER and keep the DATE entities
    doc = nlp(div_text)
    ner_dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']

    # Extract partial dates using regular expressions. Every part of this
    # pattern is optional, so it also yields empty strings and matches with a
    # leading or trailing space (e.g. ' 2023', 'February '); those are
    # normalized below.
    partial_dates = re.findall(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)?\b\s?(?:\d{4})?\b', div_text)

    filtered_dates = []
    seen_years = set()
    seen_dates = set()
    for raw_candidate in ner_dates + partial_dates:
        # Strip first: otherwise 'February ' and 'February' count as different
        # dates and whitespace-padded values end up in the output.
        candidate = raw_candidate.strip()
        if not candidate:
            continue
        candidate_key = candidate.lower()

        year_match = re.search(r'\b\d{4}\b', candidate)
        if year_match:
            year = year_match.group()
            # Keep only the first mention of each year.
            if year not in seen_years and candidate_key not in seen_dates:
                filtered_dates.append(candidate)
                seen_years.add(year)
                seen_dates.add(candidate_key)
        elif re.search(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b', candidate):
            if candidate_key not in seen_dates:
                filtered_dates.append(candidate)
                seen_dates.add(candidate_key)

    # Return the extracted dates without repeating years
    return filtered_dates

In [6]:
#THIS PART GETS THE MAIN LINK AND CREATES THE CSV COMBINING ALL THE FUNCTIONS FROM ABOVE
def extract_article_links(main_url):
    """Fetch a listing page and return the article URLs it links to.

    Args:
        main_url: URL of a listing (overview) page.

    Returns:
        The ``href`` values of the ``a.blogpost-button`` anchors found inside
        ``div.text-center`` elements, in document order. An empty list is
        returned (and a message printed) when the request fails or the server
        does not answer with status 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36'
    }

    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # server.
        r = requests.get(main_url, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Same outcome as a non-200 answer: report it and return no links.
        print("Failed to fetch the main webpage:", str(e))
        return []

    if r.status_code != 200:
        print("Failed to fetch the main webpage. Status code:", r.status_code)
        return []

    soup = BeautifulSoup(r.text, 'html.parser')

    # Anchors without an href cannot be followed; skip them rather than
    # raising KeyError.
    return [
        a['href']
        for a in soup.select('div.text-center a.blogpost-button')
        if a.has_attr('href')
    ]

# Base URL of the website containing overviews of different articles
base_url = 'https://www.databreaches.net'

# Set up headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36'
}

# Define the start and end dates.
# NOTE: start_date is the NEWEST date of the window and end_date the OLDEST;
# an article is kept when end_date <= published date <= start_date.
start_date = date(2023, 2, 21)
end_date = date(2023, 1, 1)
delta = timedelta(days=1)

# Define the CSV file name
csv_file_name = 'data.csv'

# Seconds to wait for a single HTTP response before giving up on it; without
# a timeout requests can block forever on a stalled connection.
request_timeout = 30

# Stop crawling after this many listing pages in a row could not be fetched.
# Previously a failing page only advanced page_index, so the loop never ended
# once the listing pages ran out or the site became unreachable.
max_consecutive_page_failures = 5

# Open the CSV file in write mode
with open(csv_file_name, 'w', newline='', encoding='utf-8') as csvfile:
    # Define the CSV header
    fieldnames = ['Company Domain', 'Company Name', 'Company Industry', 'Date of Breach', 'Threat Actor', 'Article URL']

    # Create a CSV writer object
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='|')

    # Write the header to the CSV file
    csv_writer.writeheader()

    # Loop through the pages
    page_index = 271
    consecutive_page_failures = 0
    while True:
        # Construct the URL for the current page
        main_website_url = f"{base_url}/page/{page_index}/"
        page_failed = False

        try:
            # Fetch the main webpage
            r = requests.get(main_website_url, headers=headers, timeout=request_timeout)

            if r.status_code == 200:
                html_content = r.text
                soup = BeautifulSoup(html_content, 'html.parser')

                # Extract the article links from the current page
                article_links = [a['href'] for a in soup.select('div.text-center a.blogpost-button')]

                if not article_links:
                    # No more articles found, break the loop
                    break

                # Published dates of the articles successfully read on this page
                published_dates_on_page = []

                for article_link in article_links:
                    try:
                        # Fetch and process each article
                        r = requests.get(article_link, headers=headers, timeout=request_timeout)

                        if r.status_code == 200:
                            html_content = r.text

                            # Extracting the published date using the specified time class
                            # (the article is parsed once and queried for both class variants)
                            article_soup = BeautifulSoup(html_content, 'html.parser')
                            published_date_element = article_soup.find('time', class_='entry-date published updated') or article_soup.find('time', class_='entry-date published')

                            if published_date_element:
                                # Convert the extracted string to a date object
                                published_date_str = published_date_element['datetime']
                                published_date = datetime.strptime(published_date_str, "%Y-%m-%dT%H:%M:%S%z").date()
                            else:
                                # Handle the case where the published date is not found
                                print(f"Published date not found in the article {article_link}. Skipping.")
                                continue

                            published_dates_on_page.append(published_date)

                            # Check if the published date is within the specified range
                            if end_date <= published_date <= start_date:
                                # Extract company info and other details (Don't Change or Touch)
                                company_name, company_domain = extract_company_info(html_content)
                                date_of_breach = extract_dates(html_content)
                                found_ransomware = find_ransomware_group(html_content, ransomware_groups)
                                company_industry = extract_company_industry(html_content, industry_keywords)

                                # Output Extracted Info
                                print("Writing to CSV:", company_name, company_domain, company_industry, date_of_breach, found_ransomware, published_date)
                                print("--------------------------------------")

                                # Write the data to the CSV file
                                csv_writer.writerow({
                                    'Article URL': article_link,
                                    'Company Domain': company_domain,
                                    'Company Name': company_name,
                                    'Company Industry': company_industry,
                                    'Date of Breach': date_of_breach,
                                    'Threat Actor': found_ransomware,
                                })

                            else:
                                print(f"Skipping article with URL {article_link} published on {published_date} as the published date is outside the specified range.")

                        else:
                            print(f"Failed to fetch the webpage {article_link}. Status code:", r.status_code)

                    except Exception as e:
                        print(f"An error occurred while processing the article {article_link}: {str(e)}")

                # Pause once per listing page so the site is not hit with
                # back-to-back page requests.
                time.sleep(1)

                # Higher page numbers are expected to hold older articles (the
                # crawl starts on a page newer than start_date and walks
                # forward). Once every dated article on a page is older than
                # end_date, no later page can fall inside the window.
                if published_dates_on_page and max(published_dates_on_page) < end_date:
                    print(f"All articles on {main_website_url} are older than {end_date}. Stopping.")
                    break

            elif r.status_code == 404:
                # The listing has no such page: the end of the archive was reached.
                print(f"Failed to fetch the webpage {main_website_url}. Status code:", r.status_code)
                break

            else:
                print(f"Failed to fetch the webpage {main_website_url}. Status code:", r.status_code)
                page_failed = True

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            page_failed = True

        # Give up when the listing keeps failing instead of looping forever.
        if page_failed:
            consecutive_page_failures += 1
            if consecutive_page_failures >= max_consecutive_page_failures:
                print(f"Stopping after {consecutive_page_failures} consecutive failed pages.")
                break
        else:
            consecutive_page_failures = 0

        # Move to the next page
        page_index += 1

Skipping article with URL https://www.databreaches.net/chinese-security-researchers-claim-to-have-identified-against-the-west-hackers/ published on 2023-02-23 as the published date is outside the specified range.
Skipping article with URL https://www.databreaches.net/ks-hutchinson-clinic-issues-alert-concerning-december-data-breach/ published on 2023-02-23 as the published date is outside the specified range.
Skipping article with URL https://www.databreaches.net/north-korean-hackers-move-3-2m-from-gate-io-2018-hack/ published on 2023-02-23 as the published date is outside the specified range.
Skipping article with URL https://www.databreaches.net/l-a-unified-admits-that-at-least-2000-student-records-dumped-after-ransomware-attack/ published on 2023-02-23 as the published date is outside the specified range.
Skipping article with URL https://www.databreaches.net/cyberattack-on-food-giant-dole-temporarily-shuts-down-north-america-production-company-memo-says/ published on 2023-02-23 as 

KeyboardInterrupt: 