# Cybersecurity breaches in large companies
### Using data from https://en.wikipedia.org/wiki/List_of_data_breaches#cite_note-250
#### Updated in real time

In [235]:
# IMPORT LIBRARIES USED


# Webscrapping cyber security data
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import plotly.graph_objects as go
# Cleaning
import numpy as np
import statistics
# ML models
from sklearn.linear_model import LinearRegression

In [270]:
# WEBSCRAPPING CYBER SECURITY DATA

def getTableFromURL(url, type_):
    # PURPOSE:
        # retrieves the first table from a website as a df
    # INPUT: 
        # url (string) - link to website with table
        # type_ (dict) - class attribute from html of table in page
            # exp: {"class": "wikitable sortable"}
    # OUTPUT:
        # df (df) - pd.DataFrame table with headers
        
    # from url, gets table with header and cells into df
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    table = soup.find("table", type_) # (change this) depend on table
    # get headers
    headers = []
    for col in table.find_all('th'):
        # gets each col label (title) from table
        title = col.text
        headers.append(title.strip())
    # create df from headers and data
    df = pd.DataFrame(columns = headers)
    for row in table.find_all('tr')[1:]:
        # getting data from each rows after the header
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        df.loc[len(df)] = row_data # appends new row to df
    return df

In [293]:
df = getTableFromURL("https://en.wikipedia.org/wiki/List_of_data_breaches#cite_note-250", 
                     {"class": "wikitable sortable"})
df.head()

Unnamed: 0,Entity,Year,Records,Organization type,Method,Sources
0,21st Century Oncology,2016,2200000,healthcare,hacked,[5][6]
1,500px,2020,14870304,social networking,hacked,[7]
2,Accendo Insurance Co.,2020,175350,healthcare,poor security,[8][9]
3,Adobe Systems Incorporated,2013,152000000,tech,hacked,[10]
4,Adobe Inc.,2019,7500000,tech,poor security,[11][12]


In [254]:
# CLEANING DATA


def colUnique(table, col):
    # PURPOSE:
        # Prints unique values for each column in a table
    # INPUT: 
        # table (DF) - table to take
        # col (string) - value for the column to use
    # OUTPUT:
        # a list of the distinct each col and its counts
    
    # gets dictionary and counts of each distinct value in col
    distinct_counts = dict((item, list(table[col]).count(item)) for item in set(list(table[col])))
    # sorts list by the value of the counts
    distinct_counts = sorted(distinct_counts.items(), key=lambda x:x[1])
    for i in distinct_counts:
        print(i)

In [304]:
def cleanNumerical(table, col):
    # PURPOSE:
        # Cleans an intended numerical column and shows info about how to replace bad values
        # Also removes commas from values
    # INPUT: 
        # table (DF) - table to take
        # col (string) - value for the column to use
    # OUTPUT:
        # Prints Mean and median of good values
        # Prints bad values not integer

    try:
        table[col] = table.apply(lambda x: x[col].replace(',', ''), axis=1) # removes commas from records col
        print("Converted comman values")
    except:
        ls = list(table[col])
        # check if all the list values are of the same type
        check = all(type(val) == type(ls[0]) for val in ls)
        if (check):
            print("All values are the same type in column")
        else:
            print("Different type values in the column")
    success = []
    failures = []
    for i in range(len(table)):
        try:
            table.loc[i, col] = int(table.loc[i, col])
            success.append(table.loc[i, col])
        except:
            failures.append(table.loc[i, col])

    print("Mean of correct values:", statistics.mean(success))
    print("Median of correct values:", statistics.median(success))
    print("Items not numerical:")
    for i in set(failures):
        print(i)

In [305]:
# Cleaning numerical col 'Year'
cleanNumerical(df, "Year")

All values are the same type in column
Mean of correct values: 2014.1477272727273
Median of correct values: 2014.0
Items not numerical:


In [308]:
df.loc[df.Year == "2018-2019", "Year"] = 2018
df.loc[df.Year == "2019-2020", "Year"] = 2019
df.loc[df.Year == "2014 and 2015", "Year"] = 2014
cleanNumerical(df, "Year")

All values are the same type in column
Mean of correct values: 2014.1477272727273
Median of correct values: 2014.0
Items not numerical:


In [316]:
# Cleaning numerical col 'Year'
cleanNumerical(df, "Records")
df.loc[df.Records == "9000000 (approx) - basic booking 2208 (credit card details)", "Records"] = 9000000
df.loc[df.Records == "over 5000000", "Records"] = 5000000
df.loc[df.Records == "tens of thousands", "Records"] = 5000000
cleanNumerical(df, "Records")


All values are the same type in column
Mean of correct values: 42163702.15340909
Median of correct values: 3900000.0
Items not numerical:
All values are the same type in column
Mean of correct values: 42163702.15340909
Median of correct values: 3900000.0
Items not numerical:
