# DDoS Data Preparation

In [2]:
import pandas as pd
import numpy as np

In [2]:
# Data source: https://surfdrive.surf.nl/files/index.php/s/CZWfWQp3VKGKa8m
#
# Tell the CSV reader explicitly that these columns hold generic Python
# objects (strings) instead of letting it infer a dtype for them.
types = dict.fromkeys(['region', 'as_type', 'tg_op', 'org_tag'], object)

# The input is a gzip-compressed, pipe-delimited text file.
df = pd.read_csv(
    'data/data_with_ports.csv.gz',
    sep='|',
    compression='gzip',
    dtype=types,
)

## Data Preparation

In [3]:
# Convert the textual date/time columns into pandas datetimes.
# `date` is a plain calendar day.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# `start_time` and `stop_time` share one timestamp layout. The trailing
# "+09:00" is matched as literal text by the format string, so the parsed
# values carry no timezone information.
for column in ('start_time', 'stop_time'):
    df[column] = pd.to_datetime(df[column], format='%Y-%m-%dT%H:%M:%S+09:00')

In [4]:
# remove reserved ips
#
# Rows are dropped only when BOTH conditions hold: the victim organisation
# (`org`) is missing AND the target address lies in a reserved IPv4 range.
#
# Each entry below is a regular-expression prefix that matches exactly one
# reserved range. Bare first-octet prefixes such as "172." or "192." would
# also match ordinary public addresses (only 172.16-31.x.x, 192.168.x.x and
# 100.64-127.x.x are reserved), so the second octet is constrained here.
reserved_ips = [
    r"0\.",                                        # 0.0.0.0/8       software ("this network")
    r"10\.",                                       # 10.0.0.0/8      private network
    r"100\.(?:6[4-9]|[7-9]\d|1[01]\d|12[0-7])\.",  # 100.64.0.0/10   shared address space
    r"169\.254\.",                                 # 169.254.0.0/16  link-local subnet
    r"172\.(?:1[6-9]|2\d|3[01])\.",                # 172.16.0.0/12   private network
    r"192\.168\.",                                 # 192.168.0.0/16  private network
]

# `str.match` anchors at the start of the string, so every alternative acts as
# an address prefix. `na=False` treats a missing target_ip as "not reserved".
reserved_pattern = "(?:" + "|".join(reserved_ips) + ")"
is_reserved = df.target_ip.str.match(reserved_pattern, na=False)

# One boolean mask instead of re-filtering the whole frame once per prefix.
df = df[~(df.org.isna() & is_reserved)]

In [5]:
# Victims located in the United States or Canada belong to the North America
# region, whose code is the string "NA".
# NOTE(review): presumably the "NA" code in the source file was parsed as a
# missing value on load, which is why it has to be restored here - confirm.
in_north_america = df["raw_country"].str.contains("united states|canada", case=False)
df.loc[in_north_america, "region"] = "NA"  # NA = North America

# Country names carry the placeholder "<comma>" where a literal comma belongs.
df["raw_country"] = df["raw_country"].str.replace("<comma>", ",")

# Fill in missing `cc` / `region` values for specific raw_country entries.
# Each entry is (raw_country, ((column, value), ...)).
missing_value_fixes = [
    ("Tonga", (("cc", "TO"), ("region", "AP"))),
    ("Namibia", (("cc", "NA"), ("region", "AF"))),  # "NA" is Namibia's country code here
    ("San Marino", (("cc", "SM"), ("region", "EU"))),
    ("Europe", (("region", "EU"),)),
    ("Asia/Pacific Region", (("region", "AP"),)),
]
for country, assignments in missing_value_fixes:
    is_country = df["raw_country"] == country
    for column, value in assignments:
        df.loc[is_country, column] = value

# Collapse every variant containing "france" (e.g. "France, Metropolitan")
# into plain "France", then set its country code and region.
mentions_france = df["raw_country"].str.contains("france", case=False)
df.loc[mentions_france, "raw_country"] = "France"

is_france = df["raw_country"] == "France"
df.loc[is_france, "cc"] = "FR"
df.loc[is_france, "region"] = "EU"

In [6]:
# Print the frame summary including per-column non-null counts.
#
# pandas omits the counts for frames larger than its `display.max_info_rows` /
# `display.max_info_columns` options. The thresholds are raised for this one
# call instead of passing the `null_counts` keyword, which is deprecated and
# no longer accepted by newer pandas releases. The options exist in both old
# and new pandas, so this cell does not depend on the installed version.
with pd.option_context(
    "display.max_info_rows", len(df) + 1,
    "display.max_info_columns", len(df.columns) + 1,
):
    df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5719250 entries, 0 to 5721431
Data columns (total 35 columns):
target_ip                 5719250 non-null object
date                      5719250 non-null datetime64[ns]
sensor_id                 5719250 non-null object
service                   5719250 non-null object
start_time                5719250 non-null datetime64[ns]
stop_time                 5719250 non-null datetime64[ns]
duration                  5719250 non-null float64
packets                   5719250 non-null int64
raw_country               5719250 non-null object
raw_as                    5719250 non-null object
raw_hostname              5719248 non-null object
udp_port_list             5719250 non-null object
pyasn_as                  5712272 non-null float64
pyasn_as_bgp_size         5502309 non-null float64
cc                        5695077 non-null object
region                    5703844 non-null object
is_oecd                   5693982 non-null float64
as_type   

In [7]:
# fill the missing values in case those data were dropped
df["as_type"] = df["as_type"].fillna("Unknown")

# Turn the "<comma>" placeholder in country names back into a literal comma.
# The vectorised string accessor passes a missing raw_country through as NaN,
# whereas calling `.replace` on each value would raise on a non-string entry.
# The operation is idempotent, so re-running it on cleaned data is harmless.
df["raw_country"] = df["raw_country"].str.replace("<comma>", ",")

In [None]:
# Build `orgs`: one row per victim organisation with its attack counts.
#
# Grouping keys taken from `df`:
#   org              - victim organisation
#   org_ipsize_seen  - size of the victim; denominator of count_attack_norm
#   caida_type       - carried along as an attribute of the organisation
#   as_type          - carried along as an attribute of the organisation
#   raw_country      - country of the victim
#   region           - region of the victim
#   year             - pivoted into one count_attack_<year> column per year
#
# Derived columns:
#   count_attack       - dependent variable: sum of every count_attack_<year>
#   count_attack_norm  - count_attack divided by org_ipsize_seen

# Give rows without an as_type an explicit label so they are not lost when
# as_type is used as a grouping key below.
df["as_type"] = df["as_type"].fillna("Unknown")

group_keys = ["org", "org_ipsize_seen", "caida_type", "as_type", "raw_country", "region", "year"]

# Count rows per key combination, then move the last key (year) into columns;
# combinations with no rows in a given year become 0.
attacks_per_year = df.groupby(by=group_keys).size().unstack()
orgs = attacks_per_year.add_prefix("count_attack_").fillna(0).reset_index()

# Replace the column index with a plain list of its labels.
orgs.columns = orgs.columns.tolist()

# At this point only the per-year columns start with "count_attack".
yearly_counts = orgs.filter(regex="^count_attack")
orgs["count_attack"] = yearly_counts.sum(axis=1)
orgs["count_attack_norm"] = orgs["count_attack"] / orgs["org_ipsize_seen"]

# Turn the "<comma>" placeholder in country names back into a literal comma.
orgs["raw_country"] = orgs["raw_country"].map(lambda name: name.replace("<comma>", ","))

# remove companies that received just 1 attack, as we assume the first attack was only a trial
received_repeat_attacks = orgs["count_attack"] > 1
orgs = orgs[received_repeat_attacks]

In [None]:
# ITU ICT index per country.
# Source: https://tcdata360.worldbank.org/indicators/h2e1ddd20
itu_index = pd.read_csv("ITU-ICT-index.csv")

# Left join keeps every organisation; countries without a match in the index
# get missing values in the added columns.
# NOTE(review): assumes `Country` is unique in the index file - duplicate
# country rows would duplicate organisations. Confirm against the CSV.
orgs = orgs.merge(itu_index, how="left", left_on="raw_country", right_on="Country")

# The join key from the lookup table duplicates raw_country; discard it.
orgs = orgs.drop(columns=["Country"])

In [None]:
# Population per country (source: World Bank).
pop_country = pd.read_csv("population2016.csv")

# Left join keeps every organisation; countries without a match in the
# population table get missing values in the added columns.
# NOTE(review): assumes `Country` is unique in the population file - confirm.
orgs = orgs.merge(pop_country, how="left", left_on="raw_country", right_on="Country")

# The join key from the lookup table duplicates raw_country; discard it.
orgs = orgs.drop(columns=["Country"])

In [8]:
# Persist the per-organisation table built above. Writing `df` here would
# store the raw attack-level frame under the name "orgs.csv" and discard the
# aggregated, enriched `orgs` result.
# NOTE(review): region / cc values equal to the string "NA" are written
# verbatim; a default `pd.read_csv` will parse them as missing on reload.
orgs.to_csv("orgs.csv")