In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# data source: https://surfdrive.surf.nl/files/index.php/s/CZWfWQp3VKGKa8m
df = pd.read_csv('data_with_ports.csv.gz', sep='|', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


### Metadata (Cheung, 2017)

| Column | Description |
| ------ | ----------- |
| `target_ip` | The IP address that has been targeted by a DDoS attack |
| `date` | The date of attack |
| `sensor_id` | The name of the honeypot that monitored the attack traffic |
| `service` | The protocol that was used to execute the attack |
| `start_time` | The start time of the attack |
| `stop_time` | The stop time of the attack |
| `duration` | Attack duration |
| `pyasn_as` | The autonomous system number identifying which AS is routing traffic for the attacked IP |
| `pyasn_as_bpg_size` | The total number of IP addresses that the AS routes |
| `cc` | Short form of the country where the IP address seems to reside |
| `as_type` | The type of the Autonomous system (could be ISP, Hosting, EDU, etc.) |
| `tg_op` | A string identifier to ASes that are known to be Broadband ISPs |
| `caida_type` | A type identifier for ASes based on different source (CAIDA) |
| `dc` | The number of second level domains that have been observed to map to the attacked IP addresses in DNS traffic |
| `subs` | The number of subscribers for those ASes that are known to be Broadband ISPs |
| `as_ipsize_seen` | The total number of IP addresses of the AS that have been observed to be routed to IPs of the AS in DNS traffic |
| `as_domainsize_seen` | The total number of second level domains that have been observed to be routed to IPs of the AS in DNS traffic |
| `year` | The year of the attack |
| `udp_port_list` | The ports that attack packets have been sent to | 

In [3]:
# parse string to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['start_time'] = pd.to_datetime(df['start_time'], format='%Y-%m-%dT%H:%M:%S+09:00')
df['stop_time'] = pd.to_datetime(df['stop_time'], format='%Y-%m-%dT%H:%M:%S+09:00')

In [4]:
# remove reserved ips

reserved_ips = [
    "0.",       # software
    "10.",      # private network
    "100.",     # private network
    "169.254.", # subnet
    "172.",     # private network
    "192."      # private network
]

for ips in reserved_ips:
    df = df[~(df.org.isna() & df.target_ip.str.startswith(ips))]

In [5]:
def convert_cidr_to_ipstartswith( cidr ):
    '''converts cidr to ipv4 addresses (only for n<24)'''
    firstIP, n = cidr.split("/")
    firstIP = firstIP.rstrip(".0").rsplit(".", 1)
    i1, i2 = firstIP[0], int(firstIP[1])
    ip_startswith = [i1 + "." + str(i2+j) for j in range(24-int(n)+1)]
    return ip_startswith

In [6]:
# source: https://www.cloudflare.com/ips/
netblock_cloudflare = [
    "173.245.48.0/20",
    "103.21.244.0/22",
    "103.22.200.0/22",
    "103.31.4.0/22",
    "141.101.64.0/18",
    "108.162.192.0/18",
    "190.93.240.0/20",
    "188.114.96.0/20",
    "197.234.240.0/22",
    "198.41.128.0/17",
    "162.158.0.0/15",
    "104.16.0.0/12",
    "172.64.0.0/13",
    "131.0.72.0/22"
]

from itertools import chain
ipstartswith_cloudflare = list(chain(*list(map(convert_cidr_to_ipstartswith, netblock_cloudflare))))

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5719250 entries, 0 to 5721431
Data columns (total 35 columns):
target_ip                 object
date                      datetime64[ns]
sensor_id                 object
service                   object
start_time                datetime64[ns]
stop_time                 datetime64[ns]
duration                  float64
packets                   int64
raw_country               object
raw_as                    object
raw_hostname              object
udp_port_list             object
pyasn_as                  float64
pyasn_as_bgp_size         float64
cc                        object
region                    object
is_oecd                   float64
as_type                   object
as_type_confidence        float64
tg_op                     object
org                       object
org_range                 object
org_rangesize             float64
org_ipsize_seen           float64
org_domainsize_seen       float64
org_ipsize_seen_shared    float

In [19]:
df[df.target_ip.str.startswith(tuple(ipstartswith_cloudflare))][["org", "org_tag"]].drop_duplicates()

Unnamed: 0,org,org_tag
8499,CloudFlare,cdn
13502,Xplornet Communications,isp-broadband
13504,IT7 Networks,
13532,Time Warner Cable,isp-broadband
13564,Outofwall,
13566,"QuickPacket, LLC",
13570,"Elauwit, LLC",
13571,Contina,
13572,Frontier Communications,
13719,AT&T U-verse,isp-broadband


In [7]:
df[df.target_ip.str.startswith(tuple(ipstartswith_cloudflare))].org.value_counts()

CloudFlare                                           24138
Time Warner Cable                                     5489
Choopa, LLC                                           3859
Global Frag Networks                                  2806
AT&T U-verse                                          2227
VolumeDrive                                           1588
Digital Ocean                                         1191
Cnservers LLC                                          932
Frontier Communications                                877
Psychz Networks                                        787
ClearDDoS Technologies                                 741
Enzu                                                   694
Versaweb, LLC                                          562
Google Cloud                                           526
Sharktech                                              501
Microsoft Azure                                        473
Secured Servers LLC                                    4

In [18]:
df[df.target_ip.str.startswith(tuple(ipstartswith_cloudflare))].org_tag.value_counts()

cdn              24148
isp-broadband     7945
hosting           5162
isp-other          428
edu                 41
isp-mobile          35
other                7
Name: org_tag, dtype: int64

In [10]:
df.org.value_counts()

Comcast Cable                                       254202
Hangzhou Alibaba Advertising Co.,Ltd.               167254
Time Warner Cable                                   158270
Guangdong                                           149305
OVH SAS                                             143735
AT&T U-verse                                        116836
Aliyun Computing Co., LTD                           100982
China Telecom jiangsu province backbone              92592
Hostspace Networks LLC                               87206
OVH Hosting                                          85942
Verizon FiOS                                         79356
Charter Communications                               69577
Cox Communications                                   67630
Sharktech                                            65325
Orange                                               55370
Choopa, LLC                                          55041
Virgin Media                                         545

In [16]:
df[df.org.str.contains("amazon", case=False, na=False)].org.value_counts()

Amazon.com                                            26433
AMAZON                                                 3612
Amazon Technologies                                    1779
Amazon.com Tech Telecom                                 939
Amazon                                                  440
Amazon Web Services, Elastic Compute Cloud, EC2, E      297
Amazon Data Services Ireland Ltd                         61
Amazon Web Services, Elastic Compute Cloud, EC2, J       16
Amazon AWS Services - Cloudfront - LHR                   10
Amazon AWS Services - Cloudfront - FRA                    7
Amazon AWS Services - Cloudfront - DUB                    3
Amazon Web Services, Elastic Compute Cloud, EC2, S        2
Amazon AWS Services - Cloudfront - AMS                    2
PRODAM Processamento de Dados Amazonas S.A                1
Name: org, dtype: int64