In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# data source: https://surfdrive.surf.nl/files/index.php/s/CZWfWQp3VKGKa8m
df = pd.read_csv('data_with_ports.csv.gz', sep='|', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# parse string to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['start_time'] = pd.to_datetime(df['start_time'], format='%Y-%m-%dT%H:%M:%S+09:00')
df['stop_time'] = pd.to_datetime(df['stop_time'], format='%Y-%m-%dT%H:%M:%S+09:00')

In [4]:
# remove reserved ips

reserved_ips = [
    "0.",       # software
    "10.",      # private network
    "100.",     # private network
    "169.254.", # subnet
    "172.",     # private network
    "192."      # private network
]

for ips in reserved_ips:
    df = df[~(df.org.isna() & df.target_ip.str.startswith(ips))]

In [5]:
from random import randint

df.iloc[randint(0, len(df))]

target_ip                                                   199.180.255.217
date                                                    2014-08-19 00:00:00
sensor_id                                                         sensor006
service                                                                 ntp
start_time                                              2014-08-19 15:00:05
stop_time                                               2014-08-19 15:06:48
duration                                                                403
packets                                                                1124
raw_country                                                   United States
raw_as                                        AS46562 Colo at 55<comma> LLC
raw_hostname                                                    zerosec.net
udp_port_list                                                          [80]
pyasn_as                                                              46562
pyasn_as_bgp

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5719250 entries, 0 to 5721431
Data columns (total 35 columns):
target_ip                 object
date                      datetime64[ns]
sensor_id                 object
service                   object
start_time                datetime64[ns]
stop_time                 datetime64[ns]
duration                  float64
packets                   int64
raw_country               object
raw_as                    object
raw_hostname              object
udp_port_list             object
pyasn_as                  float64
pyasn_as_bgp_size         float64
cc                        object
region                    object
is_oecd                   float64
as_type                   object
as_type_confidence        float64
tg_op                     object
org                       object
org_range                 object
org_rangesize             float64
org_ipsize_seen           float64
org_domainsize_seen       float64
org_ipsize_seen_shared    float

In [7]:
df.as_type.unique()

array(['isp-broadband', nan, 'isp-other', 'isp-mobile',
       'other-intermediary', 'hosting', 'non-intermediary', 'gov', 'edu'],
      dtype=object)

In [8]:
df.org_tag.unique()

array([nan, 'other', 'isp-mobile', 'isp-broadband', 'isp-other',
       'hosting', 'cdn', 'edu', 'gov'], dtype=object)

### Major hosting providers

In [9]:
# a list of major hosting providers
hosts = ["godaddy", "amazon", "google", "1&1", "squarespace", "hostgator", "ovh", "hetzner", "softlayer", "liquid web"]

In [10]:
majorhps = (df[df.org.str.contains("|".join(hosts), case=False, na=False)]
              [["org", "org_tag", "as_type", "as_ipsize_seen", "org_ipsize_seen"]]
              .drop_duplicates()
              .sort_values(by=["org"]))

In [11]:
majorhps.to_csv("major-hosting-providers.csv")

### CDNs

In [12]:
cdn = df[df.org_tag == "cdn"][["org", "org_tag", "as_type", "as_ipsize_seen", "org_ipsize_seen"]].drop_duplicates()

In [13]:
# cloudflare
(df[df.org.str.contains("cloudflare", case=False, na=False)]
   [["org", "org_tag", "as_type", "as_ipsize_seen", "org_ipsize_seen", "cc"]]
   .drop_duplicates())

Unnamed: 0,org,org_tag,as_type,as_ipsize_seen,org_ipsize_seen,cc
8499,CloudFlare,cdn,,122283.0,88540.0,JP
8501,CloudFlare,cdn,,122283.0,88540.0,HK
13452,CloudFlare,cdn,,122283.0,88540.0,US
31561,CloudFlare,cdn,,122283.0,88540.0,CA
31579,CloudFlare,cdn,,122283.0,88540.0,FR
31597,CloudFlare,cdn,,122283.0,88540.0,NO
127572,CloudFlare,cdn,,122283.0,88540.0,
128575,CloudFlare,cdn,,122283.0,88540.0,AT
128578,CloudFlare,cdn,,122283.0,88540.0,GB
341994,CloudFlare,cdn,,122283.0,88540.0,CR


In [14]:
(df[df.org.str.contains("akamai", case=False, na=False)]
   [["org", "org_tag", "as_type", "as_ipsize_seen", "org_ipsize_seen", "cc"]]
   .drop_duplicates())

Unnamed: 0,org,org_tag,as_type,as_ipsize_seen,org_ipsize_seen,cc
12347,Akamai Technologies,cdn,,118405.0,1567292.0,US
18829,Akamai Technologies,cdn,,1531580.0,1567292.0,US
18837,Akamai Technologies,cdn,isp-broadband,1568045.0,1567292.0,US
83971,Akamai Technologies,cdn,,119372.0,1567292.0,SG
118972,Akamai,,isp-other,683571.0,12076.0,US
144896,Akamai,,,1531580.0,12076.0,US
160561,Akamai,,,118405.0,12076.0,US
168011,Akamai Technologies,cdn,hosting,937404.0,1567292.0,US
176762,Akamai Technologies,cdn,,304812.0,1567292.0,US
245556,Akamai Technologies,cdn,isp-broadband,13773.0,1567292.0,US
