# RAPIDS for Cybersecurity Applications

#### KDD 2019 Workshop

#### Authors
- Bartley Richardson (NVIDIA)
- Brad Rees (NVIDIA)
- Haekyu Park (Georgia Tech, NVIDIA)

In [1]:
import dask
import dask.dataframe as dd
import cudf
import dask_cudf
import pandas as pd

## Data Import

In [5]:
%%time
# Read the tab-separated connection data into a cuDF DataFrame; the cell is
# timed so it can be compared with the pandas read of the same file below.
enconn_gdf = cudf.read_csv("./df_enconn_pd.tab", sep='\t')

CPU times: user 452 ms, sys: 264 ms, total: 716 ms
Wall time: 833 ms


In [8]:
%%time
# enconn_pdf = pd.read_csv("./df_enconn_pd.tab", sep='\t')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.7 µs


In [4]:
# Plain-text preview of the first rows of the cuDF frame.
print(enconn_gdf.head())

   Unnamed: 0                  ts     id_orig_h  id_orig_p       id_resp_h  id_resp_p  proto ...  resp_port
0           0   1424246667.886334  175.45.176.1      54776  149.171.126.17        111    udp ...      111.0
1           1    1424246931.96847  175.45.176.1      61723  149.171.126.17        111    udp ...      111.0
2           2  1424247419.3913927  175.45.176.1      60443  149.171.126.17        111    udp ...      111.0
3           3   1421929976.371788  175.45.176.1       5724  149.171.126.17        111    udp ...      111.0
4           4  1421930473.0328019  175.45.176.1      37292  149.171.126.17        111    udp ...      111.0
[9 more columns]


## Feature Engineering

**[TODO]** 
- Migrate to cuDF

In [None]:
# Left-join each connection record (unsw_nb15.bro_conn) to the ground-truth table
# on matching source IP, destination IP and protocol, with the connection
# timestamp falling inside the ground-truth [start_time, last_time] window.
# Connections with no matching ground-truth row keep NULL attack_* fields.
# NOTE(review): `hc` is not defined anywhere in the visible notebook (presumably
# a Spark HiveContext, given the commented-out .toPandas() below), so this
# cell fails on a fresh kernel -- confirm before running.
df_enriched_conn = hc.sql("SELECT c.ts,c.id_orig_h,c.id_orig_p,c.id_resp_h,c.id_resp_p,c.proto,c.orig_ip_bytes,c.resp_ip_bytes,g.attack_category,g.attack_subcategory,g.attack_name FROM unsw_nb15.bro_conn as c LEFT JOIN unsw_nb15.ground_truth AS g ON c.ts>=g.start_time AND c.ts<=g.last_time AND c.id_orig_h=g.source_ip AND c.id_resp_h=g.dest_ip AND c.proto=g.protocol")
# enconn_pdf = df_enriched_conn.toPandas()

In [9]:
%%time
# Read the same tab-separated file with pandas (CPU); this frame feeds the
# feature-engineering and plotting cells that follow.
# NOTE(review): the displayed frame carries 'Unnamed: 0' style columns, which
# suggests the file was saved with its index -- consider index_col; confirm.
enconn_pdf = pd.read_csv("./df_enconn_pd.tab", sep='\t')

CPU times: user 6.67 s, sys: 768 ms, total: 7.44 s
Wall time: 7.44 s


In [10]:
# Show the first ten rows of the pandas frame.
enconn_pdf.head(10)

Unnamed: 0.1,Unnamed: 0,ts,id_orig_h,id_orig_p,id_resp_h,id_resp_p,proto,orig_ip_bytes,resp_ip_bytes,attack_category,attack_subcategory,attack_name,orig_ip,resp_ip,attack_color,orig_port,resp_port
0,0,1424247000.0,175.45.176.1,54776,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
1,1,1424247000.0,175.45.176.1,61723,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
2,2,1424247000.0,175.45.176.1,60443,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
3,3,1421930000.0,175.45.176.1,5724,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,5724.0,111.0
4,4,1421930000.0,175.45.176.1,37292,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
5,5,1421930000.0,175.45.176.1,63447,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
6,6,1421931000.0,175.45.176.1,48500,149.171.126.17,111,udp,168,0,no_attack,,,2939006977,2511044113,13,10000.0,111.0
7,7,1421931000.0,175.45.176.1,60588,149.171.126.17,5060,udp,1320,702,no_attack,,,2939006977,2511044113,13,10000.0,5060.0
8,8,1421931000.0,175.45.176.1,54155,149.171.126.17,33107,udp,226,0,no_attack,,,2939006977,2511044113,13,10000.0,10000.0
9,9,1421931000.0,175.45.176.1,20551,149.171.126.17,69,udp,1104,0,no_attack,,,2939006977,2511044113,13,10000.0,69.0


In [11]:
import socket, struct

def ip2long(ip):
    """Convert an IPv4 address string to its unsigned 32-bit integer value.

    Args:
        ip: IPv4 address text in a form accepted by ``socket.inet_aton``
            (for example ``"175.45.176.1"``).

    Returns:
        The address as an int, reading the four packed bytes in network
        (big-endian) byte order.

    Raises:
        OSError: if ``socket.inet_aton`` rejects the string.
    """
    # inet_aton yields the 4 packed address bytes; read them big-endian.
    return int.from_bytes(socket.inet_aton(ip), byteorder="big")

In [12]:
# Derive integer forms of the originator and responder addresses so they can
# be used as numeric plot axes; ip2long is applied to every row.
enconn_pdf['orig_ip'] = enconn_pdf['id_orig_h'].apply(ip2long)
enconn_pdf['resp_ip'] = enconn_pdf['id_resp_h'].apply(ip2long)

In [13]:
# Rows with a missing attack_category are labelled explicitly as 'no_attack'.
enconn_pdf['attack_category'] = enconn_pdf['attack_category'].fillna('no_attack')

In [14]:
# Encode each attack category as an integer code; this column drives the line
# colour in the parallel-coordinates plot further down.
enconn_pdf['attack_color'] = enconn_pdf['attack_category'].astype('category').cat.codes

In [15]:
# df_enconn_pd = df_enconn_pd.drop(['orig_port','resp_port'], axis=1)

In [16]:
# Cap the originator port at 10000: ports above 10000 are all set to 10000,
# ports at or below 10000 are copied through unchanged.
enconn_pdf.loc[enconn_pdf['id_orig_p']>10000, 'orig_port'] = 10000
enconn_pdf.loc[enconn_pdf['id_orig_p']<=10000, 'orig_port'] = enconn_pdf['id_orig_p']

In [17]:
# Cap the responder port at 10000: ports above 10000 are all set to 10000,
# ports at or below 10000 are copied through unchanged.
enconn_pdf.loc[enconn_pdf['id_resp_p']>10000, 'resp_port'] = 10000
enconn_pdf.loc[enconn_pdf['id_resp_p']<=10000, 'resp_port'] = enconn_pdf['id_resp_p']

## Graph Embedding

**[TODO]**
- cuGraph from cuDF
- graph aggregate statistics

## Graph Analytics

**[TODO]**
- Temporal embedding
- PPR (static)
- PPR (dynamic)

## Visualization

**[TODO]**
- Replace parallel coordinates with PyDataViz+GPU (where possible)
- Temporal visualization (currently D3/JS)

Use parallel coordinates to visualize attacks

In [18]:
# Take the first 500000 rows after sorting by timestamp.
subset_edges_plot_df = enconn_pdf.sort_values(by=['ts'])[:500000]

In [19]:
# Keep only rows labelled with an actual attack category.
# NOTE(review): this filters the full enconn_pdf, not the time-sorted
# 500000-row subset assigned to the same name in the previous cell, so that
# earlier assignment is discarded -- confirm which input is intended.
subset_edges_plot_df = enconn_pdf[enconn_pdf['attack_category'] != 'no_attack']

In [22]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import plot

In [23]:
# Parallel-coordinates trace: one axis per (label, column) pair, each spanning
# that column's observed min..max, with lines coloured by attack_color.
data = [
    go.Parcoords(
        line={
            'color': subset_edges_plot_df['attack_color'],
            'colorscale': 'Jet',
            'showscale': True,
        },
        dimensions=[
            {
                'range': [
                    subset_edges_plot_df[column].min(),
                    subset_edges_plot_df[column].max(),
                ],
                'label': axis_label,
                'values': subset_edges_plot_df[column],
            }
            for axis_label, column in (
                ('Time', 'ts'),
                ('Orig IP', 'orig_ip'),
                ('Orig Port', 'orig_port'),
                ('Resp Port', 'resp_port'),
                ('Resp IP', 'resp_ip'),
            )
        ],
    )
]

# Alternative render calls kept for reference:
# plotly.offline.iplot(data, filename = 'unsw-1')
# plotly.offline.iplot(data)
# py.iplot(data, filename = 'unsw-1')

# Last expression of the cell, so the returned output path is displayed.
plot(data, filename = 'unsw-kdd-v1.html')

'unsw-kdd-v1.html'

## Additional Questions / Exploration

**[TODO]**
- Add guided, self-paced questions for workshop
- Example questions/topics include:
  - Adding another data source in addition to `conn` log type
  - Adding additional graph analytics
  - Other viz/plot investigations
  - Clustering and/or UMAP