### Task 1a: Load the data... without leaking GPU memory

#### 1a. Task

In [None]:
%%time

gdf = cudf.read_csv('./conn5M.log')

print(gdf.tail(3))

After running `gdf = None`, about 600MB of GPU memory should have been freed (you can check with `nvidia-smi`).

### Task 1b: Load 500MB

#### 1b. Task

In [None]:
%%time

gdf = cudf.read_csv('./conn1K.log', sep='\t')

print(gdf.head(3))

gdf = None

### Task 1c: Format & load 2.6GB

#### 1c. Task

In [None]:
%%time

gdf = None
! nvidia-smi


gdf = cudf.read_csv('./conn1K.log',  sep='\t',
    names=cols,
    dtypes=dtypes,
    usecols=cols_subset,
    na_values=['-', '-','(empty)'])


! nvidia-smi
print(gdf.dtypes)
print(gdf.head(3))
gdf = None

* Loading all columns with native types: the 2.6 GB file takes about 3.4 GB of GPU memory
* Loading just the column subset: about 2.4 GB of GPU memory

## Task 2. Analytics & Wrangling

### Task 2a: Column manipulation

#### 2a. Task

In [None]:
%%time

### id.resp_h unique value count

gdf = cudf.read_csv('./conn.log', 
              sep='\t', 
              names=cols,
              dtypes=dtypes,
              usecols=['id.resp_h'],
              na_values=['-', '-','(empty)'])

unique_resp_ips = gdf['id.resp_h'].unique()

print('# unique', len(unique_resp_ips))

print(unique_resp_ips[:10])

unique_resp_ips = None
gdf = None

**Advanced**

In [None]:
%%time

### col orig_bytes max

gdf = cudf.read_csv('./conn.log', 
              sep='\t', 
              names=cols,
              dtypes=dtypes,
              usecols=['orig_bytes'],
              na_values=['-', '-','(empty)'])

mx = gdf['orig_bytes'].max()

print('max orig_bytes', mx)

gdf = None

### Task 2b: Group by & column summaries

#### 2b. Task

In [None]:
%%time

gdf = cudf.read_csv('./conn.log', 
              sep='\t', 
              names=cols,
              dtypes=dtypes,
              usecols=cols_subset,
              na_values=['-', '-','(empty)'])

out = gdf.groupby(['id.resp_h', 'id.orig_h'])\
    .agg({
        'ts': ['count', 'min', 'max', 'mean'],
        'uid': 'nunique',
        'id.resp_p': ['min', 'max', 'nunique'],
        'proto': ['nunique'],
        'duration': ['min', 'max', 'mean', 'sum'],
        'orig_bytes': ['min', 'max', 'mean', 'sum'],
        'resp_bytes': ['min', 'max', 'mean', 'sum'],
    }).reset_index()


gdf = None
print(out.shape)
print(out.dtypes)
print(out.head(3))
out = None

**Advanced**

In [None]:
%%time

gdf = cudf.read_csv('./conn1K.log', 
              sep='\t', 
              names=cols,
              dtypes=dtypes,
              usecols=cols_subset,
              na_values=['-', '-','(empty)'])


gdf = gdf[ gdf['id.resp_p'] == 22 ]


out = gdf.groupby(['id.resp_h', 'id.orig_h'])\
    .agg({
        'duration': ['max'],
        'resp_bytes': ['sum']
    }).reset_index()


gdf = None
print('longest: ',
      out.sort_values(by='duration', ascending=False).head(1))
print('biggest: ',
      out.sort_values(by='resp_bytes', ascending=False).head(1))
out = None

## Task 3: Visualize!

### Task 3b: Map SSH activity

#### 3b. Task

In [None]:
def compute_groupby2(file='./conn1K.log'):
    """Summarize port-22 connections per (responder, originator) host pair.

    Reads the tab-separated log at ``file`` with cuDF, keeps the rows whose
    ``id.resp_p`` equals 22, groups them by ``id.resp_h`` / ``id.orig_h`` and
    computes per-column summary statistics.

    Relies on the notebook-level names ``cols``, ``dtypes`` and
    ``cols_subset`` (defined elsewhere) as well as ``cudf`` and ``pd``.

    Parameters
    ----------
    file : str
        Path of the log file to load. Defaults to ``'./conn1K.log'``.

    Returns
    -------
    cudf.DataFrame
        One row per host pair. The group keys keep their names
        (``id.resp_h``, ``id.orig_h``); every statistic column is named
        ``<column>_<stat>`` (for example ``duration_max``). ``ts_min``,
        ``ts_max`` and ``ts_mean`` are converted to datetimes.
    """
    # The read_csv keyword for column types is `dtype` (singular); the
    # previous `dtypes=` spelling is not a read_csv parameter.
    gdf = cudf.read_csv(
        file,
        sep='\t',
        names=cols,
        dtype=dtypes,
        usecols=cols_subset,
        # Placeholders the log uses for missing values ('-' was listed twice).
        na_values=['-', '(empty)'],
    )

    # Keep only connections whose responder port is 22.
    gdf = gdf[gdf['id.resp_p'] == 22]

    out = (
        gdf.groupby(['id.resp_h', 'id.orig_h'])
        .agg({
            'ts': ['count', 'min', 'max', 'mean'],
            'uid': 'nunique',
            'id.resp_p': ['min', 'max', 'nunique'],
            'proto': ['nunique'],
            'duration': ['min', 'max', 'mean', 'sum'],
            'orig_bytes': ['min', 'max', 'mean', 'sum'],
            'resp_bytes': ['min', 'max', 'mean', 'sum'],
        })
        .reset_index()
    )

    ########### Data cleaning: normal column names and times as actual timestamps

    # Flatten the two-level labels into (col, stat) tuples, then join each
    # pair as 'col_stat'.
    out.columns = out.columns.to_flat_index()
    out.columns = ['%s_%s' % c for c in out.columns]

    # The group keys have an empty stat level, so the join above leaves a
    # trailing underscore on them; restore the original names.
    out = out.rename(columns={
        'id.resp_h_': 'id.resp_h',
        'id.orig_h_': 'id.orig_h',
    })

    # pd.to_datetime treats bare numbers as nanoseconds by default, hence the
    # 1e9 scale factor (presumably `ts` is epoch seconds -- TODO confirm).
    # The conversion round-trips through pandas on the host.
    for ts_col in ('ts_min', 'ts_max', 'ts_mean'):
        as_nanos = (out[ts_col] * 1000000000).to_pandas()
        out[ts_col] = cudf.Series(pd.to_datetime(as_nanos))

    return out