## Configure your environment variables

```bash
# Program PySpark should launch as the driver front-end (here: Jupyter).
PYSPARK_DRIVER_PYTHON=/path/to/anaconda3/bin/jupyter
# Arguments passed to that program: start the notebook server, do not open a
# browser, use ip='*' (the inner single quotes are part of the value) and
# serve on port 8880.
PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
# Export both so child processes started from this shell can see them.
export PYSPARK_DRIVER_PYTHON PYSPARK_DRIVER_PYTHON_OPTS
```

# Sampling/filtering RDDs to pick out relevant data points

In [10]:
# Create an RDD with one element per text line of ./kddcup.data.gz.
# NOTE(review): `sc` is not defined in this excerpt — presumably the
# SparkContext provided by the PySpark session; confirm before running.
raw_data = sc.textFile("./kddcup.data.gz")

In [11]:
from time import time

In [12]:
# Draw a sample without replacement using fraction 0.1; the fixed seed (42)
# makes the draw repeatable.
sampled = raw_data.sample(False, 0.1, 42)
# Split each line on commas and keep the records that have a "normal." field.
# After split(",") every record is a list, so `in` is an exact-element test,
# not a substring test: the label is written with a trailing dot, and the bare
# string "normal" would never match any field.
contains_normal_sample = sampled.map(lambda x: x.split(",")).filter(lambda x: "normal." in x)

In [13]:
# Measure how long the count() call on the sampled, filtered RDD takes.
# The clock is read immediately before and after the call, so the order of
# these three statements must not change.
t0 = time()
num_sampled = contains_normal_sample.count()
# Elapsed time in seconds (time() returns seconds since the epoch).
duration = time() - t0

In [14]:
duration

23.724565505981445

In [15]:
# Same filter as on the sample, but applied to the full dataset so the two
# timings can be compared.
# After split(",") every record is a list, so `in` is an exact-element test;
# the label is written with a trailing dot, hence "normal." rather than
# "normal" (which would match no field at all).
contains_normal = raw_data.map(lambda x: x.split(",")).filter(lambda x: "normal." in x)
# Time only the count() call: the clock is read right before and right after.
t0 = time()
# NOTE: this reuses the name `num_sampled`, overwriting any earlier value.
num_sampled = contains_normal.count()
duration = time() - t0

In [16]:
duration

36.51565098762512

In [18]:
# Fetch 10 randomly chosen lines (no replacement, seed 42) as a plain Python
# list, then filter and split them locally instead of through RDD operations.
data_in_memory = raw_data.takeSample(withReplacement=False, num=10, seed=42)
# Each element is still a raw string here, so `in` is a substring test.
contains_normal_py = [
    row.split(",")
    for row in data_in_memory
    if "normal" in row
]

In [19]:
len(contains_normal_py)

1

In [20]:
# Keep the sampled lines whose raw text contains "normal." (substring test on
# the unsplit line).
normal_sample = sampled.filter(lambda record: "normal." in record)

In [21]:
# Everything in the sample that is not in normal_sample, i.e. the sampled
# lines that do not contain "normal.".
non_normal_sample = sampled.subtract(normal_sample)

In [22]:
sampled.count()

490705

In [23]:
normal_sample.count()

97404

In [24]:
non_normal_sample.count()

393301

In [25]:
# Distinct values found in the comma-separated field at index 1 of the sample.
feature_1 = (
    sampled
    .map(lambda row: row.split(",")[1])
    .distinct()
)

In [26]:
# Distinct values found in the comma-separated field at index 2 of the sample.
feature_2 = (
    sampled
    .map(lambda row: row.split(",")[2])
    .distinct()
)

In [27]:
# Bring the distinct values of each field back as ordinary Python lists so
# they can be inspected directly in the notebook.
f1 = feature_1.collect()
f2 = feature_2.collect()

In [28]:
f1

['tcp', 'udp', 'icmp']

In [29]:
f2

['http',
 'finger',
 'auth',
 'domain_u',
 'smtp',
 'ftp',
 'telnet',
 'eco_i',
 'ntp_u',
 'ecr_i',
 'other',
 'private',
 'pop_3',
 'ftp_data',
 'daytime',
 'remote_job',
 'supdup',
 'name',
 'ssh',
 'domain',
 'gopher',
 'time',
 'rje',
 'ctf',
 'mtp',
 'X11',
 'urp_i',
 'pm_dump',
 'IRC',
 'exec',
 'bgp',
 'nnsp',
 'iso_tsap',
 'http_443',
 'login',
 'shell',
 'printer',
 'efs',
 'courier',
 'uucp',
 'kshell',
 'klogin',
 'whois',
 'echo',
 'discard',
 'systat',
 'netstat',
 'hostnames',
 'csnet_ns',
 'pop_2',
 'sunrpc',
 'uucp_path',
 'nntp',
 'netbios_ns',
 'netbios_ssn',
 'netbios_dgm',
 'imap4',
 'sql_net',
 'vmnet',
 'link',
 'Z39_50',
 'ldap',
 'urh_i',
 'tftp_u',
 'red_i',
 'tim_i']

In [31]:
# Number of (feature_1, feature_2) combinations. count() returns the size of
# the cartesian product without first materialising every pair in a Python
# list, which is all the previous len(...collect()) was used for.
feature_1.cartesian(feature_2).count()

198