In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

conf = SparkConf()\
    .setMaster("local[*]")\
    .setAppName("Working with DF")\
    .setExecutorEnv("spark.driver.memory","2g")\
    .setExecutorEnv("spark.executor.memory","4g")

spark = SparkSession\
    .builder\
    .config(conf=conf)\
    .getOrCreate()

sc = spark.sparkContext
sc

In [1]:
##### data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [2]:
rdd = sc.textFile('data\kddcup.data.gz')

In [3]:
print('Number of partitions:', rdd.getNumPartitions())

# spark consider this file as small file

Number of partitions: 1


In [4]:
rdd.count()
# as we see data is not small

4898431

In [4]:
#  let's repartiton it to increase number of partitions
rdd = rdd.repartition(10) # shuffle all data
print(rdd.count())

In [5]:
print('Number of partitions:', rdd.getNumPartitions())
print('Default parallelism:', sc.defaultParallelism)

Number of partitions: 10
Default parallelism: 16


### 1. Get 10 records randomly

In [8]:
rdd.takeSample(withReplacement=False, num=10, seed=42)

# This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver’s memory.

['0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,258,16,1.00,1.00,0.00,0.00,0.06,0.06,0.00,255,16,0.06,0.07,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 '0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,112,9,1.00,1.00,0.00,0.00,0.08,0.06,0.00,255,9,0.04,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,202,2,1.00,1.00,0.00,0.00,0.01,0.05,0.00,255,2,0.01,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 '0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,246,20,1.00,1

### 2. Number of records

In [9]:
rdd.count()

4898431

### 3. Ratio of normal connections

In [11]:
normal_rdd = rdd.filter(lambda line: 'normal' in line)

ratio = normal_rdd.count() / rdd.count()

print(f'The ratio of normal connections is {round(ratio,4)*100} %')

The ratio of normal connections is 19.86 %


### 4. Get the list of labels. 

In [13]:
split_rdd = rdd.map(lambda line: line.split(','))
label_rdd = split_rdd.map(lambda x: x[-1]).distinct()

print(f'Labels: {label_rdd.collect()}')



Labels: ['neptune.', 'loadmodule.', 'warezclient.', 'pod.', 'smurf.', 'nmap.', 'spy.', 'back.', 'teardrop.', 'ipsweep.', 'multihop.', 'phf.', 'ftp_write.', 'guess_passwd.', 'normal.', 'land.', 'satan.', 'imap.', 'portsweep.', 'warezmaster.', 'rootkit.', 'buffer_overflow.', 'perl.']


### 5. Get number of connections per label

In [25]:
rdd_split = rdd.map(lambda line: line.split(','))
rdd_kv = rdd_split.map(lambda x: (x[-1],1))
rdd_rbk = rdd_kv.reduceByKey(lambda x, y: x+y)

print(rdd_rbk.collect())

[('neptune.', 1072017), ('loadmodule.', 9), ('warezclient.', 1020), ('pod.', 264), ('smurf.', 2807886), ('nmap.', 2316), ('spy.', 2), ('back.', 2203), ('teardrop.', 979), ('ipsweep.', 12481), ('multihop.', 7), ('phf.', 4), ('ftp_write.', 8), ('guess_passwd.', 53), ('normal.', 972781), ('land.', 21), ('satan.', 15892), ('imap.', 12), ('portsweep.', 10413), ('warezmaster.', 20), ('rootkit.', 10), ('buffer_overflow.', 30), ('perl.', 3)]


In [37]:
import pandas as pd

label_pivot_df = pd.DataFrame({
    'label': rdd_rbk.keys().collect(),
    'count': rdd_rbk.values().collect()
})

label_pivot_df.sort_values(by=['count'], ascending=False)

Unnamed: 0,label,count
4,smurf.,2807886
0,neptune.,1072017
14,normal.,972781
16,satan.,15892
9,ipsweep.,12481
18,portsweep.,10413
5,nmap.,2316
7,back.,2203
2,warezclient.,1020
8,teardrop.,979


In [32]:
rdd \
    .map(lambda line: line.split(',')) \
    .map(lambda x: (x[-1], 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: x[1], ascending=False) \
    .collect()

[('smurf.', 2807886),
 ('neptune.', 1072017),
 ('normal.', 972781),
 ('satan.', 15892),
 ('ipsweep.', 12481),
 ('portsweep.', 10413),
 ('nmap.', 2316),
 ('back.', 2203),
 ('warezclient.', 1020),
 ('teardrop.', 979),
 ('pod.', 264),
 ('guess_passwd.', 53),
 ('buffer_overflow.', 30),
 ('land.', 21),
 ('warezmaster.', 20),
 ('imap.', 12),
 ('rootkit.', 10),
 ('loadmodule.', 9),
 ('ftp_write.', 8),
 ('multihop.', 7),
 ('phf.', 4),
 ('perl.', 3),
 ('spy.', 2)]

### 6. Get the connection type with successful `root_shell` connections to servers, where the number of data bytes from source (`src_bytes`) is 500 times more than from server (`dst_bytes`)

In [46]:
rdd.map(lambda line: line.split(','))\
    .filter(lambda x: x[13] == '1')\
    .map(lambda x: (x[1],x[4],x[5]))\
    .filter(lambda x: int(x[2]) > int(x[1]) * 500)\
    .collect()

[('tcp', '353', '759161'),
 ('tcp', '433', '1524348'),
 ('tcp', '296', '507534'),
 ('tcp', '296', '507534'),
 ('tcp', '246', '866032'),
 ('tcp', '317', '394616'),
 ('tcp', '262', '744605'),
 ('tcp', '173', '744605'),
 ('tcp', '224', '2776333'),
 ('tcp', '262', '744605'),
 ('tcp', '0', '2072'),
 ('tcp', '351', '759161'),
 ('tcp', '1794', '3851730'),
 ('tcp', '465', '320362'),
 ('tcp', '0', '2072'),
 ('tcp', '0', '2072'),
 ('tcp', '296', '507534'),
 ('tcp', '266', '507534'),
 ('tcp', '255', '574784'),
 ('tcp', '0', '2072')]

### 7.  Get the list of `Protocols` that are `normal` and `vulnerable to attacks`, where there is NOT `guest login` to the destination addresses

In [7]:
normal_protocols_rdd = \
    rdd\
        .map(lambda line: line.split(','))\
        .filter(lambda line: 'normal' in line[-1] and line[21] != '1')\
        .map(lambda line: (line[1], 1))\
        .reduceByKey(lambda x, y: x + y)

attack_protocols_rdd = \
    rdd\
        .map(lambda line: line.split(','))\
        .filter(lambda line: 'normal' not in line[-1] and line[21] != '1')\
        .map(lambda line: (line[1], 1))\
        .reduceByKey(lambda x, y: x + y)

import pandas as pd

normal_protocols_df = pd.DataFrame({
    'label': normal_protocols_rdd.keys().collect(),
    'state': 'normal',
    'count': normal_protocols_rdd.values().collect()
})

attack_protocols_df = pd.DataFrame({
    'label': attack_protocols_rdd.keys().collect(),
    'state': 'attack',
    'count': attack_protocols_rdd.values().collect()
})

normal_and_attacked_merged_df = pd.concat([normal_protocols_df, attack_protocols_df])
normal_and_attacked_merged_df.sort_values(by=['label','count'], ascending=False)

Unnamed: 0,label,state,count
1,udp,normal,191348
1,udp,attack,2940
2,tcp,attack,1101613
2,tcp,normal,764894
0,icmp,attack,2820782
0,icmp,normal,12763


## 8. Get a summary statistics for the sum of `tcp` connections to the same destination IP address (`protocol_type` and `dst_host_count` features)

In [14]:
from pyspark.mllib.stat import Statistics
from math import sqrt

stat_rdd = \
    rdd \
        .map(lambda x: x.split(','))\
        .filter(lambda x: x[1] == 'tcp')\
        .map(lambda x: [int(x[31])])

summary = Statistics.colStats(stat_rdd)

tcp_mean  = round(float(summary.mean()),2)
tcp_count = round(float(summary.count()),2)
tcp_min   = round(float(summary.min()),2)
tcp_max   = round(float(summary.max()),2)
tcp_std   = round(float(sqrt(summary.variance())))

print(tcp_mean, tcp_count, tcp_min, tcp_max, tcp_std)


201.75 1870598.0 0.0 255.0 91


### 9. Filter the number of `icmp`-based attacks for each `service`

In [19]:
rdd \
    .map(lambda x: x.split(','))\
    .filter(lambda x: x[1] == 'icmp' and 'normal' not in x[-1])\
    .map(lambda x: (x[2], 1))\
    .reduceByKey(lambda x, y: x + y)\
    .sortBy(lambda x: x[1])\
    .collect()


[('urp_i', 3), ('tim_i', 5), ('eco_i', 12570), ('ecr_i', 2808204)]