# Aggregating and summarizing data into useful reports
## Loading the data

In [1]:
# Location of the compressed KDD Cup data file, relative to the working directory.
data_path = "./kddcup.data.gz"
# NOTE(review): `sc` is not defined in this excerpt — presumably the
# SparkContext supplied by the PySpark session; confirm.
raw_data = sc.textFile(data_path)

## Calculating averages with map and reduce

In [9]:
# Break every raw line into its comma-separated fields.
csv = raw_data.map(lambda line: line.split(","))
# Keep only the records whose field 41 equals the "normal." label.
normal_data = csv.filter(lambda fields: fields[41] == "normal.")
# Field 0 of each remaining record is parsed as an integer duration.
duration = normal_data.map(lambda fields: int(fields[0]))
# Add the durations together pairwise to get the overall total.
total_duration = duration.reduce(lambda running, value: running + value)
total_duration

211895753

In [10]:
# Average duration = summed duration divided by the number of "normal." records.
normal_count = normal_data.count()
total_duration / normal_count

217.82472416710442

## Faster average computation with aggregate

In [14]:
# Build the sum and the record count together in one aggregate call,
# carried as a (sum, count) pair that starts at (0, 0).
duration_count = duration.aggregate(
    (0, 0),
    # Fold a single duration into a partial (sum, count) pair.
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    # Merge two partial (sum, count) pairs into one.
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
duration_sum, record_count = duration_count
duration_sum / record_count

217.82472416710442

## Building pivot tables from key-value paired data points

In [15]:
# Key every parsed record by its label (field 41); the complete field list
# stays as the value of the pair.
kv = csv.keyBy(lambda fields: fields[41])
# Show the first (label, fields) pair.
kv.take(1)

[('normal.',
  ['0',
   'tcp',
   'http',
   'SF',
   '215',
   '45076',
   '0',
   '0',
   '0',
   '0',
   '0',
   '1',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '1',
   '1',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '1.00',
   '0.00',
   '0.00',
   '0',
   '0',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   'normal.'])]

In [17]:
# Pair each record's label (field 41) with its duration (field 0) as a float.
label_and_duration = csv.map(lambda fields: (fields[41], float(fields[0])))
# Sum the durations separately for every label.
kv_duration = label_and_duration.reduceByKey(lambda left, right: left + right)
kv_duration.collect()

[('normal.', 211895753.0),
 ('buffer_overflow.', 2751.0),
 ('loadmodule.', 326.0),
 ('perl.', 124.0),
 ('neptune.', 2.0),
 ('smurf.', 0.0),
 ('guess_passwd.', 144.0),
 ('pod.', 0.0),
 ('teardrop.', 0.0),
 ('portsweep.', 24257982.0),
 ('ipsweep.', 13049.0),
 ('land.', 0.0),
 ('ftp_write.', 259.0),
 ('back.', 284.0),
 ('imap.', 72.0),
 ('satan.', 500.0),
 ('phf.', 18.0),
 ('nmap.', 0.0),
 ('multihop.', 1288.0),
 ('warezmaster.', 301.0),
 ('warezclient.', 627563.0),
 ('spy.', 636.0),
 ('rootkit.', 1008.0)]

In [18]:
# Count how many records carry each label key.
label_counts = kv.countByKey()
label_counts

defaultdict(int,
            {'back.': 2203,
             'buffer_overflow.': 30,
             'ftp_write.': 8,
             'guess_passwd.': 53,
             'imap.': 12,
             'ipsweep.': 12481,
             'land.': 21,
             'loadmodule.': 9,
             'multihop.': 7,
             'neptune.': 1072017,
             'nmap.': 2316,
             'normal.': 972781,
             'perl.': 3,
             'phf.': 4,
             'pod.': 264,
             'portsweep.': 10413,
             'rootkit.': 10,
             'satan.': 15892,
             'smurf.': 2807886,
             'spy.': 2,
             'teardrop.': 979,
             'warezclient.': 1020,
             'warezmaster.': 20})