## FAIR USE POLICY
**Please do not leave your JupyterLab server idle for extended periods of time.** The Jupyter process, active Python kernels, and especially running Spark contexts each claim a minimum amount of cluster resources. These add up and will eventually starve other users of resources. Leaving your environment idle for a few hours (e.g., over lunch) is fine, but letting it idle overnight, or for multiple days in which you are not actively using the cluster, is not. You can kill the server from your SSH session by pressing Ctrl+C repeatedly, or by selecting *File->Shutdown* from the menu.

### Imports

In [1]:
import os
import time
from datetime import datetime, timedelta, date
import pprint

import collections
import numpy as np
import pandas as pd
import subprocess

import pyspark
import pyspark.sql.functions as F 
from pyspark.sql.types import *

# Find Spark
import findspark
findspark.init()

from datetime import datetime, timedelta, date
import datetime as dt
from datetime import timedelta as td
import subprocess
import re
import pyarrow as pa
import copy
from pyspark.sql.types import DateType
from string import digits

import pyspark
from pyspark.sql import *  
from pyspark.sql.functions import *
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import from_unixtime
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import mean as _mean
from pyspark.sql.window import Window as W


from functools import reduce
from pyspark.ml.feature import Tokenizer, StopWordsRemover


import json

#### Create Spark Configuration

In [2]:
# Build the Spark configuration used by this notebook's context.
APP_NAME = "apwg-median-app"

spark_conf = pyspark.SparkConf().setAppName(APP_NAME).setMaster("yarn")
# Every remaining option is a plain key/value pair, applied in one call.
spark_conf.setAll([
    ("spark.submit.deployMode", "client"),
    ("spark.sql.parquet.binaryAsString", "true"),
    ("spark.dynamicAllocation.maxExecutors", "16"),
    ("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp-spark24_2.11:3.0.0"),
    ("spark.sql.debug.maxToStringFields", "1000"),
    ("spark.executor.memory", "7G"),
])

#### Start SparkContext
1. This may take a minute to complete
2. You should not (and cannot) start two Spark contexts. If you accidentally run this cell twice or get stuck somehow, restart your Python kernel from the menu above.
3. Please **stop your Spark context** when idling for extended periods of time (see code at bottom of notebook)

In [3]:
# Log a second-resolution timestamp so the start time is visible in the output.
started_at = datetime.now().replace(microsecond=0)
print("[{}] Starting Spark context.".format(started_at))

# The SparkContext is created from the configuration built above;
# the SQLContext wraps it and is the entry point for reading DataFrames.
sc = pyspark.SparkContext(conf=spark_conf)
sqlc = pyspark.SQLContext(sc)

[2021-05-05 11:21:56] Starting Spark context.


### Your code below

In [4]:
APWG_CLEAN_DATA_CONVERTED_BASE = "PATH TO DATA-RECORDS"

# Inclusive bounds of the period under study.
INTEREST_DATE_START = datetime(2017, 12, 18)
INTEREST_DATE_END = datetime(2019, 8, 16)

# `date_received` is fed to from_unixtime and the result is cast to a date.
parsed_date_col = F.from_unixtime(F.col("date_received")).cast("date")

# between() is inclusive on both ends, i.e. lower <= parsed_date <= upper.
in_interest_range = F.col("parsed_date").between(
    INTEREST_DATE_START.date().isoformat(),
    INTEREST_DATE_END.date().isoformat(),
)

# Read the multi-line JSON records, add the parsed date, and keep only the
# rows inside the date range of interest.
clean_mails_df = (
    sqlc.read.json(APWG_CLEAN_DATA_CONVERTED_BASE, multiLine=True)
    .withColumn("parsed_date", parsed_date_col)
    .filter(in_interest_range)
)

In [5]:
# Base frame for all hypotheses: only mails whose `language` is "english".
original_df = clean_mails_df.filter(F.col("language") == "english")

In [6]:
# Hypothesis 1 uses every English mail; only the date and id are needed.
h1_df = original_df.select('parsed_date', 'id')

In [7]:
# Hypothesis 2: mails whose stop-word-free body contains at least one keyword.
remover = StopWordsRemover(inputCol='body_words', outputCol='words_clean')

word_list = [
    'unsubscribed', 'hack', 'takedown', 'password', 'transparent',
    'attempt', 'redirect', 'impersonate', 'network', 'obsolete', 'illegal', 'damage', 'edit',
    'unauthenticated', 'initial', 'survey', 'collect', 'victim', 'detect', 'recharge', 'test',
    'attachment', 'claim', 'profitable', 'virus', 'fraudulent', 'revalidation', 'link', 'description',
]

# array_intersect takes two array columns, so the Python keyword list is
# lifted into a literal array column first.
list_col = F.array(*map(F.lit, word_list))

h2_df = (
    remover.transform(original_df)
    .drop("body_words")
    # keep a mail when its cleaned words share at least one entry with the keywords
    .filter(F.size(F.array_intersect(F.col("words_clean"), list_col)) > 0)
    .select('parsed_date', 'id')
)

In [8]:
# Hypothesis 3: mails flagged as having attachments (flag is compared to "1").
h3_df = (
    original_df
    .filter(F.col("email_has_attachments") == "1")
    .select('parsed_date', 'id')
)

In [9]:
# Announcement dates of the DDoS attacks, as ISO 'YYYY-MM-DD' strings.
ddos_list = [
    '2018-01-17', '2018-01-27', '2018-02-08', '2018-03-01', '2018-03-06',
    '2018-05-14', '2018-05-24', '2018-07-30', '2019-01-16', '2019-01-31',
    '2019-02-23', '2019-03-22', '2019-04-16', '2019-06-12', '2019-08-08',
]

In [10]:
#partitionBy is used to shuffle data before applying the functions
def hypo_1(df, day, n_days=7, window_days=20):
    """Select the mails on the days closest to ``day``, before and after it.

    Args:
        df: Spark DataFrame with at least the columns ``parsed_date`` and ``id``.
        day: reference date as an ISO ``'YYYY-MM-DD'`` string. It is
            interpolated verbatim into a Spark SQL filter expression.
        n_days: how many distinct ``parsed_date`` values (dates that actually
            occur in ``df``) to keep on each side of ``day``. Defaults to 7.
        window_days: size in days of the search interval on each side of
            ``day``; both interval bounds are exclusive. Defaults to 20.

    Returns:
        A list ``[before_df, after_df]``. Each frame keeps the input rows of
        the selected dates, ordered by ``parsed_date``, and adds a per-date
        row count (``count_before`` / ``count_after``) plus a ``ddos_date``
        column holding ``day``.

    Example usage: df_list = hypo_1(df, '2017-12-18')
    """
    n_days = int(n_days)
    window_days = int(window_days)

    # NOTE: the ranking windows have no partitionBy, so Spark ranks all
    # filtered rows in a single partition.
    newest_first = Window.orderBy(F.desc('parsed_date'))
    oldest_first = Window.orderBy('parsed_date')
    per_date = Window.partitionBy('parsed_date')

    # dense_rank over descending dates: rank 1 is the date closest before `day`.
    h1_df1 = (df.filter(f"parsed_date < '{day}' and parsed_date > '{day}' - interval {window_days} days")
              .withColumn('rn', F.dense_rank().over(newest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_before', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    # dense_rank over ascending dates: rank 1 is the date closest after `day`.
    h1_df2 = (df.filter(f"parsed_date < '{day}' + interval {window_days} days and parsed_date > '{day}'")
              .withColumn('rn', F.dense_rank().over(oldest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_after', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    return [h1_df1, h1_df2]

In [11]:
def hypo_2(df, day, n_days=15, window_days=20):
    """Select the mails on the days closest to ``day``, before and after it.

    Args:
        df: Spark DataFrame with at least the columns ``parsed_date`` and ``id``.
        day: reference date as an ISO ``'YYYY-MM-DD'`` string. It is
            interpolated verbatim into a Spark SQL filter expression.
        n_days: how many distinct ``parsed_date`` values (dates that actually
            occur in ``df``) to keep on each side of ``day``. Defaults to 15.
        window_days: size in days of the search interval on each side of
            ``day``; both interval bounds are exclusive. Defaults to 20.

    Returns:
        A list of 2 dataframes ``[before_df, after_df]``. Each keeps the input
        rows of the selected dates, ordered by ``parsed_date``, and adds a
        per-date row count (``count_before`` / ``count_after``) plus a
        ``ddos_date`` column holding ``day``.

    Example usage: df_list = hypo_2(df, '2017-12-18', 15)
    """
    n_days = int(n_days)
    window_days = int(window_days)

    # NOTE: the ranking windows have no partitionBy, so Spark ranks all
    # filtered rows in a single partition.
    newest_first = Window.orderBy(F.desc('parsed_date'))
    oldest_first = Window.orderBy('parsed_date')
    per_date = Window.partitionBy('parsed_date')

    # dense_rank over descending dates: rank 1 is the date closest before `day`.
    h2_df1 = (df.filter(f"parsed_date < '{day}' and parsed_date > '{day}' - interval {window_days} days")
              .withColumn('rn', F.dense_rank().over(newest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_before', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    # dense_rank over ascending dates: rank 1 is the date closest after `day`.
    h2_df2 = (df.filter(f"parsed_date < '{day}' + interval {window_days} days and parsed_date > '{day}'")
              .withColumn('rn', F.dense_rank().over(oldest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_after', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    return [h2_df1, h2_df2]

In [12]:
def hypo_3(df, day, n_days=15, window_days=20):
    """Select the mails on the days closest to ``day``, before and after it.

    Args:
        df: Spark DataFrame with at least the columns ``parsed_date`` and ``id``.
        day: reference date as an ISO ``'YYYY-MM-DD'`` string. It is
            interpolated verbatim into a Spark SQL filter expression.
        n_days: how many distinct ``parsed_date`` values (dates that actually
            occur in ``df``) to keep on each side of ``day``. Defaults to 15.
        window_days: size in days of the search interval on each side of
            ``day``; both interval bounds are exclusive. Defaults to 20.

    Returns:
        A list of 2 dataframes ``[before_df, after_df]``. Each keeps the input
        rows of the selected dates, ordered by ``parsed_date``, and adds a
        per-date row count (``count_before`` / ``count_after``) plus a
        ``ddos_date`` column holding ``day``.

    Example usage: df_list = hypo_3(df, '2017-12-18', 15)
    """
    n_days = int(n_days)
    window_days = int(window_days)

    # NOTE: the ranking windows have no partitionBy, so Spark ranks all
    # filtered rows in a single partition.
    newest_first = Window.orderBy(F.desc('parsed_date'))
    oldest_first = Window.orderBy('parsed_date')
    per_date = Window.partitionBy('parsed_date')

    # dense_rank over descending dates: rank 1 is the date closest before `day`.
    h3_df1 = (df.filter(f"parsed_date < '{day}' and parsed_date > '{day}' - interval {window_days} days")
              .withColumn('rn', F.dense_rank().over(newest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_before', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    # dense_rank over ascending dates: rank 1 is the date closest after `day`.
    h3_df2 = (df.filter(f"parsed_date < '{day}' + interval {window_days} days and parsed_date > '{day}'")
              .withColumn('rn', F.dense_rank().over(oldest_first))
              .filter(f'rn <= {n_days}')
              .drop('rn')
              .withColumn('count_after', F.count('id').over(per_date))
              .orderBy('parsed_date')
              .withColumn('ddos_date', F.lit(day)))

    return [h3_df1, h3_df2]

In [13]:
# Hypothesis 1: for every DDoS announcement date, summarise the daily mail
# counts of the selected days before and after it (median, mean, std).
# Everything stays lazy: the statistics are plain aggregates of one groupBy,
# so no Spark job runs until the result frames are shown or collected.
statistics_h1_before_list = []
statistics_h1_after_list = []

for d in ddos_list:
    h1_df1, h1_df2 = hypo_1(h1_df, d)

    # The per-date count is repeated on every mail row; keep one row per date.
    df1 = h1_df1.dropDuplicates(['parsed_date', 'count_before'])
    df2 = h1_df2.dropDuplicates(['parsed_date', 'count_after'])

    # One output row per announcement date with columns
    # ddos_date, median_h1_*, mean_h1_*, std_h1_* (std rounded to 3 decimals).
    statistics_h1_before = df1.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_before, 0.5, 10)").alias('median_h1_before'),
        F.avg('count_before').alias('mean_h1_before'),
        F.round(F.stddev('count_before'), 3).alias('std_h1_before'),
    )
    statistics_h1_after = df2.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_after, 0.5, 10)").alias('median_h1_after'),
        F.avg('count_after').alias('mean_h1_after'),
        F.round(F.stddev('count_after'), 3).alias('std_h1_after'),
    )

    statistics_h1_before_list.append(statistics_h1_before)
    statistics_h1_after_list.append(statistics_h1_after)

# Stack the per-date summaries; dates without any mails contribute no row.
statistics_h1_before = reduce(DataFrame.unionAll, statistics_h1_before_list)
statistics_h1_after = reduce(DataFrame.unionAll, statistics_h1_after_list)


In [14]:
# Hypothesis 2: for every DDoS announcement date, summarise the daily counts
# of keyword-matching mails on the selected days before and after it
# (median, mean, std).
# Everything stays lazy: the statistics are plain aggregates of one groupBy,
# so no Spark job runs until the result frames are shown or collected.
statistics_h2_before_list = []
statistics_h2_after_list = []

for d in ddos_list:
    h2_df1, h2_df2 = hypo_2(h2_df, d)

    # The per-date count is repeated on every mail row; keep one row per date.
    df1 = h2_df1.dropDuplicates(['parsed_date', 'count_before'])
    df2 = h2_df2.dropDuplicates(['parsed_date', 'count_after'])

    # One output row per announcement date with columns
    # ddos_date, median_h2_*, mean_h2_*, std_h2_* (std rounded to 3 decimals).
    statistics_h2_before = df1.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_before, 0.5, 10)").alias('median_h2_before'),
        F.avg('count_before').alias('mean_h2_before'),
        F.round(F.stddev('count_before'), 3).alias('std_h2_before'),
    )
    statistics_h2_after = df2.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_after, 0.5, 10)").alias('median_h2_after'),
        F.avg('count_after').alias('mean_h2_after'),
        F.round(F.stddev('count_after'), 3).alias('std_h2_after'),
    )

    statistics_h2_before_list.append(statistics_h2_before)
    statistics_h2_after_list.append(statistics_h2_after)

# Stack the per-date summaries; dates without any mails contribute no row.
statistics_h2_before = reduce(DataFrame.unionAll, statistics_h2_before_list)
statistics_h2_after = reduce(DataFrame.unionAll, statistics_h2_after_list)


In [15]:
# Hypothesis 3: for every DDoS announcement date, summarise the daily counts
# of mails with attachments on the selected days before and after it
# (median, mean, std).
# Everything stays lazy: the statistics are plain aggregates of one groupBy,
# so no Spark job runs until the result frames are shown or collected.
statistics_h3_before_list = []
statistics_h3_after_list = []

for d in ddos_list:
    h3_df1, h3_df2 = hypo_3(h3_df, d)

    # The per-date count is repeated on every mail row; keep one row per date.
    df1 = h3_df1.dropDuplicates(['parsed_date', 'count_before'])
    df2 = h3_df2.dropDuplicates(['parsed_date', 'count_after'])

    # One output row per announcement date with columns
    # ddos_date, median_h3_*, mean_h3_*, std_h3_* (std rounded to 3 decimals).
    statistics_h3_before = df1.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_before, 0.5, 10)").alias('median_h3_before'),
        F.avg('count_before').alias('mean_h3_before'),
        F.round(F.stddev('count_before'), 3).alias('std_h3_before'),
    )
    statistics_h3_after = df2.withColumn('ddos_date', F.lit(d)).groupBy('ddos_date').agg(
        F.expr("percentile_approx(count_after, 0.5, 10)").alias('median_h3_after'),
        F.avg('count_after').alias('mean_h3_after'),
        F.round(F.stddev('count_after'), 3).alias('std_h3_after'),
    )

    statistics_h3_before_list.append(statistics_h3_before)
    statistics_h3_after_list.append(statistics_h3_after)

# Stack the per-date summaries; dates without any mails contribute no row.
statistics_h3_before = reduce(DataFrame.unionAll, statistics_h3_before_list)
statistics_h3_after = reduce(DataFrame.unionAll, statistics_h3_after_list)


In [16]:
# Mark all six summary frames for caching so they are not recomputed each
# time they are displayed.
for _stats_frame in (
    statistics_h1_before,
    statistics_h1_after,
    statistics_h2_before,
    statistics_h2_after,
    statistics_h3_before,
):
    _stats_frame.persist()

# Kept as the cell's last expression so the notebook still echoes this frame.
statistics_h3_after.persist()

DataFrame[ddos_date: string, median_h3_after: bigint, mean_h3_after: double, std_h3_after: double]

In [17]:
# Print each summary table sorted by announcement date (up to 25 rows each).
for stats_frame in [
    statistics_h1_before,
    statistics_h1_after,
    statistics_h2_before,
    statistics_h2_after,
    statistics_h3_before,
    statistics_h3_after,
]:
    stats_frame.orderBy('ddos_date').show(25)

+----------+----------------+------------------+-------------+
| ddos_date|median_h1_before|    mean_h1_before|std_h1_before|
+----------+----------------+------------------+-------------+
|2018-01-17|            1147|1119.7142857142858|       174.14|
|2018-01-27|            1106|1074.4285714285713|      173.223|
|2018-02-08|            1081| 1041.142857142857|      150.503|
|2018-03-01|            1150| 1117.142857142857|       114.08|
|2018-03-06|            1195|1135.4285714285713|      227.619|
|2018-05-14|            1088|            1017.0|      209.015|
|2018-05-24|            1246|1231.7142857142858|      179.391|
|2018-07-30|            1352|1301.4285714285713|      306.922|
|2019-01-16|              48|             649.0|      849.942|
|2019-02-23|              70|             839.5|     1088.237|
|2019-03-22|            1666|1577.6666666666667|     1430.547|
|2019-04-16|            1673|1309.2857142857142|      779.417|
|2019-08-08|             136|             782.0|      9

#### Stop the SparkContext
note: don't run this block unless you actually want to stop your context

In [19]:
# Log a second-resolution timestamp, then shut the Spark context down.
stopped_at = datetime.now().replace(microsecond=0)
print("[{}] Stopping Spark context.".format(stopped_at))
sc.stop()

[2021-05-05 18:21:59] Stopping Spark context.
