# Malware Detection in Network Traffic Data

## Imports and spark setup

In [1]:
import pyspark
from tqdm import tqdm
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SQLContext, SparkSession

spark = SparkSession.builder.appName('Malware').config("spark.executor.memory", "8g").getOrCreate()

#sc = spark.sparkContext
#sqlsc = SQLContext(spark)

import os
os.environ['HADOOP_HOME'] = 'C:/dummy/hadoop_home'

In [2]:
print(spark.sparkContext.getConf().getAll())

[('spark.executor.id', 'driver'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.driver.port', '54529'), ('spark.app.name', 'Malware'), ('spark.rdd.compress', 'True'), ('spark.driver.host', '192.1

## Presentation

In [3]:
data = r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv"

In [4]:
df = spark.read.option("escape","\"").option("delimiter", "|").csv(data, header='true', inferSchema='true')

In [5]:
rows, cols = df.count(), len(df.columns)
rows, cols

(1008748, 23)

In [6]:
df.printSchema()

root
 |-- ts: double (nullable = true)
 |-- uid: string (nullable = true)
 |-- id.orig_h: string (nullable = true)
 |-- id.orig_p: double (nullable = true)
 |-- id.resp_h: string (nullable = true)
 |-- id.resp_p: double (nullable = true)
 |-- proto: string (nullable = true)
 |-- service: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- orig_bytes: string (nullable = true)
 |-- resp_bytes: string (nullable = true)
 |-- conn_state: string (nullable = true)
 |-- local_orig: string (nullable = true)
 |-- local_resp: string (nullable = true)
 |-- missed_bytes: double (nullable = true)
 |-- history: string (nullable = true)
 |-- orig_pkts: double (nullable = true)
 |-- orig_ip_bytes: double (nullable = true)
 |-- resp_pkts: double (nullable = true)
 |-- resp_ip_bytes: double (nullable = true)
 |-- tunnel_parents: string (nullable = true)
 |-- label: string (nullable = true)
 |-- detailed-label: string (nullable = true)



In [7]:
df.show(5)

+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|                 ts|               uid|      id.orig_h|id.orig_p|      id.resp_h|id.resp_p|proto|service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|    label|      detailed-label|
+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|1.525879831015811E9|CUmrqr4svHuSXJy5z7|192.168.100.103|  51524.0| 65.127.233.163|     23.0|  tcp|      -|2.999051|         0|         0|     

## Dataset concatenation / data assesment / schema corrections [preprocessing]

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Define your custom schema
custom_schema = StructType([
    StructField("ts", DoubleType(), True),
    StructField("uid", StringType(), True),
    StructField("id.orig_h", StringType(), True),
    StructField("id.orig_p", StringType(), True),
    StructField("id.resp_h", StringType(), True),
    StructField("id.resp_p", StringType(), True),
    StructField("proto", StringType(), True),
    StructField("service", StringType(), True),
    StructField("duration", DoubleType(), True),
    StructField("orig_bytes", DoubleType(), True),
    StructField("resp_bytes", DoubleType(), True),
    StructField("conn_state", StringType(), True),
    StructField("local_orig", StringType(), True),
    StructField("local_resp", StringType(), True),
    StructField("missed_bytes", DoubleType(), True),
    StructField("history", StringType(), True),
    StructField("orig_pkts", DoubleType(), True),
    StructField("orig_ip_bytes", DoubleType(), True),
    StructField("resp_pkts", DoubleType(), True),
    StructField("resp_ip_bytes", DoubleType(), True),
    StructField("tunnel_parents", StringType(), True),
    StructField("label", StringType(), True),
    StructField("detailed-label", StringType(), True),
])

# List of file paths
file_paths = [
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-9-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-48-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv",
]

# Initialize an empty DataFrame with the custom schema
df = spark.createDataFrame(spark.sparkContext.emptyRDD(), custom_schema)

# Use tqdm for progress bar
for file_path in tqdm(file_paths, desc="Reading files", unit="file"):
    df_temp = spark.read.option("escape", "\"").option("delimiter", "|").csv(file_path, header=True, schema=custom_schema)
    df = df.union(df_temp)

# Show the first 5 rows and print the schema
df.show(5)
df.printSchema()


Reading files: 100%|██████████| 11/11 [00:00<00:00, 46.27file/s]


+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|                 ts|               uid|      id.orig_h|id.orig_p|      id.resp_h|id.resp_p|proto|service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|    label|      detailed-label|
+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|1.525879831015811E9|CUmrqr4svHuSXJy5z7|192.168.100.103|    51524| 65.127.233.163|       23|  tcp|      -|2.999051|       0.0|       0.0|     

In [9]:
#replacing dots with underscores in column names to avoid errors
df = df.toDF(*(c.replace('.', '_') for c in df.columns))

In [10]:
#looking for the coluns that have the '-' value to later work on them
from pyspark.sql.functions import col

columns_with_dash = [col_name for col_name in df.columns if df.filter(col(col_name) == "-").count() > 0]
print("Columns with '-' values:", columns_with_dash)

Columns with '-' values: ['service', 'local_orig', 'local_resp', 'history', 'tunnel_parents', 'detailed-label']


In [11]:
from pyspark.sql.functions import col, when
from pyspark.sql import SparkSession

def replace_dash_with_nan(df, columns):

    result_df = df
    for column in columns:
        result_df = result_df.withColumn(column, when(col(column) == '-', None).otherwise(col(column)))

    return result_df

df = replace_dash_with_nan(df, columns_with_dash)

In [12]:
columns_with_dash = [col_name for col_name in df.columns if df.filter(col(col_name) == "-").count() > 0]
print("Columns with '-' values:", columns_with_dash)

Columns with '-' values: []


In [13]:
from pyspark.sql.functions import isnan, when, count, col, isnull
missing = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).show()

+---+---+---------+---------+---------+---------+-----+--------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+-----+--------------+
| ts|uid|id_orig_h|id_orig_p|id_resp_h|id_resp_p|proto| service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|label|detailed-label|
+---+---+---------+---------+---------+---------+-----+--------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+-----+--------------+
|  0|  0|        0|        0|        0|        0|    0|24982603|15265888|  15265888|  15265888|         0|  25000600|  25000600|           0|  25116|        0|            0|        0|            0|      25000600|    0|      17951931|
+---+---+---------+---------+---------+---------+-----+--------+

In [14]:
from pyspark.sql.functions import from_unixtime, col

df = df.withColumn("formatted_ts", from_unixtime("ts").cast("timestamp"))
df.show(5)

+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+-------------------+
|                 ts|               uid|      id_orig_h|id_orig_p|      id_resp_h|id_resp_p|proto|service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|    label|      detailed-label|       formatted_ts|
+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+-------------------+
|1.525879831015811E9|CUmrqr4svHuSXJy5z7|192.168.100.103|    51524| 65.127.233.163|

## Data understanding

### dimension and content analysis

In [15]:
rows, cols = df.count(), len(df.columns)
print(f'Dimension of the Dataframe is: {(rows,cols)}')

Dimension of the Dataframe is: (25000600, 24)


In [16]:
num_cols = [item[0] for item in df.dtypes if item[1] != 'string']
print('Le colonne numeriche sono {}'.format(len(num_cols)))
print(num_cols)

Le colonne numeriche sono 10
['ts', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'formatted_ts']


In [17]:
non_num_cols = [item[0] for item in df.dtypes if item[1] == 'string']
print('Le colonne non numeriche sono {}'.format(len(non_num_cols)))
print(non_num_cols)

Le colonne non numeriche sono 14
['uid', 'id_orig_h', 'id_orig_p', 'id_resp_h', 'id_resp_p', 'proto', 'service', 'conn_state', 'local_orig', 'local_resp', 'history', 'tunnel_parents', 'label', 'detailed-label']


### Check for missing labels for each file to understandand labelling consistency

In [18]:
from pyspark.sql.functions import isnan, when, count, col, isnull


def check_nan(spark, column_to_check="", label_only=False):
    # List of file numbers
    file_numbers = [1, 3, 9, 20, 21, 34, 35, 42, 44, 48, 60]
    
    for number in tqdm(file_numbers, desc="Processing files", unit="file"):
    
        file_path = r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-{}-1conn.log.labeled.csv".format(number)
        df = spark.read.option("escape", "\"").option("delimiter", "|").csv(file_path, header='true', inferSchema='true')
        df = df.toDF(*(c.replace('.', '_') for c in df.columns))

        print("", flush=True)
        # Check for missing values
        if label_only:
            missing = df.select([count(when(isnull("detailed-label"), "detailed-label")).alias("missing_count")])
            print("Missing values in file {}: ".format(number))
            missing.select("missing_count").show()

        elif column_to_check:
            missing = df.select([count(when(isnull(column_to_check), column_to_check)).alias("missing_count")])
            print("Missing values in file {}: ".format(number))
            missing.select("missing_count").show()

        elif column_to_check == "all":
            missing = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])
            print("Missing values in file {}: ".format(number))
            missing.show()

        else:
            print("Please enter a valid column name or enter True to check the label column only")

# Check for missing values in the label column only
check_nan(spark, label_only=True)


Processing files:   0%|          | 0/11 [00:00<?, ?file/s]


Missing values in file 1: 


Processing files:   9%|▉         | 1/11 [00:01<00:17,  1.78s/file]

+-------------+
|missing_count|
+-------------+
|            0|
+-------------+




Processing files:  18%|█▊        | 2/11 [00:02<00:09,  1.09s/file]

Missing values in file 3: 
+-------------+
|missing_count|
+-------------+
|            0|
+-------------+


Missing values in file 9: 


Processing files:  27%|██▋       | 3/11 [00:11<00:38,  4.86s/file]

+-------------+
|missing_count|
+-------------+
|            0|
+-------------+


Missing values in file 20: 


Processing files:  36%|███▋      | 4/11 [00:11<00:21,  3.04s/file]

+-------------+
|missing_count|
+-------------+
|            0|
+-------------+


Missing values in file 21: 


Processing files:  45%|████▌     | 5/11 [00:12<00:12,  2.03s/file]

+-------------+
|missing_count|
+-------------+
|            0|
+-------------+




Processing files:  55%|█████▍    | 6/11 [00:12<00:07,  1.45s/file]

Missing values in file 34: 
+-------------+
|missing_count|
+-------------+
|        21222|
+-------------+


Missing values in file 35: 


Processing files:  64%|██████▎   | 7/11 [00:26<00:22,  5.58s/file]

+-------------+
|missing_count|
+-------------+
|      2185386|
+-------------+




Processing files:  73%|███████▎  | 8/11 [00:26<00:11,  3.87s/file]

Missing values in file 42: 
+-------------+
|missing_count|
+-------------+
|            3|
+-------------+




Processing files:  82%|████████▏ | 9/11 [00:27<00:05,  2.72s/file]

Missing values in file 44: 
+-------------+
|missing_count|
+-------------+
|           15|
+-------------+


Missing values in file 48: 


Processing files:  91%|█████████ | 10/11 [00:32<00:03,  3.42s/file]

+-------------+
|missing_count|
+-------------+
|      3388871|
+-------------+


Missing values in file 60: 


Processing files: 100%|██████████| 11/11 [00:36<00:00,  3.35s/file]

+-------------+
|missing_count|
+-------------+
|      3578457|
+-------------+






### Check for distinct values and raw statistics

In [19]:
from pyspark.sql.functions import col, countDistinct

non_num_df = df.select(non_num_cols)

def count_distinct_values(df):
    result = {}
    columns = df.columns

    # Use tqdm to create a progress bar
    for column in tqdm(columns, desc="Counting Distinct Values", unit="column"):
        distinct_count = df.select(column).agg(countDistinct(column)).collect()[0][0]
        result[column] = distinct_count

    return result

distinct_counts = count_distinct_values(non_num_df)

for column, count in distinct_counts.items():
    print(f"Column '{column}' has {count} distinct values.")

Counting Distinct Values:   0%|          | 0/14 [00:00<?, ?column/s]

Counting Distinct Values: 100%|██████████| 14/14 [06:35<00:00, 28.27s/column]

Column 'uid' has 25000600 distinct values.
Column 'id_orig_h' has 21442 distinct values.
Column 'id_orig_p' has 102304 distinct values.
Column 'id_resp_h' has 11654577 distinct values.
Column 'id_resp_p' has 75872 distinct values.
Column 'proto' has 3 distinct values.
Column 'service' has 6 distinct values.
Column 'conn_state' has 13 distinct values.
Column 'local_orig' has 0 distinct values.
Column 'local_resp' has 0 distinct values.
Column 'history' has 263 distinct values.
Column 'tunnel_parents' has 0 distinct values.
Column 'label' has 7 distinct values.
Column 'detailed-label' has 6 distinct values.





In [20]:
# Use the describe function to get statistical summary

summary = df.describe()
summary.show()


+-------+--------------------+------------------+------------+------------------+-----------+------------------+--------+-------+-----------------+--------------------+--------------------+----------+----------+----------+-----------------+--------+------------------+-----------------+--------------------+------------------+--------------+--------------------+--------------+
|summary|                  ts|               uid|   id_orig_h|         id_orig_p|  id_resp_h|         id_resp_p|   proto|service|         duration|          orig_bytes|          resp_bytes|conn_state|local_orig|local_resp|     missed_bytes| history|         orig_pkts|    orig_ip_bytes|           resp_pkts|     resp_ip_bytes|tunnel_parents|               label|detailed-label|
+-------+--------------------+------------------+------------+------------------+-----------+------------------+--------+-------+-----------------+--------------------+--------------------+----------+----------+----------+-----------------+----

In [21]:
# num_summary = summary.select(*num_cols)
num_summary = summary.select(*(["summary"] + num_cols))
# non_num_summary = summary.select(*non_num_cols)
non_num_summary = summary.select(*(["summary"] + non_num_cols))

print("Summary of numeric columns:")
num_summary.show()

print("Summary of non-numeric columns:")
non_num_summary.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `formatted_ts` cannot be resolved. Did you mean one of the following? [`orig_pkts`, `duration`, `label`, `missed_bytes`, `orig_bytes`].;
'Project [summary#22825, ts#22459, duration#22467, orig_bytes#22468, resp_bytes#22469, missed_bytes#22473, orig_pkts#22475, orig_ip_bytes#22476, resp_pkts#22477, resp_ip_bytes#22478, 'formatted_ts]
+- Project [summary#22825, element_at(ts#22348, summary#22825, None, false) AS ts#22459, element_at(uid#22353, summary#22825, None, false) AS uid#22460, element_at(id_orig_h#22358, summary#22825, None, false) AS id_orig_h#22461, element_at(id_orig_p#22363, summary#22825, None, false) AS id_orig_p#22462, element_at(id_resp_h#22368, summary#22825, None, false) AS id_resp_h#22463, element_at(id_resp_p#22373, summary#22825, None, false) AS id_resp_p#22464, element_at(proto#22378, summary#22825, None, false) AS proto#22465, element_at(service#22383, summary#22825, None, false) AS service#22466, element_at(duration#22388, summary#22825, None, false) AS duration#22467, element_at(orig_bytes#22393, summary#22825, None, false) AS orig_bytes#22468, element_at(resp_bytes#22398, summary#22825, None, false) AS resp_bytes#22469, element_at(conn_state#22403, summary#22825, None, false) AS conn_state#22470, element_at(local_orig#22408, summary#22825, None, false) AS local_orig#22471, element_at(local_resp#22413, summary#22825, None, false) AS local_resp#22472, element_at(missed_bytes#22418, summary#22825, None, false) AS missed_bytes#22473, element_at(history#22423, summary#22825, None, false) AS history#22474, element_at(orig_pkts#22428, summary#22825, None, false) AS orig_pkts#22475, element_at(orig_ip_bytes#22433, summary#22825, None, false) AS orig_ip_bytes#22476, element_at(resp_pkts#22438, summary#22825, None, false) AS resp_pkts#22477, element_at(resp_ip_bytes#22443, summary#22825, None, false) AS resp_ip_bytes#22478, element_at(tunnel_parents#22448, summary#22825, None, false) AS tunnel_parents#22479, element_at(label#22453, summary#22825, None, false) AS label#22480, element_at(detailed-label#22458, summary#22825, None, false) AS detailed-label#22481]
   +- Project [ts#22348, uid#22353, id_orig_h#22358, id_orig_p#22363, id_resp_h#22368, id_resp_p#22373, proto#22378, service#22383, duration#22388, orig_bytes#22393, resp_bytes#22398, conn_state#22403, local_orig#22408, local_resp#22413, missed_bytes#22418, history#22423, orig_pkts#22428, orig_ip_bytes#22433, resp_pkts#22438, resp_ip_bytes#22443, tunnel_parents#22448, label#22453, detailed-label#22458, summary#22825]
      +- Generate explode([count,mean,stddev,min,max]), false, [summary#22825]
         +- Aggregate [map(cast(count as string), cast(count(ts#1612) as string), cast(mean as string), cast(avg(ts#1612) as string), cast(stddev as string), cast(stddev(ts#1612) as string), cast(min as string), cast(min(ts#1612) as string), cast(max as string), cast(max(ts#1612) as string)) AS ts#22348, map(cast(count as string), cast(count(uid#1613) as string), cast(mean as string), cast(avg(try_cast(uid#1613 as double)) as string), cast(stddev as string), cast(stddev(try_cast(uid#1613 as double)) as string), cast(min as string), cast(min(uid#1613) as string), cast(max as string), cast(max(uid#1613) as string)) AS uid#22353, map(cast(count as string), cast(count(id_orig_h#1614) as string), cast(mean as string), cast(avg(try_cast(id_orig_h#1614 as double)) as string), cast(stddev as string), cast(stddev(try_cast(id_orig_h#1614 as double)) as string), cast(min as string), cast(min(id_orig_h#1614) as string), cast(max as string), cast(max(id_orig_h#1614) as string)) AS id_orig_h#22358, map(cast(count as string), cast(count(id_orig_p#1615) as string), cast(mean as string), cast(avg(try_cast(id_orig_p#1615 as double)) as string), cast(stddev as string), cast(stddev(try_cast(id_orig_p#1615 as double)) as string), cast(min as string), cast(min(id_orig_p#1615) as string), cast(max as string), cast(max(id_orig_p#1615) as string)) AS id_orig_p#22363, map(cast(count as string), cast(count(id_resp_h#1616) as string), cast(mean as string), cast(avg(try_cast(id_resp_h#1616 as double)) as string), cast(stddev as string), cast(stddev(try_cast(id_resp_h#1616 as double)) as string), cast(min as string), cast(min(id_resp_h#1616) as string), cast(max as string), cast(max(id_resp_h#1616) as string)) AS id_resp_h#22368, map(cast(count as string), cast(count(id_resp_p#1617) as string), cast(mean as string), cast(avg(try_cast(id_resp_p#1617 as double)) as string), cast(stddev as string), cast(stddev(try_cast(id_resp_p#1617 as double)) as string), cast(min as string), cast(min(id_resp_p#1617) as string), cast(max as string), cast(max(id_resp_p#1617) as string)) AS id_resp_p#22373, map(cast(count as string), cast(count(proto#1618) as string), cast(mean as string), cast(avg(try_cast(proto#1618 as double)) as string), cast(stddev as string), cast(stddev(try_cast(proto#1618 as double)) as string), cast(min as string), cast(min(proto#1618) as string), cast(max as string), cast(max(proto#1618) as string)) AS proto#22378, map(cast(count as string), cast(count(service#8275) as string), cast(mean as string), cast(avg(try_cast(service#8275 as double)) as string), cast(stddev as string), cast(stddev(try_cast(service#8275 as double)) as string), cast(min as string), cast(min(service#8275) as string), cast(max as string), cast(max(service#8275) as string)) AS service#22383, map(cast(count as string), cast(count(duration#1620) as string), cast(mean as string), cast(avg(duration#1620) as string), cast(stddev as string), cast(stddev(duration#1620) as string), cast(min as string), cast(min(duration#1620) as string), cast(max as string), cast(max(duration#1620) as string)) AS duration#22388, map(cast(count as string), cast(count(orig_bytes#1621) as string), cast(mean as string), cast(avg(orig_bytes#1621) as string), cast(stddev as string), cast(stddev(orig_bytes#1621) as string), cast(min as string), cast(min(orig_bytes#1621) as string), cast(max as string), cast(max(orig_bytes#1621) as string)) AS orig_bytes#22393, map(cast(count as string), cast(count(resp_bytes#1622) as string), cast(mean as string), cast(avg(resp_bytes#1622) as string), cast(stddev as string), cast(stddev(resp_bytes#1622) as string), cast(min as string), cast(min(resp_bytes#1622) as string), cast(max as string), cast(max(resp_bytes#1622) as string)) AS resp_bytes#22398, map(cast(count as string), cast(count(conn_state#1623) as string), cast(mean as string), cast(avg(try_cast(conn_state#1623 as double)) as string), cast(stddev as string), cast(stddev(try_cast(conn_state#1623 as double)) as string), cast(min as string), cast(min(conn_state#1623) as string), cast(max as string), cast(max(conn_state#1623) as string)) AS conn_state#22403, map(cast(count as string), cast(count(local_orig#8299) as string), cast(mean as string), cast(avg(try_cast(local_orig#8299 as double)) as string), cast(stddev as string), cast(stddev(try_cast(local_orig#8299 as double)) as string), cast(min as string), cast(min(local_orig#8299) as string), cast(max as string), cast(max(local_orig#8299) as string)) AS local_orig#22408, map(cast(count as string), cast(count(local_resp#8323) as string), cast(mean as string), cast(avg(try_cast(local_resp#8323 as double)) as string), cast(stddev as string), cast(stddev(try_cast(local_resp#8323 as double)) as string), cast(min as string), cast(min(local_resp#8323) as string), cast(max as string), cast(max(local_resp#8323) as string)) AS local_resp#22413, map(cast(count as string), cast(count(missed_bytes#1626) as string), cast(mean as string), cast(avg(missed_bytes#1626) as string), cast(stddev as string), cast(stddev(missed_bytes#1626) as string), cast(min as string), cast(min(missed_bytes#1626) as string), cast(max as string), cast(max(missed_bytes#1626) as string)) AS missed_bytes#22418, map(cast(count as string), cast(count(history#8347) as string), cast(mean as string), cast(avg(try_cast(history#8347 as double)) as string), cast(stddev as string), cast(stddev(try_cast(history#8347 as double)) as string), cast(min as string), cast(min(history#8347) as string), cast(max as string), cast(max(history#8347) as string)) AS history#22423, map(cast(count as string), cast(count(orig_pkts#1628) as string), cast(mean as string), cast(avg(orig_pkts#1628) as string), cast(stddev as string), cast(stddev(orig_pkts#1628) as string), cast(min as string), cast(min(orig_pkts#1628) as string), cast(max as string), cast(max(orig_pkts#1628) as string)) AS orig_pkts#22428, map(cast(count as string), cast(count(orig_ip_bytes#1629) as string), cast(mean as string), cast(avg(orig_ip_bytes#1629) as string), cast(stddev as string), cast(stddev(orig_ip_bytes#1629) as string), cast(min as string), cast(min(orig_ip_bytes#1629) as string), cast(max as string), cast(max(orig_ip_bytes#1629) as string)) AS orig_ip_bytes#22433, map(cast(count as string), cast(count(resp_pkts#1630) as string), cast(mean as string), cast(avg(resp_pkts#1630) as string), cast(stddev as string), cast(stddev(resp_pkts#1630) as string), cast(min as string), cast(min(resp_pkts#1630) as string), cast(max as string), cast(max(resp_pkts#1630) as string)) AS resp_pkts#22438, map(cast(count as string), cast(count(resp_ip_bytes#1631) as string), cast(mean as string), cast(avg(resp_ip_bytes#1631) as string), cast(stddev as string), cast(stddev(resp_ip_bytes#1631) as string), cast(min as string), cast(min(resp_ip_bytes#1631) as string), cast(max as string), cast(max(resp_ip_bytes#1631) as string)) AS resp_ip_bytes#22443, map(cast(count as string), cast(count(tunnel_parents#8371) as string), cast(mean as string), cast(avg(try_cast(tunnel_parents#8371 as double)) as string), cast(stddev as string), cast(stddev(try_cast(tunnel_parents#8371 as double)) as string), cast(min as string), cast(min(tunnel_parents#8371) as string), cast(max as string), cast(max(tunnel_parents#8371) as string)) AS tunnel_parents#22448, map(cast(count as string), cast(count(label#1633) as string), cast(mean as string), cast(avg(try_cast(label#1633 as double)) as string), cast(stddev as string), cast(stddev(try_cast(label#1633 as double)) as string), cast(min as string), cast(min(label#1633) as string), cast(max as string), cast(max(label#1633) as string)) AS label#22453, map(cast(count as string), cast(count(detailed-label#8395) as string), cast(mean as string), cast(avg(try_cast(detailed-label#8395 as double)) as string), cast(stddev as string), cast(stddev(try_cast(detailed-label#8395 as double)) as string), cast(min as string), cast(min(detailed-label#8395) as string), cast(max as string), cast(max(detailed-label#8395) as string)) AS detailed-label#22458]
            +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#8299, local_resp#8323, missed_bytes#1626, history#8347, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#8371, label#1633, detailed-label#8395, cast(from_unixtime(cast(ts#1612 as bigint), yyyy-MM-dd HH:mm:ss, Some(Europe/Berlin)) as timestamp) AS formatted_ts#15862]
               +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#8299, local_resp#8323, missed_bytes#1626, history#8347, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#8371, label#1633, CASE WHEN (detailed-label#1634 = -) THEN cast(null as string) ELSE detailed-label#1634 END AS detailed-label#8395]
                  +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#8299, local_resp#8323, missed_bytes#1626, history#8347, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, CASE WHEN (tunnel_parents#1632 = -) THEN cast(null as string) ELSE tunnel_parents#1632 END AS tunnel_parents#8371, label#1633, detailed-label#1634]
                     +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#8299, local_resp#8323, missed_bytes#1626, CASE WHEN (history#1627 = -) THEN cast(null as string) ELSE history#1627 END AS history#8347, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#1632, label#1633, detailed-label#1634]
                        +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#8299, CASE WHEN (local_resp#1625 = -) THEN cast(null as string) ELSE local_resp#1625 END AS local_resp#8323, missed_bytes#1626, history#1627, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#1632, label#1633, detailed-label#1634]
                           +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, CASE WHEN (local_orig#1624 = -) THEN cast(null as string) ELSE local_orig#1624 END AS local_orig#8299, local_resp#1625, missed_bytes#1626, history#1627, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#1632, label#1633, detailed-label#1634]
                              +- Project [ts#1612, uid#1613, id_orig_h#1614, id_orig_p#1615, id_resp_h#1616, id_resp_p#1617, proto#1618, CASE WHEN (service#1619 = -) THEN cast(null as string) ELSE service#1619 END AS service#8275, duration#1620, orig_bytes#1621, resp_bytes#1622, conn_state#1623, local_orig#1624, local_resp#1625, missed_bytes#1626, history#1627, orig_pkts#1628, orig_ip_bytes#1629, resp_pkts#1630, resp_ip_bytes#1631, tunnel_parents#1632, label#1633, detailed-label#1634]
                                 +- Project [ts#208 AS ts#1612, uid#209 AS uid#1613, id.orig_h#210 AS id_orig_h#1614, id.orig_p#211 AS id_orig_p#1615, id.resp_h#212 AS id_resp_h#1616, id.resp_p#213 AS id_resp_p#1617, proto#214 AS proto#1618, service#215 AS service#1619, duration#216 AS duration#1620, orig_bytes#217 AS orig_bytes#1621, resp_bytes#218 AS resp_bytes#1622, conn_state#219 AS conn_state#1623, local_orig#220 AS local_orig#1624, local_resp#221 AS local_resp#1625, missed_bytes#222 AS missed_bytes#1626, history#223 AS history#1627, orig_pkts#224 AS orig_pkts#1628, orig_ip_bytes#225 AS orig_ip_bytes#1629, resp_pkts#226 AS resp_pkts#1630, resp_ip_bytes#227 AS resp_ip_bytes#1631, tunnel_parents#228 AS tunnel_parents#1632, label#229 AS label#1633, detailed-label#230 AS detailed-label#1634]
                                    +- Union false, false
                                       :- LogicalRDD [ts#208, uid#209, id.orig_h#210, id.orig_p#211, id.resp_h#212, id.resp_p#213, proto#214, service#215, duration#216, orig_bytes#217, resp_bytes#218, conn_state#219, local_orig#220, local_resp#221, missed_bytes#222, history#223, orig_pkts#224, orig_ip_bytes#225, resp_pkts#226, resp_ip_bytes#227, tunnel_parents#228, label#229, detailed-label#230], false
                                       :- Relation [ts#254,uid#255,id.orig_h#256,id.orig_p#257,id.resp_h#258,id.resp_p#259,proto#260,service#261,duration#262,orig_bytes#263,resp_bytes#264,conn_state#265,local_orig#266,local_resp#267,missed_bytes#268,history#269,orig_pkts#270,orig_ip_bytes#271,resp_pkts#272,resp_ip_bytes#273,tunnel_parents#274,label#275,detailed-label#276] csv
                                       :- Relation [ts#323,uid#324,id.orig_h#325,id.orig_p#326,id.resp_h#327,id.resp_p#328,proto#329,service#330,duration#331,orig_bytes#332,resp_bytes#333,conn_state#334,local_orig#335,local_resp#336,missed_bytes#337,history#338,orig_pkts#339,orig_ip_bytes#340,resp_pkts#341,resp_ip_bytes#342,tunnel_parents#343,label#344,detailed-label#345] csv
                                       :- Relation [ts#392,uid#393,id.orig_h#394,id.orig_p#395,id.resp_h#396,id.resp_p#397,proto#398,service#399,duration#400,orig_bytes#401,resp_bytes#402,conn_state#403,local_orig#404,local_resp#405,missed_bytes#406,history#407,orig_pkts#408,orig_ip_bytes#409,resp_pkts#410,resp_ip_bytes#411,tunnel_parents#412,label#413,detailed-label#414] csv
                                       :- Relation [ts#461,uid#462,id.orig_h#463,id.orig_p#464,id.resp_h#465,id.resp_p#466,proto#467,service#468,duration#469,orig_bytes#470,resp_bytes#471,conn_state#472,local_orig#473,local_resp#474,missed_bytes#475,history#476,orig_pkts#477,orig_ip_bytes#478,resp_pkts#479,resp_ip_bytes#480,tunnel_parents#481,label#482,detailed-label#483] csv
                                       :- Relation [ts#530,uid#531,id.orig_h#532,id.orig_p#533,id.resp_h#534,id.resp_p#535,proto#536,service#537,duration#538,orig_bytes#539,resp_bytes#540,conn_state#541,local_orig#542,local_resp#543,missed_bytes#544,history#545,orig_pkts#546,orig_ip_bytes#547,resp_pkts#548,resp_ip_bytes#549,tunnel_parents#550,label#551,detailed-label#552] csv
                                       :- Relation [ts#599,uid#600,id.orig_h#601,id.orig_p#602,id.resp_h#603,id.resp_p#604,proto#605,service#606,duration#607,orig_bytes#608,resp_bytes#609,conn_state#610,local_orig#611,local_resp#612,missed_bytes#613,history#614,orig_pkts#615,orig_ip_bytes#616,resp_pkts#617,resp_ip_bytes#618,tunnel_parents#619,label#620,detailed-label#621] csv
                                       :- Relation [ts#668,uid#669,id.orig_h#670,id.orig_p#671,id.resp_h#672,id.resp_p#673,proto#674,service#675,duration#676,orig_bytes#677,resp_bytes#678,conn_state#679,local_orig#680,local_resp#681,missed_bytes#682,history#683,orig_pkts#684,orig_ip_bytes#685,resp_pkts#686,resp_ip_bytes#687,tunnel_parents#688,label#689,detailed-label#690] csv
                                       :- Relation [ts#737,uid#738,id.orig_h#739,id.orig_p#740,id.resp_h#741,id.resp_p#742,proto#743,service#744,duration#745,orig_bytes#746,resp_bytes#747,conn_state#748,local_orig#749,local_resp#750,missed_bytes#751,history#752,orig_pkts#753,orig_ip_bytes#754,resp_pkts#755,resp_ip_bytes#756,tunnel_parents#757,label#758,detailed-label#759] csv
                                       :- Relation [ts#806,uid#807,id.orig_h#808,id.orig_p#809,id.resp_h#810,id.resp_p#811,proto#812,service#813,duration#814,orig_bytes#815,resp_bytes#816,conn_state#817,local_orig#818,local_resp#819,missed_bytes#820,history#821,orig_pkts#822,orig_ip_bytes#823,resp_pkts#824,resp_ip_bytes#825,tunnel_parents#826,label#827,detailed-label#828] csv
                                       :- Relation [ts#875,uid#876,id.orig_h#877,id.orig_p#878,id.resp_h#879,id.resp_p#880,proto#881,service#882,duration#883,orig_bytes#884,resp_bytes#885,conn_state#886,local_orig#887,local_resp#888,missed_bytes#889,history#890,orig_pkts#891,orig_ip_bytes#892,resp_pkts#893,resp_ip_bytes#894,tunnel_parents#895,label#896,detailed-label#897] csv
                                       +- Relation [ts#944,uid#945,id.orig_h#946,id.orig_p#947,id.resp_h#948,id.resp_p#949,proto#950,service#951,duration#952,orig_bytes#953,resp_bytes#954,conn_state#955,local_orig#956,local_resp#957,missed_bytes#958,history#959,orig_pkts#960,orig_ip_bytes#961,resp_pkts#962,resp_ip_bytes#963,tunnel_parents#964,label#965,detailed-label#966] csv
