# Malware Detection in Network Traffic Data

## Imports and Spark setup

In [41]:
import pyspark
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SQLContext, SparkSession

# HADOOP_HOME has to be in the environment *before* the session is built:
# the JVM backing the session is started by the first getOrCreate() call and
# only sees environment variables that exist at that moment. Setting it
# afterwards (as this cell used to) has no effect on a fresh kernel.
# NOTE(review): if a session is already running in this kernel, getOrCreate()
# reuses it -- restart the kernel for a changed HADOOP_HOME to take effect.
os.environ['HADOOP_HOME'] = 'C:/dummy/hadoop_home'

spark = SparkSession.builder.appName('Malware').getOrCreate()

In [42]:
# Dump every (key, value) pair of the active Spark configuration.
spark_conf = spark.sparkContext.getConf()
print(spark_conf.getAll())

[('spark.sql.warehouse.dir', 'file:/C:/Users/Vincenzo/Projects/DDAM_Project_23-24/code/spark-warehouse'), ('spark.driver.port', '55917'), ('spark.executor.id', 'driver'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHa

## Presentation

In [43]:
# Path of the single capture file inspected in this section
# (directory and file name are written as two adjacent raw literals).
data = (
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware"
    r"\CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv"
)

In [44]:
# Pipe-delimited CSV with a header row; '"' is the escape character and
# Spark is asked to infer the column types.
csv_reader = spark.read.option("escape", "\"").option("delimiter", "|")
df = csv_reader.csv(data, header='true', inferSchema='true')

In [45]:
# Shape of the single-file DataFrame: row count (a Spark action) and column count.
rows = df.count()
cols = len(df.columns)
(rows, cols)

(1008748, 23)

In [46]:
# Print the column names and the types Spark inferred for the single-file DataFrame.
df.printSchema()

root
 |-- ts: double (nullable = true)
 |-- uid: string (nullable = true)
 |-- id.orig_h: string (nullable = true)
 |-- id.orig_p: double (nullable = true)
 |-- id.resp_h: string (nullable = true)
 |-- id.resp_p: double (nullable = true)
 |-- proto: string (nullable = true)
 |-- service: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- orig_bytes: string (nullable = true)
 |-- resp_bytes: string (nullable = true)
 |-- conn_state: string (nullable = true)
 |-- local_orig: string (nullable = true)
 |-- local_resp: string (nullable = true)
 |-- missed_bytes: double (nullable = true)
 |-- history: string (nullable = true)
 |-- orig_pkts: double (nullable = true)
 |-- orig_ip_bytes: double (nullable = true)
 |-- resp_pkts: double (nullable = true)
 |-- resp_ip_bytes: double (nullable = true)
 |-- tunnel_parents: string (nullable = true)
 |-- label: string (nullable = true)
 |-- detailed-label: string (nullable = true)



In [47]:
# Preview the first five rows of the single-file DataFrame.
df.show(n=5)

+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|                 ts|               uid|      id.orig_h|id.orig_p|      id.resp_h|id.resp_p|proto|service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|    label|      detailed-label|
+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|1.525879831015811E9|CUmrqr4svHuSXJy5z7|192.168.100.103|  51524.0| 65.127.233.163|     23.0|  tcp|      -|2.999051|         0|         0|     

## Dataset concatenation

In [48]:
# Labeled capture files that are merged into one DataFrame.
file_paths = [
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-9-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-48-1conn.log.labeled.csv",
    r"C:\Users\Vincenzo\Projects\DDAM_data\malware\CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv",
]


def read_conn_log(path):
    """Read one capture CSV into a Spark DataFrame.

    The file is parsed as pipe-delimited with a header row, '"' as the escape
    character, and column types inferred by Spark.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by the CSV reader for ``path``.
    """
    return (
        spark.read
        .option("escape", "\"")
        .option("delimiter", "|")
        .csv(path, header=True, inferSchema=True)
    )


# Start from the first capture and append the remaining ones.
# unionByName matches columns by *name*; plain union() matches by position and
# would silently mix fields if any file listed its columns in a different order.
df = read_conn_log(file_paths[0])

for file_path in file_paths[1:]:
    df_temp = read_conn_log(file_path)
    df = df.unionByName(df_temp)

df.show(20)


+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|                 ts|               uid|      id.orig_h|id.orig_p|      id.resp_h|id.resp_p|proto|service|duration|orig_bytes|resp_bytes|conn_state|local_orig|local_resp|missed_bytes|history|orig_pkts|orig_ip_bytes|resp_pkts|resp_ip_bytes|tunnel_parents|    label|      detailed-label|
+-------------------+------------------+---------------+---------+---------------+---------+-----+-------+--------+----------+----------+----------+----------+----------+------------+-------+---------+-------------+---------+-------------+--------------+---------+--------------------+
|1.525879831015811E9|CUmrqr4svHuSXJy5z7|192.168.100.103|  51524.0| 65.127.233.163|     23.0|  tcp|      -|2.999051|         0|         0|     

## Data understanding

In [49]:
# Shape of the merged DataFrame: row count (a Spark action) and column count.
rows = df.count()
cols = len(df.columns)
print(f'Dimension of the Dataframe is: {(rows,cols)}')

Dimension of the Dataframe is: (25000600, 23)


In [51]:
# Names of all columns whose Spark type is anything other than 'string'.
num_cols = [col_name for col_name, col_type in df.dtypes if col_type != 'string']
print('Le colonne numeriche sono')
print(num_cols)

Le colonne numeriche sono
['ts', 'id.orig_p', 'id.resp_p', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']


In [54]:
# Names of all columns whose Spark type is 'string'.
non_num_cols = [col_name for col_name, col_type in df.dtypes if col_type == 'string']
print('Le colonne non numeriche sono')
print(non_num_cols)

Le colonne non numeriche sono
['uid', 'id.orig_h', 'id.resp_h', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'history', 'tunnel_parents', 'label', 'detailed-label']
