# Creating Clean TON_IoT Data
Source: https://research.unsw.edu.au/projects/toniot-datasets (See TON_IoT datasets/Processed_datasets/Processed_Network_dataset)
____

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Read datasets

When exporting is complete, please delete the Processed_Network_dataset folder. 

In [None]:
# process data
# Load the 23 numbered CSV parts (Network_dataset_1.csv ... Network_dataset_23.csv)
# into ds_array, in file-number order.
ds_array = []
for part_no in range(1, 24):
    csv_path = f"./raw/Processed_Network_dataset/Network_dataset_{part_no}.csv"
    try:
        part_df = pd.read_csv(csv_path)
    except FileNotFoundError as missing:
        # Tell the user how to obtain the data, then re-raise so the notebook stops here.
        print("To run this notebook, extract the contents of Processed Network dataset.zip to 'models/datasets/raw'.")
        print("The notebook will now raise an error.")
        raise missing
    ds_array.append(part_df)

  ds_array.append(pd.read_csv(f"./raw/Processed_Network_dataset/Network_dataset_{i}.csv"))
  ds_array.append(pd.read_csv(f"./raw/Processed_Network_dataset/Network_dataset_{i}.csv"))
  ds_array.append(pd.read_csv(f"./raw/Processed_Network_dataset/Network_dataset_{i}.csv"))


# Analyzing data

## What's included.

In [None]:
# quick check to see available types
# Collect every distinct value of the "type" column across all loaded frames,
# in first-seen order. Iterating over ds_array itself (instead of a hard-coded
# range of 23) keeps this cell correct however many files were loaded.
what_i_use = []
for frame in ds_array:
    for label in frame["type"].unique():
        if label not in what_i_use:
            what_i_use.append(label)
print(f"What the data finds: {what_i_use}")

What the data finds: ['normal', 'scanning', 'dos', 'injection', 'ddos', 'password', 'xss', 'ransomware', 'backdoor', 'mitm']


In [None]:
# print counts per type
# For each loaded spreadsheet (numbered from 1 in load order), print how many
# rows carry each "type" label. enumerate() replaces the hard-coded range of 23
# so the numbering always follows the actual contents of ds_array.
for sheet_no, frame in enumerate(ds_array, start=1):
    print(f"Spreadsheet {sheet_no}")
    print(frame["type"].value_counts(), end="\n\n")

Spreadsheet 1
type
scanning    791321
normal      208679
Name: count, dtype: int64

Spreadsheet 2
type
scanning    994283
normal        5717
Name: count, dtype: int64

Spreadsheet 3
type
scanning    997180
normal        2820
Name: count, dtype: int64

Spreadsheet 4
type
scanning    993744
normal        6256
Name: count, dtype: int64

Spreadsheet 5
type
scanning    996343
normal        3657
Name: count, dtype: int64

Spreadsheet 6
type
scanning    986527
normal       13473
Name: count, dtype: int64

Spreadsheet 7
type
scanning    992501
normal        7499
Name: count, dtype: int64

Spreadsheet 8
type
dos         590432
scanning    388262
normal       21306
Name: count, dtype: int64

Spreadsheet 9
type
dos       975261
normal     24739
Name: count, dtype: int64

Spreadsheet 10
type
dos       969998
normal     30002
Name: count, dtype: int64

Spreadsheet 11
type
dos          839637
injection    125195
normal        35168
Name: count, dtype: int64

Spreadsheet 12
type
ddos         639730
i

# Data cleaning

## Removing unnecessary columns

In [None]:
# cleaning
# Columns removed from every frame. All of these must be present: a missing
# one still raises KeyError, exactly as before.
unneeded_columns = [
    "service", "dns_query", "dns_AA",
    "dns_RD", "dns_RA", "dns_rejected",
    "ssl_version", "ssl_cipher", "ssl_resumed",
    "ssl_established", "ssl_subject", "ssl_issuer",
    "http_trans_depth", "http_method", "http_uri",
    "http_version", "http_request_body_len", "http_response_body_len",
    "http_status_code", "http_user_agent", "http_orig_mime_types",
    "http_resp_mime_types", "weird_name", "weird_addl",
    "weird_notice", "dns_qclass",
    "dns_qtype", "dns_rcode", "http_referrer",
]

# Replace the frames one at a time (rather than building a second list) so the
# wide originals can be released as we go. Iterating over ds_array itself
# removes the hard-coded count of 23.
for idx, frame in enumerate(ds_array):
    # "uid" is dropped only when the file has it; folding it into the same
    # drop() call avoids creating a second intermediate copy of the frame.
    to_drop = unneeded_columns + (["uid"] if "uid" in frame.columns else [])
    ds_array[idx] = frame.drop(columns=to_drop)

## Drop specific attack types: 
* password
* xss
* ransomware
* backdoor
* injection
* mitm

In [None]:
# drop non-network based attacks
# Rows whose "type" is one of these labels are removed from every frame.
excluded_types = ["password", "xss", "ransomware", "backdoor", "injection", "mitm"]

# A single isin() mask replaces six successive boolean filters (each of which
# produced a full intermediate copy of the frame). Row order and index labels
# of the surviving rows are unchanged. Iterating over ds_array itself removes
# the hard-coded count of 23.
for idx, frame in enumerate(ds_array):
    ds_array[idx] = frame[~frame["type"].isin(excluded_types)]

### If you wish to see the individual cleaned CSV files:
change the following line to True.

In [9]:
# Set to True to also write each cleaned per-file frame to its own CSV.
EXPORT_INDIVIDUAL_CSV = False

In [None]:
# see above
# Optionally write each cleaned frame to its own CSV, numbered from 1 in load
# order. The flag is tested for truthiness (matching the EXPORT_FULLY_CLEAN_CSV
# check) and the loop follows ds_array instead of a hard-coded count of 23.
if EXPORT_INDIVIDUAL_CSV:
    for file_no, frame in enumerate(ds_array, start=1):
        frame.to_csv(f"./team_11_individual_TON_IoT_unsw_edu_au_{file_no}.csv")

## Reduce dataset

In [None]:
# merge data
# Stack all cleaned frames row-wise into one DataFrame with a fresh 0..n-1 index.
newDS = pd.concat(objs=ds_array, axis=0, ignore_index=True)

### Target number of entries to keep per attack type
Each target was chosen as a fraction of the data that is close to, but less than, the number of 'normal' entries.

In [None]:
# Number of rows to keep for each attack type when downsampling below.
SCANNING_ENTRIES = 649_106
D_DOS_ENTRIES = 474_231
DOS_ENTRIES = 421_916

### Drop unneeded scanning entries

In [None]:
# Keep the first SCANNING_ENTRIES "scanning" rows (in dataset order) and drop
# every later one, i.e. trim from the end of the data capture and up.
scanning_labels = newDS.index[newDS["type"] == "scanning"]
# Slice from the front with [N:]. The previous negative slice
# `[-(len - N):]` turned into `[-0:]` (every row) when exactly N rows were
# present, so re-running the cell deleted all scanning rows; with fewer than N
# rows it dropped rows that should have been kept. `[N:]` is simply empty in
# both of those cases.
newDS = newDS.drop(index=scanning_labels[SCANNING_ENTRIES:])

### Drop unneeded DDoS entries

In [None]:
# Keep the first D_DOS_ENTRIES "ddos" rows (in dataset order) and drop every
# later one, i.e. trim from the end of the data capture and up.
ddos_labels = newDS.index[newDS["type"] == "ddos"]
# Slice from the front with [N:]. The previous negative slice
# `[-(len - N):]` turned into `[-0:]` (every row) when exactly N rows were
# present, so re-running the cell deleted all ddos rows; with fewer than N
# rows it dropped rows that should have been kept. `[N:]` is simply empty in
# both of those cases.
newDS = newDS.drop(index=ddos_labels[D_DOS_ENTRIES:])

### Drop unneeded DoS entries

In [None]:
# Keep the first DOS_ENTRIES "dos" rows (in dataset order) and drop every
# later one, i.e. trim from the end of the data capture and up.
dos_labels = newDS.index[newDS["type"] == "dos"]
# Slice from the front with [N:]. The previous negative slice
# `[-(len - N):]` turned into `[-0:]` (every row) when exactly N rows were
# present, so re-running the cell deleted all dos rows; with fewer than N
# rows it dropped rows that should have been kept. `[N:]` is simply empty in
# both of those cases.
newDS = newDS.drop(index=dos_labels[DOS_ENTRIES:])

In [16]:
# Sanity check: row counts per traffic type after downsampling.
newDS["type"].value_counts()

type
normal      796380
scanning    649106
ddos        474231
dos         421916
Name: count, dtype: int64

# Export

In [17]:
# Set to True to also write the full cleaned, downsampled dataset as one CSV.
EXPORT_FULLY_CLEAN_CSV = False

In [18]:
# Optional export of the whole cleaned dataset (before the train/test split).
if EXPORT_FULLY_CLEAN_CSV:
    newDS.to_csv(path_or_buf="./team_11_TON_IoT_unsw_edu_au_cleaned.csv")

In [None]:
# export
# Split the cleaned dataset 80/20 into train and test sets and write both to CSV.
# A fixed random_state makes the shuffle reproducible, so a fresh
# Restart & Run All regenerates the same train/test files.
train, test = train_test_split(newDS, test_size=0.2, random_state=42)
train.to_csv("./team_11_TON_IoT_unsw_edu_au_train.csv")
test.to_csv("./team_11_TON_IoT_unsw_edu_au_test.csv")