# Creating Clean TON_IoT Data
Source: https://research.unsw.edu.au/projects/toniot-datasets (See TON_IoT datasets/Processed_datasets/Processed_Network_dataset)


\- Team 11

In [None]:
import os

import pandas as pd

## Read datasets

When exporting is complete, please delete the Processed_Network_dataset folder. 

In [None]:
# Load the raw spreadsheets Network_dataset_1.csv .. Network_dataset_23.csv, in
# order, into ds_array (one DataFrame per file).
ds_array = []
try:
    for i in range(1, 24):
        ds_array.append(pd.read_csv(f"./raw/Processed_Network_dataset/Network_dataset_{i}.csv"))
except FileNotFoundError as missing_file:
    # Explain how to obtain the data, then let the original error propagate so
    # the notebook stops here instead of continuing with a partial list.
    print("To run this notebook, extract the contents of Processed Network dataset.zip to 'models/datasets/team11/raw'.")
    print("The notebook will now raise an error.")
    raise missing_file


# Analyzing data

## What's included.

In [None]:
# Collect every distinct value of the "type" column across all loaded
# spreadsheets, in first-seen order.
# The spreadsheets are iterated directly rather than via a hard-coded count of
# 23, and dict.fromkeys() performs the order-preserving de-duplication. Using a
# generator keeps the loop variables out of the notebook namespace, so no raw
# DataFrame stays referenced after this cell.
what_i_use = list(dict.fromkeys(
    label
    for spreadsheet in ds_array
    for label in spreadsheet["type"].unique()
))
print(f"What the data finds: {what_i_use}")

In [None]:
# Print the per-"type" row counts of every spreadsheet, numbered from 1.
# The loop bound follows the list length instead of a hard-coded 23.
for idx in range(len(ds_array)):
    print(f"Spreadsheet {idx + 1}")
    print(ds_array[idx]["type"].value_counts(), end="\n\n")

# Data cleaning

## Removing unnecessary columns

In [None]:
# Columns removed from every spreadsheet: the service, dns_*, ssl_*, http_* and
# weird_* fields. DataFrame.drop() raises KeyError if any of them is missing.
# The list is built once instead of on every loop iteration.
columns_to_drop = [
    "service", "dns_query", "dns_AA",
    "dns_RD", "dns_RA", "dns_rejected",
    "ssl_version", "ssl_cipher", "ssl_resumed",
    "ssl_established", "ssl_subject", "ssl_issuer",
    "http_trans_depth", "http_method", "http_uri",
    "http_version", "http_request_body_len", "http_response_body_len",
    "http_status_code", "http_user_agent", "http_orig_mime_types",
    "http_resp_mime_types", "weird_name", "weird_addl",
    "weird_notice", "dns_qclass",
    "dns_qtype", "dns_rcode", "http_referrer",
]

# Replace each frame in place by position (bounded by the list length, not a
# hard-coded 23) so only one un-trimmed frame is alive at a time.
for idx in range(len(ds_array)):
    # "uid" may be absent from a spreadsheet, so it is only dropped when present.
    optional_columns = ["uid"] if "uid" in ds_array[idx].columns else []
    # A single drop() call avoids copying the frame twice.
    ds_array[idx] = ds_array[idx].drop(columns=columns_to_drop + optional_columns)

## Drop specific attack types: 
* password
* xss
* ransomware
* backdoor
* injection
* mitm

In [None]:
# Attack types that are removed from every spreadsheet.
excluded_types = ["password", "xss", "ransomware", "backdoor", "injection", "mitm"]

# Loop bound follows the list length instead of a hard-coded 23.
for idx in range(len(ds_array)):
    # One boolean mask replaces six successive filter passes (and their six
    # intermediate copies). Rows whose "type" is missing are kept, exactly as
    # with the chained `!=` comparisons.
    keep_mask = ~ds_array[idx]["type"].isin(excluded_types)
    ds_array[idx] = ds_array[idx][keep_mask]

### If you wish to see the individual cleaned CSV files:
Set `EXPORT_INDIVIDUAL_CSV` in the following cell to `True`.

In [None]:
# Opt-in switch: when True, the export cell below also writes one CSV per
# cleaned spreadsheet.
EXPORT_INDIVIDUAL_CSV: bool = False

In [None]:
# Optionally write each cleaned spreadsheet to ./datasets/ds_<n>.csv (n starts at 1).
if EXPORT_INDIVIDUAL_CSV:
    # to_csv() does not create missing parent directories, so make sure the
    # output folder exists before the first write.
    os.makedirs("./datasets", exist_ok=True)
    # Loop bound follows the list length instead of a hard-coded 23.
    for idx in range(len(ds_array)):
        ds_array[idx].to_csv(f"./datasets/ds_{idx + 1}.csv")

## Reduce dataset

In [None]:
# Stack all cleaned spreadsheets row-wise into a single frame.
# ignore_index=True discards the per-file indexes and numbers the rows 0..n-1,
# so every row label is unique; the trimming cells below drop rows by label.
newDS = pd.concat(objs=ds_array, axis=0, ignore_index=True)

### Entry limits for over-represented attack types
Each limit was decided by taking a fraction close to, but less than, the number of 'normal' entries.

In [None]:
# Number of rows to keep for each over-represented attack type; the trimming
# cells below drop every row of that type beyond these limits.
SCANNING_ENTRIES = 649_106  # "scanning"
D_DOS_ENTRIES = 474_231     # "ddos"
DOS_ENTRIES = 421_916       # "dos"

### Drop unneeded scanning entries

In [None]:
# Keep only the first SCANNING_ENTRIES "scanning" rows and drop the rest.
scanning_labels = newDS.index[newDS["type"] == "scanning"]

# Slice positionally from the limit onward. The slice is empty when there are
# SCANNING_ENTRIES rows or fewer, so nothing is dropped and re-running the cell
# is harmless. The previous negative slice `[-(len - N):]` turned into `[0:]`
# when len == N (dropping every scanning row, e.g. on a second run) and into a
# positive start when len < N (dropping rows that should have been kept).
newDS = newDS.drop(index=scanning_labels[SCANNING_ENTRIES:])

### Drop unneeded DDoS entries

In [None]:
# Keep only the first D_DOS_ENTRIES "ddos" rows and drop the rest.
ddos_labels = newDS.index[newDS["type"] == "ddos"]

# Slice positionally from the limit onward. The slice is empty when there are
# D_DOS_ENTRIES rows or fewer, so nothing is dropped and re-running the cell is
# harmless. The previous negative slice `[-(len - N):]` turned into `[0:]` when
# len == N (dropping every ddos row, e.g. on a second run) and into a positive
# start when len < N (dropping rows that should have been kept).
newDS = newDS.drop(index=ddos_labels[D_DOS_ENTRIES:])

### Drop unneeded DoS entries

In [None]:
# Keep only the first DOS_ENTRIES "dos" rows and drop the rest.
dos_labels = newDS.index[newDS["type"] == "dos"]

# Slice positionally from the limit onward. The slice is empty when there are
# DOS_ENTRIES rows or fewer, so nothing is dropped and re-running the cell is
# harmless. The previous negative slice `[-(len - N):]` turned into `[0:]` when
# len == N (dropping every dos row, e.g. on a second run) and into a positive
# start when len < N (dropping rows that should have been kept).
newDS = newDS.drop(index=dos_labels[DOS_ENTRIES:])

In [None]:
# Display the number of remaining rows per "type" after trimming, as a check
# on the class balance of the reduced dataset.
newDS["type"].value_counts()

# Export

In [None]:
# Write the reduced dataset to a single CSV next to the notebook.
# NOTE(review): to_csv() writes the row index as an unnamed first column by
# default, and the index has gaps after the drops above. Confirm that consumers
# expect that column before switching to index=False.
newDS.to_csv("./Group_11_TON_IoT_unsw_edu_au_cleaned.csv")