# Preprocessing of the IoT23 dataset:

In [21]:
import os
import io
import re
import pandas as pd
from pathlib import Path

data_dir = '../../../General/dataset/iot_23_datasets_small/'  

In [22]:
def get_files(data_dir):
    '''
    returns the list of all data files in the given data directory
    '''
    labeled_files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(data_dir):
        for file in f:
            if file.endswith(".labeled"):
                file_name = os.path.join(r, file)
                labeled_files.append(file_name)
                file_size = os.path.getsize(file_name)
                print(f"File: {file_name.rsplit('IoTScenarios', 2)[-1]} -> Size: {file_size} bytes")
    return labeled_files


def file2df(data):
    '''
    converts the file lines to a dataframe
    '''
    clean_lines = []
    first_flag = True
    headers = ''
    for x in data:
        if len(x) > 0:
            if x[0]=='#' or x[0]==' ':
                headers = x
                continue
            else:
                if first_flag:
                    first_flag = False
                    clean_lines.append(headers[1:])
                clean_lines.append(x)
                
    str_lines = "\n".join(str(x) for x in clean_lines)
    df = pd.read_csv(io.StringIO(str_lines), sep="\t|\s\s", header=None, engine='python')
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

labeled_files = get_files(data_dir)

File: /CTU-IoT-Malware-Capture-48-1/bro/conn.log.labeled -> Size: 531738215 bytes
File: /CTU-IoT-Malware-Capture-42-1/bro/conn.log.labeled -> Size: 585829 bytes
File: /CTU-Honeypot-Capture-7-1/Somfy-01/bro/conn.log.labeled -> Size: 18024 bytes
File: /CTU-IoT-Malware-Capture-33-1/bro/conn.log.labeled -> Size: 7867920121 bytes
File: /CTU-IoT-Malware-Capture-39-1/bro/conn.log.labeled -> Size: 10885361439 bytes
File: /CTU-IoT-Malware-Capture-52-1/bro/conn.log.labeled -> Size: 3070319722 bytes
File: /CTU-IoT-Malware-Capture-20-1/bro/conn.log.labeled -> Size: 419604 bytes
File: /CTU-IoT-Malware-Capture-3-1/bro/conn.log.labeled -> Size: 24383537 bytes
File: /CTU-IoT-Malware-Capture-9-1/bro/conn.log.labeled -> Size: 993459767 bytes
File: /CTU-IoT-Malware-Capture-7-1/bro/conn.log.labeled -> Size: 1581263847 bytes
File: /CTU-IoT-Malware-Capture-60-1/bro/conn.log.labeled -> Size: 467559811 bytes
File: /CTU-Honeypot-Capture-4-1/bro/conn.log.labeled -> Size: 61653 bytes
File: /CTU-IoT-Malware-Captu

In [23]:
test_files =  ['../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled', 
                  '../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1/Somfy-01/bro/conn.log.labeled']
# test_files = labeled_files
df_master = pd.DataFrame()
for file in test_files:
    file_size = round(os.stat(file).st_size / (1024 * 1024), 2 )
    print(f'\nProcessing file: {file}\n File Size is {file_size} MB')
    with open(file, 'r') as f:
        data = f.readlines()
        df_file = file2df(data)
        print(df_file.shape)
        df_master = pd.concat([df_master, df_file])
df_master.to_csv(data_dir + 'iot_23_small.csv', index=False)


Processing file: ../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled
 File Size is 0.17 MB
(1374, 24)

Processing file: ../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1/Somfy-01/bro/conn.log.labeled
 File Size is 0.02 MB
(130, 24)


In [24]:
def line2row(str_line):
    '''
    converts a line of the file to a dataframe row
    '''
    row = None
    if len(str_line) > 0:
        if str_line[0]!='#' or str_line[0]!=' ':
            row = str_line.replace('#', '').replace('.', '_').strip()
            row = re.split(r'\t|\s+', row)
    return row


def get_columns(file):
    '''
    defines the columns for the dataframe
    '''
    cols = None
    with open(file, 'r') as f:
        head_rows = [next(f) for x in range(20)]

    if head_rows:
        for i in range(len(head_rows)):
            if head_rows[i][0]!='#' and head_rows[i][0]!=' ':
                cols = line2row(head_rows[i-2])  # -2 because the header line is located two lines above than first data row line. 
                break
    else:
        print('No header found!')
    return list(cols)


def save_df(dict_list, cols, csv_file):
    '''
    saves the dataframe as a csv file
    '''
    df_master = pd.DataFrame.from_dict(dict_list, columns=cols, orient='index')
    # csv_file = data_dir + 'iot_23_small.csv'
    df_master.to_csv(csv_file, index=False)
    print('The current chunk dataframe has been saved to: ', csv_file)
    print('The dataframe has shape: ', df_master.shape)
    print('Done!')


def file2rows(labeled_files, cols, chunk_size=1000000):
    # considering only two files for debugging purposes
    test_files =  ['../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled', 
                    '../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1/Somfy-01/bro/conn.log.labeled']
    # test_files = labeled_files 
    count_rows = 0
    fc = 1
    dict_list = []

    for file in test_files:
        file_size = round(os.stat(file).st_size / (1024 * 1024), 2 )
        print('\n' + '-'*60)
        print(f'\n{fc}. Processing file: {file}\n File Size is {file_size} MB')
        with open(file, 'r') as f:
            for line in f:
                row = line2row(line)
                if (len(row)== len(cols)) and (row is not None):
                    # df_master.append(row) # slower than dict_list.append(row)
                    dict_list.append(row)
                    count_rows = count_rows + 1
                    if count_rows % chunk_size == 0:
                        chunk_dir = data_dir + 'chunks'
                        Path(chunk_dir).mkdir(parents=True, exist_ok=True)
                        file_name = f'{chunk_dir}/iot_23_small_{fc}_{count_rows}.csv'
                        save_df(dict_list, cols, file_name)
                        print(f'Processed {count_rows} rows')
                        dict_list = [] # reset the list
                else: 
                    if len(row)>10:
                        print(f'\nError on appending the row, either by len not equals to cols len or None value:   len: {len(row)}\n  row: {row}')
        fc = fc + 1

labeled_files = test_files  # uncomment for debugging
chunk_size = 10000000
cols = get_columns(labeled_files[0])
cols.remove('fields')
file2rows(labeled_files, cols, chunk_size)


------------------------------------------------------------

1. Processing file: ../../../General/dataset/iot_23_datasets_small/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled
 File Size is 0.17 MB

Error on appending the row, either by len not equals to cols len or None value:   len: 24
  row: ['fields', 'ts', 'uid', 'id_orig_h', 'id_orig_p', 'id_resp_h', 'id_resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'label', 'detailed-label']

Error on appending the row, either by len not equals to cols len or None value:   len: 24
  row: ['types', 'time', 'string', 'addr', 'port', 'addr', 'port', 'enum', 'string', 'interval', 'count', 'count', 'string', 'bool', 'bool', 'count', 'string', 'count', 'count', 'count', 'count', 'set[string]', 'string', 'string']

-------------------------------