In [1]:
import os
import pandas as pd
import gc  # For garbage collection
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Define paths
folder = os.path.join("BigDataset", "IoTScenarios")
file = os.path.join("bro", "conn.log.labeled")
output_folder = os.path.join("combined_data")  # Combined data output folder
output_file = os.path.join(output_folder, "combined_data.csv")  # Final combined file

# Ensure the output directory exists
os.makedirs(output_folder, exist_ok=True)

list_csv = ["CTU-Honeypot-Capture-4-1", "CTU-Honeypot-Capture-5-1", "CTU-Honeypot-Capture-7-1", "CTU-IoT-Malware-Capture-1-1", 
            "CTU-IoT-Malware-Capture-3-1", "CTU-IoT-Malware-Capture-7-1", "CTU-IoT-Malware-Capture-8-1", 
            "CTU-IoT-Malware-Capture-9-1", "CTU-IoT-Malware-Capture-17-1", "CTU-IoT-Malware-Capture-20-1", 
            "CTU-IoT-Malware-Capture-21-1", "CTU-IoT-Malware-Capture-33-1", "CTU-IoT-Malware-Capture-34-1", 
            "CTU-IoT-Malware-Capture-35-1", "CTU-IoT-Malware-Capture-36-1", "CTU-IoT-Malware-Capture-39-1", 
            "CTU-IoT-Malware-Capture-42-1", "CTU-IoT-Malware-Capture-43-1", "CTU-IoT-Malware-Capture-44-1", 
            "CTU-IoT-Malware-Capture-48-1", "CTU-IoT-Malware-Capture-49-1", "CTU-IoT-Malware-Capture-52-1", 
            "CTU-IoT-Malware-Capture-60-1"]

print(len(list_csv))

MAX_ROWS = 1000000  # Maximum number of rows to process per dataset

def process_file_in_chunks(data_path, output_file):
    print(f"Processing {data_path} in chunks")
    chunk_size = 15000000  # Adjust based on available memory
    total_rows_processed = 0  # Track how many rows have been processed

    for chunk in pd.read_csv(data_path, sep='\t', comment="#", header=None, chunksize=chunk_size):
        remaining_rows = MAX_ROWS - total_rows_processed
        if remaining_rows <= 0:
            break

        if len(chunk) > remaining_rows:
            chunk = chunk.iloc[:remaining_rows]

        # Append the chunk to output CSV
        chunk.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
        total_rows_processed += len(chunk)

        del chunk  # Free memory after processing the chunk
        gc.collect()  # Garbage collection after each chunk

def process_and_save(file_list, output_file):
    with pd.option_context('mode.chained_assignment', None):
        for folder_name in file_list:
            data_path = os.path.join(folder, folder_name, file)
            process_file_in_chunks(data_path, output_file)

# Process and save the combined data
process_and_save(list_csv, output_file)
print(f"Combined data has been saved to {output_file}")




23
Processing BigDataset\IoTScenarios\CTU-Honeypot-Capture-4-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-Honeypot-Capture-5-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-Honeypot-Capture-7-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-1-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-3-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-7-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-8-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-9-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-17-1\bro\conn.log.labeled in chunks


  for chunk in pd.read_csv(data_path, sep='\t', comment="#", header=None, chunksize=chunk_size):


Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-20-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-21-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-33-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-34-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-35-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-36-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-39-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-42-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-43-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-44-1\bro\conn.log.labeled in chunks
Processing BigDataset\IoTScenarios\CTU-IoT-Malware

  for chunk in pd.read_csv(data_path, sep='\t', comment="#", header=None, chunksize=chunk_size):


Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-49-1\bro\conn.log.labeled in chunks


  for chunk in pd.read_csv(data_path, sep='\t', comment="#", header=None, chunksize=chunk_size):


Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-52-1\bro\conn.log.labeled in chunks


  for chunk in pd.read_csv(data_path, sep='\t', comment="#", header=None, chunksize=chunk_size):


Processing BigDataset\IoTScenarios\CTU-IoT-Malware-Capture-60-1\bro\conn.log.labeled in chunks
Combined data has been saved to combined_data\combined_data.csv


In [3]:
data = pd.read_csv("combined_data/combined_data.csv")
#Set the column names
data.columns = ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "service", "duration", "orig_bytes", "resp_bytes", "conn_state", "local_orig", "local_resp", "missed_bytes", "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes", "tunnel_parents   label   detailed-label"]

#Split the last combined column into three ones
tunnel_parents_column = data.iloc[:,-1].apply(lambda x: x.split()[0])
label_column = data.iloc[:,-1].apply(lambda x: x.split()[1])
detailed_label_column = data.iloc[:,-1].apply(lambda x: x.split()[2])
data.drop(["tunnel_parents   label   detailed-label"], axis=1, inplace=True)
data["tunnel_parents"] = tunnel_parents_column
data["label"] = label_column
data["detailed_label"] = detailed_label_column

print(data.head())

  data = pd.read_csv("combined_data/combined_data.csv")


             ts                 uid      id.orig_h  id.orig_p  \
0  1.540469e+09  CGm6jB4dXK71ZDWUDh  192.168.1.132      58687   
1  1.540469e+09  CnaDAG3n5r8eiG4su2  192.168.1.132       1900   
2  1.540469e+09  CUrxU238nt0m6yTgKf  192.168.1.132      32893   
3  1.540470e+09  CGQf8t1kjdxB5PHXL4  192.168.1.132      53395   
4  1.540470e+09  CUo9DH2QDnCaBIGjkg  192.168.1.132      52801   

         id.resp_h  id.resp_p proto service    duration orig_bytes  ...  \
0     216.239.35.4        123   udp       -    0.114184         48  ...   
1  239.255.255.250       1900   udp       -  160.367579       7536  ...   
2     216.239.35.8        123   udp       -    0.016986         48  ...   
3       2.16.60.82        443   tcp       -    0.003497          0  ...   
4      192.168.1.1         53   udp     dns    0.036724         34  ...   

  local_resp missed_bytes history orig_pkts  orig_ip_bytes resp_pkts  \
0          -            0      Dd         1             76         1   
1          -  

In [4]:
#Unique values in each column
data.nunique().sort_values(ascending=False)

ts                13202765
uid               13202765
id.resp_h         10518724
duration            112718
id.orig_p            65536
id.resp_p            65421
id.orig_h            18539
resp_ip_bytes         1866
orig_ip_bytes         1815
resp_bytes             900
orig_bytes             605
history                287
orig_pkts              198
resp_pkts              151
missed_bytes            24
conn_state              13
detailed_label          13
service                  7
proto                    3
label                    3
tunnel_parents           2
local_resp               1
local_orig               1
dtype: int64

In [5]:
#Removing columns with only one unique value, except ts. And removing variables with only one unique value
data.drop(["uid", "local_orig","local_resp", "tunnel_parents", "id.orig_h", "id.resp_h"], axis=1, inplace=True)

#Replace "-" and "(empty)" with np.nan
data.replace({'-':np.nan, "(empty)":np.nan}, inplace=True)

#convert the columns to their appropriate data types
dtype_convert_dict = {
    "duration": float,
    "orig_bytes": float,
    "resp_bytes": float
}
data = data.astype(dtype_convert_dict)

data


Unnamed: 0,ts,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,detailed_label
0,1.540469e+09,58687,123,udp,,0.114184,48.0,48.0,SF,0,Dd,1,76,1,76,benign,
1,1.540469e+09,1900,1900,udp,,160.367579,7536.0,0.0,S0,0,D,24,8208,0,0,benign,
2,1.540469e+09,32893,123,udp,,0.016986,48.0,48.0,SF,0,Dd,1,76,1,76,benign,
3,1.540470e+09,53395,443,tcp,,0.003497,0.0,0.0,SF,0,ShAFf,5,212,3,144,benign,
4,1.540470e+09,52801,53,udp,dns,0.036724,34.0,311.0,SF,0,Dd,1,62,1,339,benign,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13202760,1.569018e+09,11584,62336,tcp,,,,,OTH,0,C,0,0,0,0,Malicious,DDoS
13202761,1.569018e+09,11584,62336,tcp,,,,,OTH,0,C,0,0,0,0,Malicious,DDoS
13202762,1.569018e+09,41036,62336,tcp,,,,,OTH,0,C,0,0,0,0,Malicious,DDoS
13202763,1.569018e+09,41036,62336,tcp,,,,,OTH,0,C,0,0,0,0,Malicious,DDoS


In [6]:
#Remove rows from "detail_label" column that have less than 100 instances
print(data["detailed_label"].unique())
print(data["detailed_label"].value_counts())


[nan 'PartOfAHorizontalPortScan' 'C&C' 'Attack' 'C&C-HeartBeat' 'Okiru'
 'DDoS' 'C&C-Torii' 'C&C-FileDownload' 'Okiru-Attack' 'FileDownload'
 'C&C-HeartBeat-FileDownload' 'C&C-Mirai']
detailed_label
PartOfAHorizontalPortScan     7459217
Okiru                         2626252
DDoS                          1263023
C&C                             15527
Attack                           6943
C&C-HeartBeat                    2563
C&C-FileDownload                   50
C&C-Torii                          30
FileDownload                       18
C&C-HeartBeat-FileDownload         11
Okiru-Attack                        3
C&C-Mirai                           1
Name: count, dtype: int64


In [7]:
#encode the detailed_label column
label_encoder = LabelEncoder()
data["label"] = label_encoder.fit_transform(data["label"])
#save name of the classes
classes = label_encoder.classes_
data["detailed_label"] = label_encoder.fit_transform(data["detailed_label"])
#save name of the detailed classes
detailed_classes = label_encoder.classes_



#data = pd.get_dummies(data, columns=["proto", "service", "conn_state", "history"])
#data["service"] = label_encoder.fit_transform(data["service"])
#data["proto"] = label_encoder.fit_transform(data["proto"])
#data["conn_state"] = label_encoder.fit_transform(data["conn_state"])
#data["history"] = label_encoder.fit_transform(data["history"])
#data

print(classes)
print(detailed_classes)

#save the classe and detailed classes in a file
np.save("combined_data/classes.npy", classes)
np.save("combined_data/detailed_classes.npy", detailed_classes)

data

['Benign' 'Malicious' 'benign']
['Attack' 'C&C' 'C&C-FileDownload' 'C&C-HeartBeat'
 'C&C-HeartBeat-FileDownload' 'C&C-Mirai' 'C&C-Torii' 'DDoS'
 'FileDownload' 'Okiru' 'Okiru-Attack' 'PartOfAHorizontalPortScan' nan]


Unnamed: 0,ts,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,detailed_label
0,1.540469e+09,58687,123,udp,,0.114184,48.0,48.0,SF,0,Dd,1,76,1,76,2,12
1,1.540469e+09,1900,1900,udp,,160.367579,7536.0,0.0,S0,0,D,24,8208,0,0,2,12
2,1.540469e+09,32893,123,udp,,0.016986,48.0,48.0,SF,0,Dd,1,76,1,76,2,12
3,1.540470e+09,53395,443,tcp,,0.003497,0.0,0.0,SF,0,ShAFf,5,212,3,144,2,12
4,1.540470e+09,52801,53,udp,dns,0.036724,34.0,311.0,SF,0,Dd,1,62,1,339,2,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13202760,1.569018e+09,11584,62336,tcp,,,,,OTH,0,C,0,0,0,0,1,7
13202761,1.569018e+09,11584,62336,tcp,,,,,OTH,0,C,0,0,0,0,1,7
13202762,1.569018e+09,41036,62336,tcp,,,,,OTH,0,C,0,0,0,0,1,7
13202763,1.569018e+09,41036,62336,tcp,,,,,OTH,0,C,0,0,0,0,1,7


In [8]:
#Save the data to a csv file
data.to_csv("combined_data/cleaned_data.csv", index=False)

#Free memory
del data