In [38]:
import os
import pandas as pd
import gc
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

In [39]:
# Load the data
conn_log_path = "BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-34-1/bro/conn.log.labeled"

# The first six lines are skipped and the seventh is read as a single headerless
# row; its first tab-separated token is discarded and the remaining tokens are
# used as the column names.
header_row = pd.read_csv(conn_log_path, sep='\t', skiprows=6, nrows=1, header=None)
data_columns = header_row.iloc[0][1:]

# comment="#" makes pandas ignore the "#"-prefixed lines (and anything that
# follows a "#" inside a line), so only the records themselves are parsed.
data = pd.read_csv(conn_log_path, sep='\t', comment="#", header=None)
data.columns = data_columns
data


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents label detailed-label
0,1.545404e+09,CrDn63WjJEmrWGjqf,192.168.1.195,41040,185.244.25.235,80,tcp,-,3.139211,0,...,S0,-,-,0,S,3,180,0,0,- Benign -
1,1.545404e+09,CY9lJW3gh1Eje4usP6,192.168.1.195,41040,185.244.25.235,80,tcp,-,-,-,...,S0,-,-,0,S,1,60,0,0,- Benign -
2,1.545404e+09,CcFXLynukEDnUlvgl,192.168.1.195,41040,185.244.25.235,80,tcp,-,-,-,...,S0,-,-,0,S,1,60,0,0,- Benign -
3,1.545404e+09,CDrkrSobGYxHhYfth,192.168.1.195,41040,185.244.25.235,80,tcp,http,1.477656,149,...,SF,-,-,2896,ShADadttcfF,94,5525,96,139044,- Benign -
4,1.545404e+09,CTWZQf2oJSvq6zmPAc,192.168.1.195,41042,185.244.25.235,80,tcp,-,3.147116,0,...,S0,-,-,0,S,3,180,0,0,- Benign -
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23140,1.545490e+09,C2F17zSUnGOcWzBa7,192.168.1.195,57110,185.244.25.235,6667,tcp,irc,32.840994,62,...,S3,-,-,0,ShAdDaf,7,434,6,589,- Malicious C&C
23141,1.545490e+09,C93P4z4k5IRJD1rXJg,192.168.1.195,57092,185.244.25.235,6667,tcp,irc,36.290833,62,...,S3,-,-,0,ShAdDaf,10,606,7,632,- Malicious C&C
23142,1.545490e+09,CXLZ3A2QY5E8weqpDk,192.168.1.195,123,147.251.48.140,123,udp,-,-,-,...,S0,-,-,0,D,1,76,0,0,- Benign -
23143,1.545490e+09,CuXpFN3fWesWBXUhq1,192.168.1.195,123,82.113.53.40,123,udp,-,-,-,...,S0,-,-,0,D,1,76,0,0,- Benign -


In [40]:
# Split the last combined column into three separate columns.
# The header's final field is the single string below, and each record stores
# the three values in it as one whitespace-separated string.
combined_column_name = "tunnel_parents   label   detailed-label"

# Tokenise every row once (vectorised) instead of running three separate
# per-row Python lambdas that each repeat the same split. The column is looked
# up by name so the column that is read is the same one that is dropped below.
split_parts = data[combined_column_name].str.split(expand=True)
tunnel_parents_column = split_parts[0]
label_column = split_parts[1]
detailed_label_column = split_parts[2]

# Replace the combined column with the three individual ones.
data.drop([combined_column_name], axis=1, inplace=True)
data["tunnel_parents"] = tunnel_parents_column
data["label"] = label_column
data["detailed_label"] = detailed_label_column
data


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed_label
0,1.545404e+09,CrDn63WjJEmrWGjqf,192.168.1.195,41040,185.244.25.235,80,tcp,-,3.139211,0,...,-,0,S,3,180,0,0,-,Benign,-
1,1.545404e+09,CY9lJW3gh1Eje4usP6,192.168.1.195,41040,185.244.25.235,80,tcp,-,-,-,...,-,0,S,1,60,0,0,-,Benign,-
2,1.545404e+09,CcFXLynukEDnUlvgl,192.168.1.195,41040,185.244.25.235,80,tcp,-,-,-,...,-,0,S,1,60,0,0,-,Benign,-
3,1.545404e+09,CDrkrSobGYxHhYfth,192.168.1.195,41040,185.244.25.235,80,tcp,http,1.477656,149,...,-,2896,ShADadttcfF,94,5525,96,139044,-,Benign,-
4,1.545404e+09,CTWZQf2oJSvq6zmPAc,192.168.1.195,41042,185.244.25.235,80,tcp,-,3.147116,0,...,-,0,S,3,180,0,0,-,Benign,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23140,1.545490e+09,C2F17zSUnGOcWzBa7,192.168.1.195,57110,185.244.25.235,6667,tcp,irc,32.840994,62,...,-,0,ShAdDaf,7,434,6,589,-,Malicious,C&C
23141,1.545490e+09,C93P4z4k5IRJD1rXJg,192.168.1.195,57092,185.244.25.235,6667,tcp,irc,36.290833,62,...,-,0,ShAdDaf,10,606,7,632,-,Malicious,C&C
23142,1.545490e+09,CXLZ3A2QY5E8weqpDk,192.168.1.195,123,147.251.48.140,123,udp,-,-,-,...,-,0,D,1,76,0,0,-,Benign,-
23143,1.545490e+09,CuXpFN3fWesWBXUhq1,192.168.1.195,123,82.113.53.40,123,udp,-,-,-,...,-,0,D,1,76,0,0,-,Benign,-


In [41]:
# Number of distinct values per column, listed from most to fewest
unique_counts = data.nunique()
unique_counts.sort_values(ascending=False)

0
ts                23145
uid               23145
duration           4654
id.orig_p          4383
orig_ip_bytes       108
resp_ip_bytes        62
orig_pkts            53
id.resp_h            49
resp_bytes           44
orig_bytes           29
resp_pkts            28
history              26
id.resp_p            10
conn_state            6
service               5
detailed_label        4
missed_bytes          3
proto                 2
id.orig_h             2
label                 2
local_orig            1
local_resp            1
tunnel_parents        1
dtype: int64

In [42]:
# Drop columns that are not useful as model features:
#   - "uid": high-cardinality identifier (as many unique values as "ts")
#   - "local_orig", "local_resp", "tunnel_parents": only one unique value each
#   - "id.orig_h", "id.resp_h": host addresses, removed to avoid bias in the model
# "ts" is kept.
columns_to_drop = ["uid", "local_orig", "local_resp", "tunnel_parents", "id.orig_h", "id.resp_h"]

# "-" and "(empty)" are placeholders for missing values; map both to np.nan
missing_markers = {'-': np.nan, "(empty)": np.nan}

# Columns that hold numbers but contained "-" placeholders, so they need an
# explicit cast once the placeholders have become NaN
dtype_convert_dict = {
    "duration": float,
    "orig_bytes": float,
    "resp_bytes": float
}

# Build the cleaned frame as one chain and rebind `data` once, so that a failure
# in any step leaves `data` unchanged instead of half-modified.
data = (
    data
    .drop(columns=columns_to_drop)
    .replace(missing_markers)
    .astype(dtype_convert_dict)
)

data

Unnamed: 0,ts,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,detailed_label
0,1.545404e+09,41040,80,tcp,,3.139211,0.0,0.0,S0,0,S,3,180,0,0,Benign,
1,1.545404e+09,41040,80,tcp,,,,,S0,0,S,1,60,0,0,Benign,
2,1.545404e+09,41040,80,tcp,,,,,S0,0,S,1,60,0,0,Benign,
3,1.545404e+09,41040,80,tcp,http,1.477656,149.0,128252.0,SF,2896,ShADadttcfF,94,5525,96,139044,Benign,
4,1.545404e+09,41042,80,tcp,,3.147116,0.0,0.0,S0,0,S,3,180,0,0,Benign,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23140,1.545490e+09,57110,6667,tcp,irc,32.840994,62.0,269.0,S3,0,ShAdDaf,7,434,6,589,Malicious,C&C
23141,1.545490e+09,57092,6667,tcp,irc,36.290833,62.0,260.0,S3,0,ShAdDaf,10,606,7,632,Malicious,C&C
23142,1.545490e+09,123,123,udp,,,,,S0,0,D,1,76,0,0,Benign,
23143,1.545490e+09,123,123,udp,,,,,S0,0,D,1,76,0,0,Benign,


In [43]:
# Encode the "label" and "detailed_label" columns as integer class ids
label_encoder = LabelEncoder()

data["label"] = label_encoder.fit_transform(data["label"])
# Save the names of the classes; this must be read before the encoder is
# refitted on "detailed_label" below.
classes = label_encoder.classes_

data["detailed_label"] = label_encoder.fit_transform(data["detailed_label"])
# Save the names of the detailed classes
detailed_classes = label_encoder.classes_

# NOTE: "proto", "service", "conn_state" and "history" are not encoded in this
# cell; the earlier get_dummies / LabelEncoder experiments for them were disabled.

print(classes)
print(detailed_classes)

# Save the classes and detailed classes to files. Create the output directory
# first so np.save does not fail when it is missing.
# NOTE(review): detailed_classes holds a float nan next to strings (see the
# printed output), so it is stored as an object array and reading it back needs
# np.load(..., allow_pickle=True).
os.makedirs("combined_data", exist_ok=True)
np.save("combined_data/classes.npy", classes)
np.save("combined_data/detailed_classes.npy", detailed_classes)

data

['Benign' 'Malicious']
['C&C' 'DDoS' 'PartOfAHorizontalPortScan' nan]


Unnamed: 0,ts,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,detailed_label
0,1.545404e+09,41040,80,tcp,,3.139211,0.0,0.0,S0,0,S,3,180,0,0,0,3
1,1.545404e+09,41040,80,tcp,,,,,S0,0,S,1,60,0,0,0,3
2,1.545404e+09,41040,80,tcp,,,,,S0,0,S,1,60,0,0,0,3
3,1.545404e+09,41040,80,tcp,http,1.477656,149.0,128252.0,SF,2896,ShADadttcfF,94,5525,96,139044,0,3
4,1.545404e+09,41042,80,tcp,,3.147116,0.0,0.0,S0,0,S,3,180,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23140,1.545490e+09,57110,6667,tcp,irc,32.840994,62.0,269.0,S3,0,ShAdDaf,7,434,6,589,1,0
23141,1.545490e+09,57092,6667,tcp,irc,36.290833,62.0,260.0,S3,0,ShAdDaf,10,606,7,632,1,0
23142,1.545490e+09,123,123,udp,,,,,S0,0,D,1,76,0,0,0,3
23143,1.545490e+09,123,123,udp,,,,,S0,0,D,1,76,0,0,0,3


In [44]:
# Save the data to a csv file. Create the output directory first so this cell
# also works when "combined_data" does not exist yet.
os.makedirs("combined_data", exist_ok=True)
data.to_csv("combined_data/cleaned_data.csv", index=False)

# Free memory: drop the name, then ask the garbage collector to reclaim what it
# can. The trailing ";" keeps the notebook from displaying collect()'s return value.
del data
gc.collect();