In [1]:
"""
-------------------------------------------Imports-------------------------------------------
"""
import pandas as pd
import keras
import seaborn as sns
import tensorflow
from keras import layers, regularizers, Input, Model
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder

## Variables

In [2]:
"""
-------------------------------------------Variables--------------------------------
"""
# -------- Pandas Dataframe
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns that mix numbers and text come back
# with one consistent dtype and the load no longer emits a warning
# (presumably a mixed-type DtypeWarning -- confirm against the cell output).
df = pd.read_csv("dataset/UNSW_2018_IoT_Botnet_Full5pc_4.csv", low_memory=False)

# columns
columns = df.columns.tolist()

# column names categorized into three kinds of datatypes: nominal, discrete, continuous
nominal = [col for col in columns if df[col].dtype == 'object']
discrete = [col for col in columns if df[col].dtype == 'int64']
continuous = [col for col in columns if df[col].dtype == 'float']

# categorized dataframe
df_nominal = df.select_dtypes(include=['object'])
df_discrete = df.select_dtypes(include=['int64'])
df_continuous = df.select_dtypes(include=['float'])

#------------- enumerate nominal categories using integers
# NOTE(review): this only picks up int8/int16 columns; a frame straight from
# read_csv presumably has none, which would leave this frame empty -- confirm.
df_categorical = df.select_dtypes(include=['int8', 'int16'])

# -------- Balanced subset
df_class_0 = df[df['attack'] == 0]
df_class_1 = df[df['attack'] == 1]
# Down-sample the attack rows to the number of non-attack rows instead of a
# hard-coded count, so the two classes end up the same size whatever the file
# contains. min() keeps sample() from asking for more rows than class 1 has.
df_class_1_subset = df_class_1.sample(
    n=min(len(df_class_0), len(df_class_1)),
    random_state=100,
)
df_balanced=pd.concat([df_class_1_subset, df_class_0])

  df = pd.read_csv("dataset/UNSW_2018_IoT_Botnet_Full5pc_4.csv")


## Explore

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
# (rows, columns) of each class frame, the sampled subset and the balanced frame.
print(df_class_1.shape)
print(df_class_0.shape)
print(df_class_1_subset.shape)
print(df_balanced.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668522 entries, 0 to 668521
Data columns (total 46 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   pkSeqID                           668522 non-null  int64  
 1   stime                             668522 non-null  float64
 2   flgs                              668522 non-null  object 
 3   flgs_number                       668522 non-null  int64  
 4   proto                             668522 non-null  object 
 5   proto_number                      668522 non-null  int64  
 6   saddr                             668522 non-null  object 
 7   sport                             668522 non-null  object 
 8   daddr                             668522 non-null  object 
 9   dport                             668522 non-null  object 
 10  pkts                              668522 non-null  int64  
 11  bytes                             668522 non-null  i

In [4]:
# Print the frequency table of every column in turn; the custom line ending
# reproduces the trailing space and blank line between consecutive tables.
for col in df.columns:
    print(df[col].value_counts(), end=' \n\n')

pkSeqID
3000001    1
3445695    1
3445677    1
3445678    1
3445679    1
          ..
3222843    1
3222844    1
3222845    1
3222846    1
3668522    1
Name: count, Length: 668522, dtype: int64 

stime
1.528099e+09    29
1.528099e+09    28
1.528099e+09    28
1.528099e+09    27
1.528099e+09    27
                ..
1.526345e+09     1
1.526345e+09     1
1.526345e+09     1
1.526345e+09     1
1.529381e+09     1
Name: count, Length: 88102, dtype: int64 

flgs
e        654807
eU        11159
e s        2331
e d         113
e g          99
e &           5
e   t         4
e *           3
e  D          1
Name: count, dtype: int64 

flgs_number
1    654807
6     11159
2      2331
3       113
5        99
7         5
8         4
4         3
9         1
Name: count, dtype: int64 

proto
udp          592145
tcp           67202
icmp           9030
arp             137
ipv6-icmp         8
Name: count, dtype: int64 

proto_number
3    592145
1     67202
4      9030
2       137
5         8
Name: count, dt

## Transform Data

### Normalization

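Min-max normalization rescales each feature so that its minimum becomes 0 and its maximum becomes 1:
$x' = (x - x_{min}) / (x_{max} - x_{min})$,
so every rescaled value satisfies $x' \in [0, 1]$.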

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Fit a min-max scaler on the int64 columns and rescale them in a single step.
# The fitted scaler stays available as min_max_scaler.
min_max_scaler = MinMaxScaler()
df_discrete_normalized = min_max_scaler.fit_transform(df_discrete)

# The scaler returns a plain array, so wrap it with the original column names
# before summarising; the summary is the cell's displayed result.
pd.DataFrame(df_discrete_normalized, columns=discrete).describe()

Unnamed: 0,pkSeqID,flgs_number,proto_number,pkts,bytes,state_number,seq,spkts,dpkts,sbytes,...,TnBPDstIP,TnP_PSrcIP,TnP_PDstIP,TnP_PerProto,TnP_Per_Dport,N_IN_Conn_P_DstIP,N_IN_Conn_P_SrcIP,Pkts_P_State_P_Protocol_P_DestIP,Pkts_P_State_P_Protocol_P_SrcIP,attack
count,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,...,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0,668522.0
mean,0.5,0.010999,0.45307,0.000141,2.8e-05,0.276171,0.439465,0.000265,1.8e-05,4.1e-05,...,0.000286,0.003466,0.004107,0.004334,0.003696,0.977201,0.816867,0.008171,0.006568,0.999286
std,0.288676,0.080699,0.154235,0.003857,0.003661,0.090049,0.300135,0.005144,0.00332,0.004464,...,0.004408,0.00505,0.005063,0.014353,0.005524,0.12425,0.265339,0.005826,0.005215,0.026702
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.5,8.6e-05,5e-06,0.3,0.14415,0.000171,0.0,1e-05,...,0.00019,0.002106,0.0031,0.003061,0.00286,1.0,0.676768,0.006211,0.004028,1.0
50%,0.5,0.0,0.5,0.0001,6e-06,0.3,0.447383,0.0002,0.0,1.2e-05,...,0.000247,0.0031,0.004009,0.003967,0.003678,1.0,1.0,0.008024,0.005927,1.0
75%,0.75,0.0,0.5,0.000157,1e-05,0.3,0.681236,0.000285,0.0,1.6e-05,...,0.000332,0.004873,0.005388,0.00532,0.00493,1.0,1.0,0.010769,0.009318,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Rescale every float column of df in place with a min-max scaler, printing the
# summary statistics before and after and drawing a KDE plot of the result.
_scaler = MinMaxScaler()
for _col in continuous:
    print(f'before normalizing: {_col}', df[_col].describe(), '\n', sep='\n')
    # The scaler expects a 2-D array, hence the reshape to a single-column
    # matrix; ravel() flattens the result back to 1-D before it is stored.
    df[_col] = _scaler.fit_transform(df[_col].values.reshape(-1, 1)).ravel()
    print(f'after normalizing: {_col}', df[_col].describe(), '\n', sep='\n')
    # displot draws the figure itself; printing its return value would only add
    # the FacetGrid object's repr to the output, so the call is left bare.
    sns.displot(data=df, x=_col, kind='kde')

### Standardization

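Standardization rescales each feature to have mean 0 and standard deviation 1:
$z = (x - \mu) / \sigma$, where $\mu$ and $\sigma$ are the feature's mean and standard deviation.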

In [8]:
# Standardize the int64 columns of df in place, printing the value counts
# before and after so the effect on each column is visible.
_scaler = StandardScaler()
for _col in discrete:
    # 'attack' is the 0/1 class label (it is what the balanced subset and the
    # stratified split key on); z-scoring it would destroy the labels, so it is
    # left untouched.
    # NOTE(review): pkSeqID looks like a row identifier rather than a feature --
    # confirm whether it should be scaled at all.
    if _col == 'attack':
        continue
    print(f'before standardizing: {_col}', df[_col].value_counts(), '\n', sep='\n')
    # The scaler expects a 2-D array, hence the reshape to a single-column
    # matrix; ravel() flattens the result back to 1-D before it is stored.
    df[_col] = _scaler.fit_transform(df[_col].values.reshape(-1, 1)).ravel()
    print(f'after standardizing: {_col}', df[_col].value_counts(), '\n', sep='\n')

before standardizing: pkSeqID
3000001    1
3445695    1
3445677    1
3445678    1
3445679    1
          ..
3222843    1
3222844    1
3222845    1
3222846    1
3668522    1
Name: pkSeqID, Length: 668522, dtype: int64


after standardizing: pkSeqID
-1.732048    1
 0.577418    1
 0.577325    1
 0.577330    1
 0.577336    1
            ..
-0.577341    1
-0.577336    1
-0.577330    1
-0.577325    1
 1.732048    1
Name: pkSeqID, Length: 668522, dtype: int64


before standardizing: flgs_number
1    654807
6     11159
2      2331
3       113
5        99
7         5
8         4
4         3
9         1
Name: flgs_number, dtype: int64


after standardizing: flgs_number
-0.136293     654807
 7.608529      11159
 1.412672       2331
 2.961636        113
 6.059565         99
 9.157493          5
 10.706458         4
 4.510600          3
 12.255422         1
Name: flgs_number, dtype: int64


before standardizing: proto_number
3    592145
1     67202
4      9030
2       137
5         8
Name: proto_nu

### Enumerate nominal columns

In [9]:
# Replace every object-typed column of df with integer codes, in place.
# One fitted encoder is kept per column in label_encoders so the codes can be
# mapped back to the original categories (inverse_transform) or applied to new
# data; refitting a single shared encoder would keep only the last mapping.
label_encoders = {}
for _col in nominal:
    print(f'before enumerating: {_col}', df[_col].value_counts(), '\n', sep='\n')
    le = LabelEncoder()
    # Columns that mix numbers and text cannot be sorted by the encoder and make
    # it raise TypeError, so every column is cast to str up front; for columns
    # that already hold only strings the cast changes nothing.
    df[_col] = le.fit_transform(df[_col].astype(str))
    label_encoders[_col] = le
    print(f'after enumerating: {_col}', df[_col].value_counts(), '\n',  sep='\n')

before enumerating: flgs
e        654807
eU        11159
e s        2331
e d         113
e g          99
e &           5
e   t         4
e *           3
e  D          1
Name: flgs, dtype: int64


after enumerating: flgs
0    654807
8     11159
7      2331
5       113
6        99
3         5
1         4
4         3
2         1
Name: flgs, dtype: int64


before enumerating: proto
udp          592145
tcp           67202
icmp           9030
arp             137
ipv6-icmp         8
Name: proto, dtype: int64


after enumerating: proto
4    592145
3     67202
1      9030
0       137
2         8
Name: proto, dtype: int64


before enumerating: saddr
192.168.100.147              182905
192.168.100.148              167582
192.168.100.150              161085
192.168.100.149              147091
192.168.100.5                  5029
192.168.100.3                  4673
192.168.100.6                    38
192.168.100.7                    30
192.168.100.4                    19
192.168.100.1               

## Auto Encoder

### Baseline
latent space dimension = 30 (the value of `encoding_dim` in the code)
1 hidden layer

In [10]:
# TODO: Set up the encoder to compute the reconstruction error
# Width of the bottleneck, i.e. the size of the encoded representation.
encoding_dim = 30  # compression factor of ~2/3 per layer, assuming the input is 45 floats

# Input layer: one 45-dimensional feature vector per sample.
Inputs = Input(shape=(45,))

# Encoder: a single ReLU layer with an L1 activity penalty of 1e-4 on its output.
encoded = layers.Dense(
    units=encoding_dim,
    activation='relu',
    activity_regularizer=regularizers.l1(1e-4),
)(Inputs)

# Decoder: maps the encoding back to 45 values, the lossy reconstruction.
# NOTE(review): a sigmoid output trained with binary cross-entropy presumes the
# input features lie in [0, 1] -- confirm the data fed to fit() is scaled so.
decoded = layers.Dense(units=45, activation='sigmoid')(encoded)

# End-to-end model: input -> encoding -> reconstruction.
autoencoder = Model(inputs=Inputs, outputs=decoded)
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

Stratify

In [14]:
# Split the data into smaller subsets with stratified samples
def stratify(X=None, y=None, size=None):
    """ Generates 5 stratified shuffle splits of the data.

    When neither X nor y is given, the UNSW CSV is loaded, optionally capped to
    its first ``size`` rows, and split on its 'attack' column.

    :param X: Dataframe of features (the 'attack' column excluded), or None to load the CSV
    :param y: class labels aligned with X, or None to load the CSV
    :param size: cap on the number of rows taken from the top of the CSV; None or 0
                 keeps every row. Ignored when X and y are supplied.
    :return: generator yielding 5 (train_indices, test_indices) tuples; the indices
             are row positions into X / y, not the rows themselves
    :raises ValueError: if only one of X and y is supplied
    """
    # Compare with None explicitly: the truth value of a DataFrame is ambiguous
    # and `not X` raises ValueError as soon as real data is passed in.
    if (X is None) != (y is None):
        raise ValueError("X and y must be supplied together")
    if X is None:
        # Only touch the file when the caller did not bring their own data.
        df = pd.read_csv('dataset/UNSW_2018_IoT_Botnet_Full5pc_4.csv', low_memory=False)
        if size:
            # Keep the first `size` rows; slicing from `size` onwards would drop
            # them instead of capping the data.
            # NOTE(review): the rows come from the top of the file, so the cap
            # must be large enough to include every class -- confirm for this file.
            df = df.iloc[:size]
        X = df.drop(columns=["attack"])
        y = df['attack']
    sss = StratifiedShuffleSplit(n_splits=5, random_state=0)
    return sss.split(X, y)

# Walk the generator returned by stratify() and store each split under its
# running number, with the two yielded values filed as 'train' and 'test'.
splits = stratify(size=10000)
stratified_data = {}
for i, (Xtr, Xte) in enumerate(splits):
    stratified_data[i] = dict(train=Xtr, test=Xte)

  df = pd.read_csv('dataset/UNSW_2018_IoT_Botnet_Full5pc_4.csv')
