In [92]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import random
import matplotlib.pyplot as plt
import seaborn as sns
import keras

In [2]:
from scipy.sparse import csr_matrix  # For sparse matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import plot_model

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Reading datasets: the data is split over 4 input csv files, read without a
# header row (proper column names are assigned in a later cell)
path = '/kaggle/input/unsw-nb15/UNSW-NB15_{}.csv'
dfs = [pd.read_csv(path.format(i), header=None) for i in range(1, 5)]
# Concat all to a single df with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

  dfs.append(pd.read_csv(path.format(i), header = None))
  dfs.append(pd.read_csv(path.format(i), header = None))


In [4]:
# This csv file contains names of all the features
df_col = pd.read_csv('/kaggle/input/unsw-nb15/NUSW-NB15_features.csv', encoding='ISO-8859-1')

In [5]:
# Making column names lower case, removing spaces
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())

In [6]:
# Renaming our dataframe with proper column names
df.columns = df_col['Name']

In [7]:
del df_col

In [8]:
df.shape

(2540047, 49)

In [9]:
df.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [10]:
# checking for null values
df.isnull().sum()

Name
srcip                     0
sport                     0
dstip                     0
dsport                    0
proto                     0
state                     0
dur                       0
sbytes                    0
dbytes                    0
sttl                      0
dttl                      0
sloss                     0
dloss                     0
service                   0
sload                     0
dload                     0
spkts                     0
dpkts                     0
swin                      0
dwin                      0
stcpb                     0
dtcpb                     0
smeansz                   0
dmeansz                   0
trans_depth               0
res_bdy_len               0
sjit                      0
djit                      0
stime                     0
ltime                     0
sintpkt                   0
dintpkt                   0
tcprtt                    0
synack                    0
ackdat                    0
is_sm_ips_ports

In [11]:
df['attack_cat'].value_counts()

Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor              1795
Reconnaissance        1759
 Shellcode            1288
Backdoors              534
Shellcode              223
Worms                  174
Name: attack_cat, dtype: int64

In [12]:
# We don't have "normal" values for "attack_cat", so we must fill Null values with "normal"
df['attack_cat'] = df.attack_cat.fillna(value='normal').apply(lambda x: x.strip().lower())

In [13]:
df['attack_cat'].value_counts()

normal            2218764
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoor             1795
shellcode            1511
backdoors             534
worms                 174
Name: attack_cat, dtype: int64

In [14]:
df['ct_flw_http_mthd'] = df.ct_flw_http_mthd.fillna(value=0)

In [15]:
# Even though it's a binary column, but there're values like 2 and 4
df['is_ftp_login'].value_counts()

0.0    1066593
1.0      43389
4.0        156
2.0         30
Name: is_ftp_login, dtype: int64

In [16]:
df['is_ftp_login'] = (df.is_ftp_login.fillna(value=0)).astype(int)

In [17]:
# There are no Null values in the train data anymore
df.isnull().sum().sum()

0

In [18]:
# Getting name of all the columns
df.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object', name='Name')

In [19]:
# Share of each class in the binary target. df_0 / df_1 are fractions in [0, 1];
# the "%" format spec multiplies by 100 when printing, so the message really
# shows percentages (the old message printed the raw fraction next to a "%").
label_counts = df['label'].value_counts()
n_rows = len(df.index)
df_0, df_1 = label_counts[0] / n_rows, label_counts[1] / n_rows
print("In data: there are {:.2%} of class 0 and {:.2%} of class 1".format(df_0, df_1))

In data: there are 0.8735129704292873 % of class 0 and 0.12648702957071267 % of class 1


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540047 entries, 0 to 2540046
Data columns (total 49 columns):
 #   Column            Dtype  
---  ------            -----  
 0   srcip             object 
 1   sport             object 
 2   dstip             object 
 3   dsport            object 
 4   proto             object 
 5   state             object 
 6   dur               float64
 7   sbytes            int64  
 8   dbytes            int64  
 9   sttl              int64  
 10  dttl              int64  
 11  sloss             int64  
 12  dloss             int64  
 13  service           object 
 14  sload             float64
 15  dload             float64
 16  spkts             int64  
 17  dpkts             int64  
 18  swin              int64  
 19  dwin              int64  
 20  stcpb             int64  
 21  dtcpb             int64  
 22  smeansz           int64  
 23  dmeansz           int64  
 24  trans_depth       int64  
 25  res_bdy_len       int64  
 26  sjit          

In [21]:
# All the datatypes in our dataset
df.dtypes.value_counts()

int64      29
float64    11
object      9
dtype: int64

In [22]:
# Categorical feature names
df.select_dtypes(exclude=np.number).columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
       'ct_ftp_cmd', 'attack_cat'],
      dtype='object', name='Name')

In [23]:
# In the research paper it was mentioned that, this is a numerical feature not a categorical
df['ct_ftp_cmd'].unique()

array([0, 1, 6, 2, 4, 8, 5, 3, '0', '1', ' ', '2', '4'], dtype=object)

In [24]:
# Removing empty space and converting it to numerical
df['ct_ftp_cmd'] = df['ct_ftp_cmd'].replace(to_replace=' ', value=0).astype(int)

In [25]:
df['ct_ftp_cmd'].unique()

array([0, 1, 6, 2, 4, 8, 5, 3])

In [26]:
# Categorical feature names
df.select_dtypes(exclude=np.number).columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
       'attack_cat'],
      dtype='object', name='Name')

In [27]:
# Numeric features names
df.select_dtypes(include=np.number).columns

Index(['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'sload',
       'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object', name='Name')

In [28]:
# see if binary columns are really binary
for col in 'is_sm_ips_ports', 'is_ftp_login':
    print(df[col].value_counts())
    print()

0    2535852
1       4195
Name: is_sm_ips_ports, dtype: int64

0    2496472
1      43389
4        156
2         30
Name: is_ftp_login, dtype: int64



In [29]:
df['is_ftp_login'] = np.where(df['is_ftp_login']>1, 1, df['is_ftp_login'])

In [30]:
df['is_ftp_login'].value_counts()

0    2496472
1      43575
Name: is_ftp_login, dtype: int64

In [31]:
df['service'].value_counts()

-           1246397
dns          781668
http         206273
ftp-data     125783
smtp          81645
ftp           49090
ssh           47160
pop3           1533
dhcp            172
ssl             142
snmp            113
radius           40
irc              31
Name: service, dtype: int64

In [32]:
# removing all the "-" and replacing those with "None"
df['service'] = df['service'].apply(lambda x:"None" if x=="-" else x)

In [33]:
df['service'].value_counts()

None        1246397
dns          781668
http         206273
ftp-data     125783
smtp          81645
ftp           49090
ssh           47160
pop3           1533
dhcp            172
ssl             142
snmp            113
radius           40
irc              31
Name: service, dtype: int64

In [34]:
# The research paper lists 10 unique values, but the data has 11 ('backdoor' and 'backdoors' are counted separately)
df['attack_cat'].nunique()

11

In [35]:
df['attack_cat'].value_counts()

normal            2218764
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoor             1795
shellcode            1511
backdoors             534
worms                 174
Name: attack_cat, dtype: int64

In [36]:
df['attack_cat'] = df['attack_cat'].replace('backdoors','backdoor', regex=True).apply(lambda x: x.strip().lower())

In [37]:
df.shape

(2540047, 49)

In [38]:
# Utility function
def multi_corr(col1, col2="label", df=df):
    '''
    Print the correlation between two columns, before and after log1p.

    Two values are printed: the correlation of `col1` with `col2`, and the
    correlation of log1p(`col1`) with `col2`. Nothing is returned.

    col1 : name of the column to inspect
    col2 : name of the column to correlate against (default "label")
    df   : dataframe holding both columns
    '''
    raw_corr = df[[col1, col2]].corr().iloc[0, 1]

    # Same comparison, but with col1 passed through log1p first
    logged_values = df[col1].apply(np.log1p)
    logged_corr = logged_values.corr(df[col2])

    template = "Correlation : {}\nlog_Correlation: {}"
    print(template.format(raw_corr, logged_corr))

In [39]:
def corr(col1, col2="label", df=df):
    """
    Return the correlation between two columns of `df`.

    col1 : name of the first column
    col2 : name of the second column (default "label")
    df   : dataframe holding both columns
    """
    # 2x2 correlation matrix of the pair; the off-diagonal entry is the answer
    pair_matrix = df[[col1, col2]].corr()
    return pair_matrix.iloc[0, 1]

In [40]:
# Selecting all the features with high correlation values with other features
# Refer: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
# numeric_only=True: the frame still contains string columns (proto, state,
# service, ...), so restrict the correlation to numeric columns explicitly
# instead of relying on the implicit default.
corr_matrix = df.corr(numeric_only=True).abs()

# Select upper triangle of correlation matrix (k=1 leaves out the diagonal).
# The builtin `bool` replaces `np.bool`, an alias deprecated in NumPy 1.20.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

  corr_matrix = df.corr().abs()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [41]:
# We don't want to use these features for plotting because these are having high corr
# And most likely have same kind of plots with already plotted feature
print(to_drop)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']


In [42]:
# removing the features from train and test data
df.drop(columns=to_drop, inplace=True)

In [43]:
df.shape

(2540047, 41)

In [44]:
# creating new features
df['network_bytes'] = df['sbytes'] + df['dbytes']

In [45]:
df.shape

(2540047, 42)

In [46]:
# Dropping columns which are not useful for the classification
# attack_cat is for multiclass classification
# all the other columns are address related and not present in sample train data
df.drop(['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'], axis=1, inplace=True)

In [47]:
df.shape

(2540047, 37)

In [48]:
# Getting number of unique values of all the columns
# If the unique values are high that means it has continuous set of values
col_unique_values = df.nunique()

In [49]:
# If the number of unique values is greater than some threshold, then we will check its correlation
col = col_unique_values[col_unique_values>200].index

In [50]:
# Checking corr value of original col and log1p applied col
# Taking those columns whose number of unique values is greater than the threshold
for column in col:
    print("{:-^30}".format(column))
    multi_corr(column)

-------------dur--------------
Correlation : 0.0019589972041426317
log_Correlation: -0.03236993809890607
------------sbytes------------
Correlation : 0.010180321142271112
log_Correlation: -0.35616309278591746
------------dbytes------------
Correlation : -0.07554308366285474
log_Correlation: -0.5191874410649446
------------sload-------------
Correlation : 0.19227837760254102
log_Correlation: 0.34698617748318455
------------dload-------------
Correlation : -0.21971018813281115
log_Correlation: -0.6030599800899222
------------spkts-------------
Correlation : -0.12088031292918162
log_Correlation: -0.3162904934976556
------------stcpb-------------
Correlation : -0.23342211730231804
log_Correlation: -0.3132591147974861
------------dtcpb-------------
Correlation : -0.2335125339601424
log_Correlation: -0.31311869240806245
-----------smeansz------------
Correlation : -0.06536344918561701
log_Correlation: -0.1512402209417535
-----------dmeansz------------
Correlation : -0.272397044017891
log_Cor

In [51]:
# Will apply log1p on this columns and remove original columns
log1p_col = ['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit', 'network_bytes']

In [52]:
def log1p_transform(col, df=df):
    '''
    Replace a column by its log1p-transformed version, in place.

    A new column named "<col>_log1p" is added to `df` and the original
    column `col` is dropped. `df` itself is modified; nothing is returned.
    '''
    transformed = df[col].apply(np.log1p)
    df[col + '_log1p'] = transformed
    df.drop(columns=col, inplace=True)

In [53]:
# Transforming columns with log1p
for col in log1p_col:
    log1p_transform(col, df=df)

In [54]:
df.shape

(2540047, 37)

In [55]:
df.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'label', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object', name='Name')

In [56]:
df.shape

(2540047, 37)

In [57]:
# creating x and y set from the dataset
X, y = df.drop(columns=['label']), df['label']

In [58]:
# getting categorical and numerical columns in 2 diff lists
cat_col = ['proto', 'service', 'state']
# Every other column is numeric. Built in the frame's own column order so the
# list is identical on every run (a set difference has arbitrary order).
num_col = [name for name in X.columns if name not in cat_col]

In [59]:
X.head()

Name,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,31,29,dns,0,0,0,1421927414,0.017,...,13.123313,13.340377,1.098612,0.0,0.0,4.204693,4.418841,0.0,0.0,5.693732
1,udp,CON,31,29,,0,0,0,1421927414,7.005,...,11.381416,10.829356,1.609438,0.0,0.0,4.890349,4.343805,2.387938,2.458112,6.725034
2,udp,CON,31,29,dns,0,0,0,1421927414,0.017,...,13.165223,13.363399,1.098612,0.0,0.0,4.304065,4.49981,0.0,0.0,5.783825
3,udp,CON,31,29,dns,0,0,0,1421927414,0.043,...,12.98706,13.204124,1.098612,0.0,0.0,4.204693,4.418841,0.0,0.0,5.693732
4,udp,CON,31,29,dns,0,0,0,1421927414,0.005,...,13.12151,13.319686,1.098612,0.0,0.0,4.304065,4.49981,0.0,0.0,5.783825


In [60]:
# Standardizing the data: fit the scaler on the numeric columns
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/validation/test split made later in the notebook, so statistics of the
# held-out rows influence the training features - consider fitting on the
# training rows only.
scaler = StandardScaler().fit(X[num_col])

In [61]:
X[num_col] = scaler.transform(X[num_col])

In [62]:
X.head()

Name,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,-0.425902,-0.041232,dns,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,...,-0.159354,0.46534,-1.081106,-1.190074,-1.189778,-0.534196,0.061438,-0.995473,-0.899954,-1.013666
1,udp,CON,-0.425902,-0.041232,,-1.196094,-0.237863,-0.089307,-1.175319,-0.067041,...,-0.713789,0.023832,-0.7052,-1.190074,-1.189778,0.565458,0.028967,-0.256628,-0.026414,-0.572489
2,udp,CON,-0.425902,-0.041232,dns,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,...,-0.146014,0.469388,-1.081106,-1.190074,-1.189778,-0.374823,0.096478,-0.995473,-0.899954,-0.975125
3,udp,CON,-0.425902,-0.041232,dns,-1.196094,-0.237863,-0.089307,-1.175319,-0.069546,...,-0.202722,0.441383,-1.081106,-1.190074,-1.189778,-0.534196,0.061438,-0.995473,-0.899954,-1.013666
4,udp,CON,-0.425902,-0.041232,dns,-1.196094,-0.237863,-0.089307,-1.175319,-0.06956,...,-0.159928,0.461702,-1.081106,-1.190074,-1.189778,-0.374823,0.096478,-0.995473,-0.899954,-0.975125


In [63]:
# Onehot Encoding: one encoder per categorical column
service_, proto_, state_ = OneHotEncoder(), OneHotEncoder(), OneHotEncoder()

# Each encoder is fitted on its column as a single-feature 2-D array; the
# ohe_* names are bound to whatever fit() returns.
ohe_service = service_.fit(X[['service']].values)
ohe_proto = proto_.fit(X[['proto']].values)
ohe_state = state_.fit(X[['state']].values)

In [64]:
# Replace each categorical column by its one-hot indicator columns,
# named "<column>_<category>"
fitted_encoders = {'proto': ohe_proto, 'service': ohe_service, 'state': ohe_state}
for cat_name, encoder in fitted_encoders.items():
    encoded = encoder.transform(X[cat_name].values.reshape(-1, 1))
    indicator_names = [cat_name + '_' + category for category in encoder.categories_[0]]
    indicators = pd.DataFrame(encoded.todense(), columns=indicator_names)
    # Drop the original categorical column and append the indicators
    X = pd.concat([X.drop(columns=cat_name), indicators], axis=1)

In [65]:
X.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.067041,-0.049722,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069546,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.06956,-0.054997,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Making the train data sparse matrix
x_train_csr = csr_matrix(X.values)

col = X.columns

# Creating sparse dataframe with x_train sparse matrix
X = pd.DataFrame.sparse.from_spmatrix(x_train_csr, columns=col)

In [67]:
X.sparse.to_dense().head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.067041,-0.049722,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069555,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.069546,-0.05499,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.425902,-0.041232,-1.196094,-0.237863,-0.089307,-1.175319,-0.06956,-0.054997,-0.133896,-0.126759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
X.shape, y.shape

((2540047, 197), (2540047,))

In [74]:
# Number of features to select
num_features = 100


def _mi_score(features, target):
    """Mutual information per feature, with a fixed seed so the ranking is repeatable."""
    # mutual_info_classif adds small random noise to continuous features;
    # random_state pins that noise.
    return mutual_info_classif(features, target, random_state=42)


# Select K Best features based on Mutual Information
# NOTE(review): scores are computed on all rows, before the train/validation/test
# split made later in the notebook - consider scoring on the training rows only.
selector = SelectKBest(score_func=_mi_score, k=num_features)
# Fit to data (all columns of X; no hard-coded column count)
selector.fit(X, y)
# Get the scores for each feature
feature_scores = pd.Series(selector.scores_, index=X.columns)
# Get the top num_features based on the scores
top_features = feature_scores.nlargest(num_features)
# Print the top features
print(f"Top {num_features} features:")
for feature in top_features.index:
    print(feature, end=", ")



Top 100 features:
sload_log1p, network_bytes_log1p, sbytes_log1p, ct_state_ttl, sttl, smeansz_log1p, dur_log1p, dload_log1p, dintpkt, dbytes_log1p, dttl, dmeansz_log1p, sintpkt, djit_log1p, sjit_log1p, stcpb_log1p, dtcpb_log1p, stime, tcprtt, synack, ackdat, state_INT, ct_dst_sport_ltm, spkts_log1p, ct_dst_ltm, ct_src_ltm, ct_srv_src, proto_tcp, swin, state_FIN, service_dns, state_CON, proto_udp, service_None, res_bdy_len, proto_unas, service_ftp-data, service_ssh, proto_sctp, service_pop3, ct_flw_http_mthd, trans_depth, proto_ospf, service_smtp, ct_ftp_cmd, is_ftp_login, proto_arp, service_http, service_ftp, proto_any, proto_gre, is_sm_ips_ports, proto_rsvp, proto_ipv6, proto_mobile, proto_pim, proto_sun-nd, proto_swipe, proto_sep, service_dhcp, service_ssl, proto_3pc, proto_a/n, proto_aes-sp3-d, proto_argus, proto_aris, proto_ax.25, proto_bbn-rcc, proto_bna, proto_br-sat-mon, proto_cbt, proto_cftp, proto_chaos, proto_compaq-peer, proto_cphb, proto_cpnx, proto_crtp, proto_crudp, proto

In [75]:
X_sel = X[top_features.index]

In [76]:
# Hold out 20% of the rows as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=42)

# Split the remaining rows again: 12.5% of them become the validation set
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.125, random_state=42)

# The intermediate train+validation objects are not needed any more
del X_trainval, y_trainval

In [77]:
print("Training set:", len(X_train))
print("Validation set:", len(X_val))
print("Testing set:", len(X_test))

Training set: 1778032
Validation set: 254005
Testing set: 508010


In [86]:
# Make sure to replace input_shape and output_classes with the correct values for your dataset
input_shape = (X_train.shape[1], 1)
output_classes = 1

In [87]:
def create_1d_cnn_model(params):
    """
    Build and compile a 1D-CNN binary classifier from a hyper-parameter dict.

    Layout: two Conv1D + MaxPooling1D stages (the second with twice the
    filters), dropout, flatten, `num_dense_layers` Dense + Dropout pairs and a
    sigmoid output layer. Compiled with Adam and binary cross-entropy.

    params : dict with keys 'num_filters', 'kernel_size', 'pooling_size',
             'dropout_rate', 'num_dense_layers', 'num_neurons_in_dense_layers'
             and 'learning_rate'.

    Reads the module-level `input_shape` and `output_classes`.
    Returns the compiled model.
    """
    # NOTE(review): both stages shrink the sequence; large kernel_size /
    # pooling_size pairs may leave too few steps for the second pooling layer
    # - confirm every sampled combination builds for the chosen input length.
    n_filters = params['num_filters']
    kernel = params['kernel_size']
    pool = params['pooling_size']
    drop = params['dropout_rate']

    layers = [
        Conv1D(filters=n_filters, kernel_size=kernel, activation='relu', input_shape=(input_shape)),
        MaxPooling1D(pool_size=pool),
        Conv1D(filters=n_filters*2, kernel_size=kernel, activation='relu'),
        MaxPooling1D(pool_size=pool),
        Dropout(drop),
        Flatten(),
    ]

    # Fully connected head: each Dense layer is followed by dropout
    for _ in range(params['num_dense_layers']):
        layers.append(Dense(params['num_neurons_in_dense_layers'], activation='relu'))
        layers.append(Dropout(drop))

    layers.append(Dense(output_classes, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=params['learning_rate']),
        metrics=['accuracy'],
    )
    return model

In [88]:
def generate_random_parameters():
    """
    Draw one random hyper-parameter set for the 1D-CNN.

    Returns a dict with the keys 'num_filters', 'kernel_size', 'pooling_size',
    'num_dense_layers', 'num_neurons_in_dense_layers', 'dropout_rate',
    'learning_rate', 'batch_size' and 'num_epochs'.
    """
    # The draws are made one by one, in key order
    params = {}
    params['num_filters'] = random.choice([16, 32, 64, 128, 256])
    params['kernel_size'] = random.choice([3, 5, 7, 9, 11])
    params['pooling_size'] = random.randint(2, 6)
    params['num_dense_layers'] = random.randint(1, 5)
    params['num_neurons_in_dense_layers'] = random.choice([16, 32, 64, 128, 256])
    params['dropout_rate'] = random.uniform(0.1, 0.5)
    params['learning_rate'] = random.uniform(1e-5, 1e-2)
    params['batch_size'] = random.choice([16, 32, 64, 128, 256])
    params['num_epochs'] = random.randint(10, 100)
    return params

In [89]:
def genetic_algorithm(population_size, generations, mutation_rate, crossover_rate):
    """
    Search CNN hyper-parameters with a simple genetic algorithm.

    Each generation scores the population, selects parent pairs, produces
    offspring by crossover and mutation, and writes the offspring over the
    lowest-scoring individuals.

    population_size : number of hyper-parameter sets kept per generation
    generations     : number of generations to run
    mutation_rate   : passed to `mutation` (per-parameter mutation probability)
    crossover_rate  : passed to `selection` (controls how many parent pairs are drawn)

    Returns the hyper-parameter dict with the highest fitness in the final
    population.
    """
    population = [generate_random_parameters() for _ in range(population_size)]

    # Every fitness evaluation trains a model from scratch. Individuals that
    # survive a generation unchanged would otherwise be retrained every
    # generation (and the whole population once more for the final max()),
    # so scores are remembered per hyper-parameter set.
    fitness_cache = {}

    def cached_fitness(params):
        # dicts are not hashable; the sorted item tuple identifies the set
        cache_key = tuple(sorted(params.items()))
        if cache_key not in fitness_cache:
            fitness_cache[cache_key] = evaluate_fitness(params)
        return fitness_cache[cache_key]

    for generation in range(generations):
        print(f"Generation {generation+1}")

        # Evaluate fitness of the population
        fitness_scores = []
        for i, params in enumerate(population):
            print(f"  Evaluating individual {i+1}/{population_size}")
            fitness_scores.append(cached_fitness(params))

        # Select parents
        parents = selection(population, fitness_scores, crossover_rate)

        # Crossover
        offspring = crossover(parents)

        # Mutation
        offspring = mutation(offspring, mutation_rate)

        # Replace the worst individuals
        population = update_population(population, fitness_scores, offspring)

    # Only the offspring of the last generation still need training here
    return max(population, key=cached_fitness)

In [90]:
def evaluate_fitness(params):
    """
    Train a model with the given hyper-parameters and return its fitness.

    The model is built by `create_1d_cnn_model`, trained on the module-level
    X_train / y_train with params['batch_size'] and params['num_epochs'], and
    validated on X_val / y_val. One progress line is printed per epoch.

    Returns the validation accuracy of the last epoch.
    """
    # This function is called once per individual, so many models are built in
    # a row. Reset Keras' global state first so the state left behind by the
    # previous models does not pile up in memory.
    keras.backend.clear_session()
    model = create_1d_cnn_model(params)

    # Create a custom callback to print epoch progress
    class PrintEpochProgress(keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"  Epoch {epoch+1}/{params['num_epochs']} - val_accuracy: {logs['val_accuracy']:.4f}")

    history = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['num_epochs'], verbose=0, validation_data=(X_val, y_val), callbacks=[PrintEpochProgress()])
    return history.history['val_accuracy'][-1]

def selection(population, fitness_scores, crossover_rate):
    """
    Draw parent pairs with probability proportional to fitness.

    int(crossover_rate * len(population)) pairs are drawn; each pair is a list
    of two individuals sampled (with replacement) from `population`, weighted
    by `fitness_scores`.

    Returns the list of pairs.
    """
    pair_count = int(crossover_rate * len(population))
    return [
        random.choices(population, weights=fitness_scores, k=2)
        for _ in range(pair_count)
    ]

def crossover(parents):
    """
    Create one child per parent pair by uniform crossover.

    For every key of the first parent, the child takes the value of one of
    the two parents, chosen at random.

    Returns the list of children, one per pair in `parents`.
    """
    return [
        {gene: random.choice([first[gene], second[gene]]) for gene in first.keys()}
        for first, second in parents
    ]

def mutation(offspring, mutation_rate):
    """
    Randomly re-draw individual parameters of each child.

    Every parameter of every child is replaced, with probability
    `mutation_rate`, by the value of the same key in a freshly generated
    random parameter set; otherwise the child's value is kept.

    Returns a new list of new dicts; the input children are not modified.
    """
    mutated_offspring = []
    for child in offspring:
        mutated_offspring.append({
            gene: generate_random_parameters()[gene] if random.random() < mutation_rate else value
            for gene, value in child.items()
        })
    return mutated_offspring

def update_population(population, fitness_scores, offspring):
    """
    Overwrite the lowest-scoring individuals with the offspring.

    population     : list of individuals; modified in place
    fitness_scores : one score per individual, same order as `population`
    offspring      : new individuals; the first replaces the worst
                     individual, the second the second worst, and so on

    If there are more offspring than individuals, the surplus offspring are
    ignored (previously this raised IndexError).

    Returns `population` (the same list object).
    """
    # Positions in the population, ordered from lowest to highest fitness
    worst_first = np.argsort(fitness_scores)
    # zip stops at the shorter of the two sequences
    for slot, child in zip(worst_first, offspring):
        population[slot] = child
    return population

In [None]:
# Seed Python's RNG so the random draws of the search (initial population,
# parent selection, crossover, mutation) are the same on every run.
# This does not make the model training itself deterministic.
random.seed(42)

population_size = 20
generations = 10
mutation_rate = 0.1
crossover_rate = 0.5

best_parameters = genetic_algorithm(population_size, generations, mutation_rate, crossover_rate)

Generation 1
  Evaluating individual 1/20
  Epoch 1/39 - val_accuracy: 0.9905
  Epoch 2/39 - val_accuracy: 0.9910


In [None]:
print("Best parameters found:")
for key, value in best_parameters.items():
    print(f"{key}: {value}")

In [None]:
best_model = create_1d_cnn_model(best_parameters)
history = best_model.fit(X_train, y_train, batch_size=best_parameters['batch_size'], epochs=best_parameters['num_epochs'], validation_data=(X_val, y_val))

In [None]:
plot_model(best_model, to_file='model.png', show_shapes=True, show_layer_names=True)

In [None]:
# Plot the training and validation accuracy over the epochs
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The second curve is val_accuracy (validation split), not the test set
plt.legend(['Train', 'Validation'], loc='upper left')
# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('acc_plo.jpg')
plt.show()

In [None]:
# Plot the training and validation loss over the epochs
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# The second curve is val_loss (validation split), not the test set
plt.legend(['Train', 'Validation'], loc='upper left')
# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('los_plo.jpg')
plt.show()

In [None]:
# Evaluate the model on the training set
train_loss, train_acc = best_model.evaluate(np.expand_dims(X_train, axis=2), y_train, verbose=0)
# The loss is binary cross-entropy, not a percentage: print it as a plain number
print('Train Loss: {:.4f}'.format(train_loss))
print('Train Accuracy: {:.2f}%'.format(train_acc*100))

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = best_model.evaluate(np.expand_dims(X_test, axis=2), y_test, verbose=0)
# The loss is binary cross-entropy, not a percentage: print it as a plain number
print("Test Loss: {:.4f}".format(test_loss))
print("Test Accuracy: {:.2f}%".format(test_accuracy*100))

In [None]:
# Generate predictions on the test set
y_pred = best_model.predict(np.expand_dims(X_test, axis=2))

In [None]:
# Performance comparison table
print(classification_report(y_test, np.round(y_pred)))

In [None]:
# Compute the confusion matrix (predictions rounded to 0 / 1)
conf_mat = confusion_matrix(y_test, np.round(y_pred))

# Define the class labels
class_labels = ['No Intrusion', 'Intrusion']

# Create a heatmap plot of the confusion matrix
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)

# Set the plot labels and title
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('con_max.jpg')
plt.show()

In [None]:
# Predict the test set
y_pred = best_model.predict(np.expand_dims(X_test, axis=2))
# Threshold under a separate name: y_pred keeps the raw model outputs, which
# the ROC and precision-recall cells further down pass to their curve functions
y_pred_labels = (y_pred > 0.5)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred_labels)
# Divide each row by its total, so every row sums to 1
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Print confusion matrix as heatmap with percentages
plt.figure(figsize=(8,6))
sns.heatmap(cm_norm, annot=True, cmap='Blues', fmt='.2f')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix with Percentages')
# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('con_muxf.jpg')
plt.show()

In [None]:
# ROC curve
# The curve is traced by sweeping a threshold over continuous scores, so it is
# computed from the raw model outputs; already-thresholded labels would give a
# single operating point only.
y_score = best_model.predict(np.expand_dims(X_test, axis=2)).ravel()
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("ROC Curve")
# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('roc_cur.jpg')
plt.show()

In [None]:
# Precision-Recall curve
# Computed from the raw model outputs: the curve needs continuous scores,
# not already-thresholded labels.
y_score = best_model.predict(np.expand_dims(X_test, axis=2)).ravel()
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title("Precision-Recall Curve")
# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('pre_rec.jpg')
plt.show()

In [None]:
# Get the validation accuracy and loss for each epoch from the history object.
# These are the val_* series (validation split), so they get their own names
# instead of overwriting the scalar test_loss returned by evaluate().
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']

# Plot the validation accuracy and loss
plt.plot(val_acc, label='Validation Accuracy')
plt.plot(val_loss, label='Validation Loss')

# Add axis labels and a legend
plt.xlabel('Epoch')
plt.ylabel('Accuracy/Loss')
plt.title('Validation Accuracy and Loss')
plt.legend()

# Save before show(): once show() has displayed the figure, a later savefig()
# would write an empty image
plt.savefig('tes_los.jpg')
plt.show()