<a href="https://colab.research.google.com/github/Nitesh-Kumar-074/NetwrokIntrusionDetectionSystem/blob/main/NIDS3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle

In [None]:

from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb


from sklearn.metrics import accuracy_score,confusion_matrix,make_scorer,auc,f1_score,roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_predict

In [None]:
# Stream the training CSV in 10,000-row chunks, then stitch the chunks into one frame.
train_data = pd.read_csv(
    r"/content/drive/MyDrive/train_alldata3_EDA.csv",
    chunksize=10000,
)
train = pd.concat(train_data, ignore_index=True)

In [None]:
train.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object')

In [None]:
# Stream the test CSV in 10,000-row chunks, then stitch the chunks into one frame.
test_data = pd.read_csv(
    r"/content/drive/MyDrive/test_alldata_EDA.csv",
    chunksize=10000,
)
test = pd.concat(test_data, ignore_index=True)

In [None]:
def multi_corr(col1, col2="label", df=None):
    '''
    Print the correlation of `col1` with `col2`, before and after log1p.

    Two values are printed:
      - "Correlation"     : correlation between df[col1] and df[col2]
      - "log_Correlation" : correlation between log1p(df[col1]) and df[col2]

    Parameters:
        col1 : name of the numeric column to evaluate.
        col2 : name of the column to correlate against (default "label").
        df   : DataFrame holding both columns. When omitted, the notebook-level
               `train` frame is looked up at call time.

    Returns None; the values are printed, not returned.
    '''
    if df is None:
        # Resolve lazily: a `df=train` default is bound once, when the function
        # is defined, and would keep pointing at the old frame if `train` is
        # reloaded in a later cell.
        df = train

    raw_corr = df[[col1, col2]].corr().iloc[0, 1]
    # np.log1p is a ufunc, so it transforms the whole Series in one call.
    log_corr = np.log1p(df[col1]).corr(df[col2])

    print("Correlation : {}\nlog_Correlation: {}".format(raw_corr, log_corr))

In [None]:
def corr(col1, col2="label", df=None):
    """
    Return the correlation between two columns of a DataFrame.

    Parameters:
        col1 : name of the first column.
        col2 : name of the second column (default "label").
        df   : DataFrame holding both columns. When omitted, the notebook-level
               `train` frame is looked up at call time.

    Returns the off-diagonal entry of the 2x2 correlation matrix of the two columns.
    """
    if df is None:
        # Resolve lazily so a reloaded `train` is picked up; a `df=train`
        # default would be frozen at definition time.
        df = train
    return df[[col1, col2]].corr().iloc[0, 1]

In [None]:
non_numeric_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()

print(non_numeric_cols)

['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service', 'attack_cat']


In [None]:
train_numeric = train.drop(columns=non_numeric_cols)

In [None]:
# Selecting all the features with high correlation values with other features
# Refer: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
corr_matrix = train_numeric.corr().abs()

# Select upper triangle of correlation matrix (k=1 leaves out the diagonal).
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the equivalent dtype.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose absolute correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [None]:
# We don't want to use these features for plotting because these are having high corr
# And most likely have same kind of plots with already plotted feature
print(to_drop)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']


In [None]:
saved_dict = {}
with open('/content/drive/My Drive/saved_dict.pkl', 'rb') as file:
    saved_dict = pickle.load(file)

In [None]:
len(saved_dict['columns'])

48

In [None]:

saved_dict['corr_col'] = to_drop

In [None]:
# Removing the highly correlated features from the train data only.
# The same list is stored in saved_dict['corr_col'] and dropped from the test data later on.
train.drop(columns=to_drop, inplace=True)

In [None]:

train.shape, test.shape

((1778032, 41), (762015, 49))

In [None]:

# creating new features
train['network_bytes'] = train['sbytes'] + train['dbytes']

In [None]:

train.shape, test.shape

((1778032, 42), (762015, 49))

In [None]:
# Dropping columns which are not useful for the classification
# attack_cat is for multiclass classification
# all the other columns are address related and not present in sample train data
train.drop(['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'], axis=1, inplace=True)

In [None]:
# To use during test data transformation
saved_dict['to_drop'] = ['srcip', 'sport', 'dstip', 'dsport', 'attack_cat']

In [None]:
train.shape, test.shape

((1778032, 37), (762015, 49))

Applying log1p on Numerical columns

During EDA we found that a few numerical columns give much cleaner PDF curves when log1p is applied to them.

So we try log1p on all the continuous columns and compare, for each one, the correlation of the original column and of its log1p version with the target column, i.e. "label".

In [None]:
# Getting number of unique values of all the columns
# If the unique values are high that means it has continuous set of values
col_unique_values = train.nunique()

In [None]:
# Columns with more than 200 unique values are treated as continuous;
# only these are checked for correlation before/after log1p.
col = col_unique_values[col_unique_values>200].index

In [None]:
# Checking the correlation with "label" of each original column and of its log1p version.
# Only the columns whose number of unique values is above the threshold are considered.
for column in col:
    print("{:-^30}".format(column))
    multi_corr(column)

-------------dur--------------
Correlation : 0.0019274028701131475
log_Correlation: -0.032544137564606314
------------sbytes------------
Correlation : 0.010344749695229565
log_Correlation: -0.356163155589846
------------dbytes------------
Correlation : -0.07641408324436148
log_Correlation: -0.5193868283741504
------------sload-------------
Correlation : 0.19211948100086756
log_Correlation: 0.34746601450349446
------------dload-------------
Correlation : -0.21978094390126515
log_Correlation: -0.6033545881626384
------------spkts-------------
Correlation : -0.12200425437154418
log_Correlation: -0.31635338269675845
------------stcpb-------------
Correlation : -0.23365153315010911
log_Correlation: -0.3135563222142899
------------dtcpb-------------
Correlation : -0.23346071773809843
log_Correlation: -0.31340064798120926
-----------smeansz------------
Correlation : -0.06517990378993671
log_Correlation: -0.15111450989648403
-----------dmeansz------------
Correlation : -0.27230605607442226
log

In [None]:
# Will apply log1p on this columns and remove original columns
log1p_col = ['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit', 'network_bytes']

In [None]:
saved_dict['log1p_col'] = log1p_col

In [None]:
# mode values of every features, will use to fill Null values of test
mode_dict = train.mode().iloc[0].to_dict()

In [None]:
def log1p_transform(col, df=None):
    '''
    Replace column `col` of `df` with its log1p-transformed version.

    A new column named "<col>_log1p" is added and the original column is
    dropped. The frame is modified in place and nothing is returned.

    Parameters:
        col : name of the column to transform.
        df  : DataFrame to modify. When omitted, the notebook-level `train`
              frame is looked up at call time.
    '''
    if df is None:
        # Resolve lazily so a reloaded `train` is picked up; a `df=train`
        # default would be frozen at definition time.
        df = train

    new_col = col + '_log1p'
    # np.log1p is a ufunc, so it transforms the whole Series in one call.
    df[new_col] = np.log1p(df[col])
    df.drop(columns=col, inplace=True)

In [None]:

# Transforming columns with log1p
for col in log1p_col:
    log1p_transform(col, df=train)

In [None]:
train.shape

(1778032, 37)

In [None]:
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'label', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:

train.shape, test.shape

((1778032, 37), (762015, 49))

In [None]:

# creating x and y set from the dataset
x_train, y_train = train.drop(columns=['label']), train['label']
x_test, y_test = test.drop(columns=['label']), test['label']


In [None]:

print(x_train.shape, y_train.shape)
print()
print(x_test.shape, y_test.shape)

(1778032, 36) (1778032,)

(762015, 48) (762015,)


In [None]:

# Saving all the files to disk to use later.
# NOTE: x_test is still unprocessed at this point; the same two paths are written
# again at the end of the notebook with the scaled and encoded frames.
# `with` closes each file as soon as its dump has finished.
with open('/content/drive/My Drive/final_train.pkl', 'wb') as train_file:
    pickle.dump((x_train, y_train), train_file)
with open('/content/drive/My Drive/final_test.pkl', 'wb') as test_file:
    pickle.dump((x_test, y_test), test_file)

In [None]:

# getting categorical and numerical columns in 2 diff lists
cat_col = ['proto', 'service', 'state']
# Keep x_train's own column order. A set difference yields an arbitrary order that can
# change between kernel sessions, while the fitted scaler and saved_dict['num_col']
# both depend on this order staying the same.
num_col = [column for column in x_train.columns if column not in cat_col]

In [None]:
# To use later, during test data cleaning
saved_dict['cat_col'] = cat_col
saved_dict['num_col'] = num_col

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,254,0,,0,0,0,1421930643,33.479,...,9.2756,0.0,1.609438,0.0,0.0,3.828641,0.0,3.878042,0.0,5.187386
1,udp,INT,60,0,dns,0,0,0,1424246229,0.008,...,18.698312,0.0,1.098612,0.0,0.0,4.890349,0.0,0.0,0.0,5.57973
2,tcp,FIN,31,29,,255,0,0,1421948071,0.372205,...,14.105347,16.314201,3.713572,20.196135,21.733479,4.174387,6.313548,0.0,3.01207,10.152883
3,tcp,FIN,31,29,ftp,255,0,0,1421971944,16.14474,...,10.258074,10.501435,3.970292,21.803017,20.49442,4.043051,4.248495,7.264606,3.984562,8.806124
4,tcp,FIN,31,29,,255,0,0,1421963050,1.2188,...,13.339317,13.412088,2.833213,20.673269,21.855078,4.574711,4.521789,4.309533,1.138118,8.066208


Standardizing
As we have seen, the range of a few features in this dataset is very large, so we bring all numerical features onto a comparable scale by applying StandardScaler. After this, every numerical feature of the train data has mean 0 and standard deviation 1.

In [None]:
# Standardizing the data
scaler = StandardScaler()
scaler = scaler.fit(x_train[num_col])

In [None]:
x_train[num_col] = scaler.transform(x_train[num_col])

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,2.561444,-0.71776,,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,...,-1.383776,-1.879995,-0.704801,-1.190007,-1.189697,-1.137341,-1.850553,0.20427,-0.899657,-1.229918
1,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,...,1.614925,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092
2,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,...,0.153252,0.988351,0.843701,0.741629,0.888925,-0.582651,0.881301,-0.995343,0.170283,0.894089
3,tcp,FIN,-0.426051,-0.041365,ftp,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,...,-1.071111,-0.033644,1.03263,0.895318,0.77042,-0.793357,-0.012242,1.251851,0.51573,0.318009
4,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,...,-0.090531,0.478104,0.195816,0.787264,0.900555,0.059601,0.106012,0.337745,-0.495378,0.001507


We use binary encoding for the categorical columns because it is RAM-efficient: it produces fewer attributes (columns) than one-hot encoding would.

In [None]:
!pip install category_encoders



In [None]:
import category_encoders as ce

In [None]:
# Initialize the Binary Encoder
encoder = ce.BinaryEncoder(cols=['proto', 'service', 'state'])

# Fit and transform the data
x_train_encoded = encoder.fit_transform(x_train)

# Now x_train_encoded contains the binary encoded columns
print(x_train_encoded.head())

   proto_0  proto_1  proto_2  proto_3  proto_4  proto_5  proto_6  proto_7  \
0        0        0        0        0        0        0        0        1   
1        0        0        0        0        0        0        0        1   
2        0        0        0        0        0        0        1        0   
3        0        0        0        0        0        0        1        0   
4        0        0        0        0        0        0        1        0   

   state_0  state_1  ...  sload_log1p  dload_log1p  spkts_log1p  stcpb_log1p  \
0        0        0  ...    -1.383776    -1.879995    -0.704801    -1.190007   
1        0        0  ...     1.614925    -1.879995    -1.080734    -1.190007   
2        0        0  ...     0.153252     0.988351     0.843701     0.741629   
3        0        0  ...    -1.071111    -0.033644     1.032630     0.895318   
4        0        0  ...    -0.090531     0.478104     0.195816     0.787264   

   dtcpb_log1p  smeansz_log1p  dmeansz_log1p  sjit_log1p

In [None]:
file_path = '/content/drive/My Drive/'

In [None]:
# Persist the fitted preprocessing objects; `with` closes each file once it is written.
with open(file_path+'scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)  # Standard scaler
with open(file_path+'saved_dict.pkl', 'wb') as params_file:
    pickle.dump(saved_dict, params_file)  # Dictionary with important parameters
with open(file_path+'mode_dict.pkl', 'wb') as mode_file:
    pickle.dump(mode_dict, mode_file)  #  Dictionary with most frequent values of columns

Loading the Binary Encoder Later
When you want to use the saved binary encoder to transform new data (like your test set), you can load it back into your script as follows:

python


In [None]:
# #Load the binary encoder from the file
# loaded_encoder = pickle.load(open(file_path + 'binary_encoder.pkl', 'rb'))

# # Transform the test data using the loaded encoder
# x_test_encoded = loaded_encoder.transform(x_test)

# # Now x_test_encoded contains the binary encoded columns for the test set
# print(x_test_encoded.head())

In [None]:

x_test.shape

(762015, 48)

In [None]:

# Resetting index of test data
x_test.reset_index(drop=True, inplace=True)

In [None]:
x_test.shape

(762015, 48)

In [None]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat'],
      dtype='object')

In [None]:
def clean_data(data, mode_values=None, params=None):
    '''
    Cleans given raw data. Fills Null and blank values, normalises "-" and
    fixes the datatype of columns that were read as non-numeric by mistake.

    Steps, per column:
      1. NaN and ' ' are replaced with the column's mode value.
      2. "-" is replaced with the string "None".
      3. For columns listed under 'binary_col', values greater than 1 are
         replaced with the mode value.
    Afterwards every column that was non-numeric on entry and is not listed
    under 'cat_col' is cast to float.

    Parameters:
        data        : DataFrame to clean. It is modified in place.
        mode_values : dict mapping column name -> fill value. Defaults to the
                      notebook-level `mode_dict`.
        params      : dict with the keys 'binary_col' and 'cat_col'. Defaults
                      to the notebook-level `saved_dict`.

    Returns the same DataFrame object that was passed in.
    '''
    if mode_values is None:
        mode_values = mode_dict
    if params is None:
        params = saved_dict

    # Columns that are non-numeric on entry; used below to spot wrongly typed columns.
    categorical_col = data.select_dtypes(exclude=np.number).columns

    # Cleaning the data
    for col in data.columns:
        val = mode_values[col]  # Mode value of the column in train data
        data[col] = data[col].fillna(value=val)
        data[col] = data[col].replace(' ', value=val)
        data[col] = data[col].replace('-', 'None')

        # Fixing binary columns
        if col in params['binary_col']:
            data[col] = np.where(data[col] > 1, val, data[col])

    # Fixing datatype of columns: anything non-numeric that is not a declared
    # categorical column should hold numbers. Each such column must be cast by
    # its own name; casting the leftover loop variable `col` would only ever
    # touch the last column of the frame.
    bad_dtypes = [name for name in categorical_col if name not in params['cat_col']]
    for bad_col in bad_dtypes:
        data[bad_col] = data[bad_col].astype(float)

    return data

In [None]:
def apply_log1p(data, columns=None):
    '''
    Performs FE on the data: replaces each listed column with its log1p version.

    For every column a new "<col>_log1p" column is created and the original
    column is removed.

    Parameters:
        data    : DataFrame to transform. It is modified in place.
        columns : iterable of column names to transform. Defaults to
                  saved_dict['log1p_col'] from the notebook namespace.

    Returns the same DataFrame object that was passed in.
    '''
    if columns is None:
        columns = saved_dict['log1p_col']

    for col in columns:
        new_col = col + '_log1p'  # New col name
        # np.log1p is a ufunc, so it transforms the whole Series in one call.
        data[new_col] = np.log1p(data[col])
        data.drop(columns=col, inplace=True)  # Removing old column
    return data

In [None]:
def standardize(data, columns=None, fitted_scaler=None):
    '''
    Standardize the given data with an already fitted scaler.

    The selected columns are passed through `fitted_scaler.transform` and
    written back into the frame; all other columns are left untouched.

    Parameters:
        data          : DataFrame to scale. It is modified in place.
        columns       : list of numeric column names to scale. Defaults to
                        saved_dict['num_col'] from the notebook namespace.
        fitted_scaler : object exposing `transform`. Defaults to the
                        notebook-level `scaler` that was fitted on train data.

    Returns the same DataFrame object that was passed in.
    '''
    if columns is None:
        columns = saved_dict['num_col']
    if fitted_scaler is None:
        fitted_scaler = scaler

    columns = list(columns)
    data[columns] = fitted_scaler.transform(data[columns])
    return data

In [None]:
len(x_train_encoded.columns)

50

In [None]:
len(saved_dict['columns'])

48

In [None]:
# Reload the objects written earlier in this notebook; `with` closes each file after reading.
# Parameters
with open(file_path+'saved_dict.pkl', 'rb') as params_file:
    saved_dict = pickle.load(params_file)
# Mode value of all the columns
with open(file_path+'mode_dict.pkl', 'rb') as mode_file:
    mode_dict = pickle.load(mode_file)
# StandardScaler object
with open(file_path+'scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
len(saved_dict['columns'])

48

In [None]:
# Persist the binary encoder fitted on the train data; `with` closes the file once it is written.
with open(file_path + 'binary_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [None]:
x_test.shape

(762015, 48)

In [None]:
# Resetting index of test data
x_test.reset_index(drop=True, inplace=True)

In [None]:
x_test.shape

(762015, 48)

In [None]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat'],
      dtype='object')

In [None]:
x_test.columns = saved_dict['columns']

In [None]:
# Creating new Feature
x_test['network_bytes'] = x_test['dbytes'] + x_test['sbytes']

In [None]:
# Droping all the unwanted columns
dropable_col = saved_dict['to_drop'] + saved_dict['corr_col']
x_test.drop(columns=dropable_col, inplace=True)

In [None]:
x_test.shape

(762015, 36)

In [None]:
# Cleaning data using clean_data()
x_test = clean_data(x_test)

In [None]:
x_test.shape

(762015, 36)

In [None]:
# FE: applying log1p using apply_log1p()
x_test = apply_log1p(x_test)

In [None]:
x_test.shape

(762015, 36)

In [None]:
x_test.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'dur_log1p', 'sbytes_log1p', 'dbytes_log1p',
       'sload_log1p', 'dload_log1p', 'spkts_log1p', 'stcpb_log1p',
       'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p', 'sjit_log1p',
       'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
# Standardscaling using stanardize()
x_test = standardize(x_test)

In [None]:
x_test.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,-1.148407,-0.069615,...,-0.121716,0.482924,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789
1,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,-1.163715,-0.069617,...,-0.112689,0.487911,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789
2,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.855736,-0.069616,...,1.577442,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092
3,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.157963,-0.069518,...,0.619603,0.775428,0.277671,0.898135,0.898496,1.366104,0.278308,-0.163891,-0.794201,0.298473
4,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,0.864749,-0.069616,...,-0.150173,0.467203,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789


In [None]:
# Only transform here: the encoder was already fitted on x_train. Calling fit_transform
# would re-learn the category -> bit-pattern mapping from the test set, so the same
# category could get a different code in test than in train.
x_test_encoded = encoder.transform(x_test)

In [None]:
x_test_encoded.shape

(762015, 50)

In [None]:

# Matching test data columns with train data columns.
# Index.equals checks labels and order, and returns False (instead of raising)
# when the two frames have a different number of columns.
x_train_encoded.columns.equals(x_test_encoded.columns)

True

In [None]:
x_train = x_train_encoded

In [None]:
x_test = x_test_encoded

In [None]:
# Cleaned and processed train data; `with` closes the file once it is written.
with open(file_path+'final_train.pkl', 'wb') as train_file:
    pickle.dump((x_train, y_train), train_file)

In [None]:
# Cleaned and processed test data; `with` closes the file once it is written.
with open(file_path+'final_test.pkl', 'wb') as test_file:
    pickle.dump((x_test, y_test), test_file)

In [None]:
# NOTE(review): this replaces the 48 raw column names stored under 'columns' with the
# 50 encoded column names, and the dictionary is then written back to saved_dict.pkl.
# Earlier in this notebook `x_test.columns = saved_dict['columns']` is applied to the
# 48-column raw test frame, so a fresh re-run that loads the updated pickle would hit a
# length mismatch there. Consider a separate key for the encoded names - TODO confirm
# what downstream readers of saved_dict['columns'] expect.
saved_dict['columns'] = x_train_encoded.columns

In [None]:
# Dictionary with important parameters; `with` closes the file once it is written.
with open(file_path+'saved_dict.pkl', 'wb') as params_file:
    pickle.dump(saved_dict, params_file)