In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from imblearn.under_sampling import NearMiss
from collections import Counter
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [2]:
# Attribute Information:

data_info = pd.read_csv("../dataset/NUSW-NB15_features.csv", encoding = "ISO-8859-1")

In [3]:
# Retrieve Attribute Names

column_attributes = list (data_info['Name']) 

In [4]:
# Read the data available in 4 csv files using temporary data frames

dataframe1 = pd.read_csv("../dataset/UNSW-NB15_1.csv",low_memory=False, header=None)
dataframe2 = pd.read_csv("../dataset/UNSW-NB15_2.csv", low_memory=False, header=None)
dataframe3 = pd.read_csv("../dataset/UNSW-NB15_3.csv", low_memory=False, header=None)
dataframe4 = pd.read_csv("../dataset/UNSW-NB15_4.csv",low_memory=False, header=None)

In [5]:
# Name the columns of all data frames using attribute names from features.csv

df_list = [dataframe1,dataframe2,dataframe3,dataframe4]
for df in df_list:
    name = [v.lower() for k, v in enumerate(column_attributes)]
    df.columns = name

In [6]:
# Concatenate the 4 temporary data frames into one single data frame

dataset = pd.concat([dataframe1, dataframe2, dataframe3, dataframe4])
dataset.shape

(2540047, 49)

In [7]:
# Delete temporary data frames

del dataframe1
del dataframe2
del dataframe3
del dataframe4

In [None]:
# Function to show the different attack cateegories present in the dataset

stopwords = set(STOPWORDS)
def show_wordcloud(data, title = None):

    attack_catgs = data.loc[:,'attack_cat'].replace('Backdoor','Backdoors', regex=False)
    attack_catgs = attack_catgs.dropna().to_numpy()
    attack_catgs = [x.strip(' ') for x in attack_catgs]
    attack_catgs = np.unique(attack_catgs)
    text = " ".join(str(item) for item in attack_catgs)
    wordcloud = WordCloud(
        background_color='white',
        collocations=False,
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(text)
 
    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
    
show_wordcloud(dataset)

In [None]:
# Plot to show the distribution of all attack categories

def plot_attack_category(data):
    sns.set(rc={'figure.figsize':(15,10),"font.size":70,"axes.titlesize":40,"axes.labelsize":20},style="white")
    catg_plot = sns.countplot(y='attack_cat',data = data)
    
plot_attack_category(dataset)

In [None]:
# Do this later
'''
def plot_correlation(data):
    corr = data.corr()
    #print(corr)
    plt.figure(figsize=(19, 15))
    ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0,square=True,annot_kws={"size":12},cmap="YlGnBu")
    ax.set_xticklabels(ax.get_xticklabels(),rotation=45,horizontalalignment='right',fontsize = 30)
    
plot_correlation(dataset)
'''


In [8]:
# Provided data is not consistent. Check and make relevant changes

columns_with_space = dataset.columns[dataset.isin([' ']).any()].tolist()
print('columns_with_space ',columns_with_space)
columns_with_null = dataset.columns[dataset.isna().any()].tolist()
print('columns_with_null ', columns_with_null)
columns_with_dash = dataset.columns[dataset.isin(['-']).any()].tolist()
print('columns_with_dash', columns_with_dash)

columns_with_space  ['ct_ftp_cmd']
columns_with_null  ['ct_flw_http_mthd', 'is_ftp_login', 'attack_cat']
columns_with_dash ['sport', 'dsport', 'service']


In [9]:
# Handle null and '-' values

# Where attack_cat not mentioned, replace with 'Normal'
dataset.loc[:,'attack_cat'] = dataset.loc[:,'attack_cat'].replace(np.nan,'Normal', regex=True)

dataset.loc[:,'ct_flw_http_mthd'] = dataset.loc[:,'ct_flw_http_mthd'].replace(np.nan,0, regex=True)
dataset.loc[:,'is_ftp_login'] = dataset.loc[:,'is_ftp_login'].replace(np.nan,0, regex=True)

dataset.loc[:,'ct_ftp_cmd'] = dataset.loc[:,'ct_ftp_cmd'].replace(' ',0, regex=True)

dataset.loc[:,'sport'] = dataset.loc[:,'sport'].replace('-',0, regex=False)
dataset.loc[:,'dsport'] = dataset.loc[:,'dsport'].replace('-',0, regex=False)
dataset.loc[:,'service'] = dataset.loc[:,'service'].replace('-','Others', regex=False)

dataset.loc[:,'attack_cat'] = dataset.loc[:,'attack_cat'].replace('Backdoor','Backdoors', regex=False)

In [10]:
# Columns that have String values

categorical_values = dataset.select_dtypes(exclude=['number']).keys()
print('categorical_values ', categorical_values)

categorical_values  Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
       'ct_ftp_cmd', 'attack_cat'],
      dtype='object')


In [11]:
# ct_ftp_cmd field has erroneous String values, such as '0' instead of 0. Handle those values

arrg = dataset.loc[:, 'ct_ftp_cmd'].astype('str')
dataset.loc[:, 'ct_ftp_cmd'] = pd.to_numeric(arrg, errors='coerce').fillna(0).astype(np.int32)
print(dataset.loc[:, 'ct_ftp_cmd'].unique())
print('ct_ftp_cmd', dataset.loc[:, 'ct_ftp_cmd'].dtypes)

[0 1 6 2 4 8 5 3]
ct_ftp_cmd int32


In [12]:
# Label Encoding for categorical values. Can't do one hot encoding as it will result in huge increase in column size.

dataset.loc[:,'srcip'] = dataset.loc[:,'srcip'].astype('category')
# Assigning numerical values
dataset.loc[:,'srcip'] = dataset.loc[:,'srcip'].cat.codes

dataset.loc[:,'sport'] = dataset.loc[:,'sport'].astype('category')
# Assigning numerical values 
dataset.loc[:,'sport'] = dataset.loc[:,'sport'].cat.codes

dataset.loc[:,'dstip'] = dataset.loc[:,'dstip'].astype('category')
# Assigning numerical values
dataset.loc[:,'dstip'] = dataset.loc[:,'dstip'].cat.codes

dataset.loc[:,'dsport'] = dataset.loc[:,'dsport'].astype('category')
# Assigning numerical values
dataset.loc[:,'dsport'] = dataset.loc[:,'dsport'].cat.codes

In [13]:
dataset.loc[:,'proto'] = dataset.loc[:,'proto'].astype('category')
# Assigning numerical values
dataset.loc[:,'proto'] = dataset.loc[:,'proto'].cat.codes

dataset.loc[:,'state'] = dataset.loc[:,'state'].astype('category')
# Assigning numerical values 
dataset.loc[:,'state'] = dataset.loc[:,'state'].cat.codes

dataset.loc[:,'service'] = dataset.loc[:,'service'].astype('category')
# Assigning numerical values
dataset.loc[:,'service'] = dataset.loc[:,'service'].cat.codes

In [14]:
# Print dataframe after label encoding

dataset[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,33,68854,24,47344,120,2,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,Normal,0
1,33,90600,27,253,120,2,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,Normal,0
2,39,69669,25,47344,120,2,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,Normal,0
3,38,93095,23,47344,120,2,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,Normal,0
4,36,108205,8,47344,120,2,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,Normal,0


In [15]:
# Store input columns in X

X = dataset.iloc[:,0:47]
print(X[0:5])

   srcip   sport  dstip  dsport  proto  state       dur  sbytes  dbytes  sttl  \
0     33   68854     24   47344    120      2  0.001055     132     164    31   
1     33   90600     27     253    120      2  0.036133     528     304    31   
2     39   69669     25   47344    120      2  0.001119     146     178    31   
3     38   93095     23   47344    120      2  0.001209     132     164    31   
4     36  108205      8   47344    120      2  0.001169     146     178    31   

   ...  ct_flw_http_mthd  is_ftp_login  ct_ftp_cmd  ct_srv_src  ct_srv_dst  \
0  ...               0.0           0.0           0           3           7   
1  ...               0.0           0.0           0           2           4   
2  ...               0.0           0.0           0          12           8   
3  ...               0.0           0.0           0           6           9   
4  ...               0.0           0.0           0           7           9   

   ct_dst_ltm  ct_src_ ltm  ct_src_dport_ltm

In [18]:
# Store output columns in Y

Y = dataset.iloc[:,-2:]
print(Y[0:5])

  attack_cat  label
0     Normal      0
1     Normal      0
2     Normal      0
3     Normal      0
4     Normal      0


In [None]:
# One Hot Encoding of those columns that have lesser unique values:

'''print('Unique categories count: proto ', len(X.loc[:,'proto'].unique()))
print('Unique categories count: state  ',len(X.loc[:,'state'].unique()))
print('Unique categories count: service ', len(X.loc[:,'service'].unique()))
dummy_cols = ['proto', 'state', 'service']
X = pd.get_dummies(X, columns=dummy_cols)
print('Data After one hot encoding\n', X[0:5])'''

In [19]:
# Encoding of output column 'attack_cat'

Y.loc[:,'attack_cat'] = [i.strip(' ') for i in Y.loc[:,'attack_cat']]
print('attack cat ', (Y.loc[:,'attack_cat'].unique()))


encoding = { 'attack_cat':{'Normal':0, 'Exploits':1, 'Reconnaissance':2, 'DoS':3, 'Generic':4,
       'Shellcode':5, 'Fuzzers':6, 'Worms':7, 'Backdoors':8, 'Analysis':9}}
Y.replace(encoding, inplace=True)


'''
Y.loc[:,'attack_cat'] = Y.loc[:,'attack_cat'].astype('category')
# Assigning numerical values
Y.loc[:,'attack_cat'] = Y.loc[:,'attack_cat'].cat.codes
'''
'''Y_enc = pd.get_dummies(Y, columns=['attack_cat'])'''

attack cat  ['Normal' 'Exploits' 'Reconnaissance' 'DoS' 'Generic' 'Shellcode'
 'Fuzzers' 'Worms' 'Backdoors' 'Analysis']


"Y_enc = pd.get_dummies(Y, columns=['attack_cat'])"

In [20]:
print(Y[0:5])

   attack_cat  label
0           0      0
1           0      0
2           0      0
3           0      0
4           0      0


In [None]:
from feature_selector import FeatureSelector
fs = FeatureSelector(data = X, labels = Y.loc[:,'attack_cat'])

In [None]:
fs.identify_missing(missing_threshold=0.2)

In [None]:
fs.identify_single_unique()

In [None]:
fs.identify_collinear(correlation_threshold=0.975)

In [None]:
fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', 
                            n_iterations = 10, early_stopping = True)

In [None]:
one_hot_features = fs.one_hot_features
base_features = fs.base_features
print('There are %d original features' % len(base_features))
print('There are %d one-hot features' % len(one_hot_features))

In [None]:
zero_importance_features = fs.ops['zero_importance']
zero_importance_features

In [None]:
fs.feature_importances

In [None]:
fs.identify_low_importance(cumulative_importance = 0.99)

In [None]:
low_importance_features = fs.ops['low_importance']
low_importance_features

In [None]:
all_to_remove = fs.check_removal()
all_to_remove

In [21]:
# Drop the features having low importance
X = X.drop(['is_ftp_login',
 'ct_src_dport_ltm',
 'ct_ftp_cmd',
 'dwin',
 'stime',
 'ltime', 'ct_src_ ltm', 'ct_dst_sport_ltm', 'ct_dst_ltm', 'trans_depth','is_sm_ips_ports' ], axis = 1)
X.shape

(2540047, 36)

In [22]:
# Normalization

from scipy import stats
from sklearn.preprocessing import MinMaxScaler
# Pass parameter type = 'statistical' or 'minmax'
# Note: statistical is better for SVM and KNN.
def normalization(data, type='minmax'):
    if type == 'statistical':
        return stats.zscore(data)
    elif type == 'minmax':
        scaler = MinMaxScaler()
        return scaler.fit_transform(data)
    else:
        print('\n Norm type not found! \n')

In [141]:
'''count = 0
X_normal = []
y_normal = []
for i in Y['attack_cat']:
    if (i == 0):
        X_normal.append(X.loc[count])
        y_normal.append(i)
    count = count + 1
len(X_normal)'''

"count = 0\nX_normal = []\ny_normal = []\nfor i in Y['attack_cat']:\n    if (i == 0):\n        X_normal.append(X.loc[count])\n        y_normal.append(i)\n    count = count + 1\nlen(X_normal)"

In [23]:
print(Counter(Y.loc[:,'label']))
print(Counter(Y.loc[:,'attack_cat']))
print(X.shape)
'''X_normal = pd.DataFrame(X_normal)
y_normal = pd.DataFrame(y_normal)
'''

Counter({0: 2218764, 1: 321283})
Counter({0: 2218764, 4: 215481, 1: 44525, 6: 24246, 3: 16353, 2: 13987, 9: 2677, 8: 2329, 5: 1511, 7: 174})
(2540047, 36)


'X_normal = pd.DataFrame(X_normal)\ny_normal = pd.DataFrame(y_normal)\n'

In [30]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr_undersample = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
X_ncr_undersample, y_ncr_undersample = ncr_undersample.fit_resample(X, Y.loc[:,'label'])

In [24]:
'''undersample = NearMiss(sampling_strategy=1.0, version=3, n_jobs=5)
X_undersample, y_undersample = undersample.fit_resample(X, Y.loc[:,'label'])'''

  "The number of the samples to be selected is larger"


In [34]:
len(y_ncr_undersample)

2463065

In [35]:
len(X_ncr_undersample)

2463065

In [36]:
print(y_ncr_undersample.unique())
Counter(y_ncr_undersample)

[0 1]


Counter({0: 2141782, 1: 321283})

In [37]:
y_ncr_undersample.shape

(2463065,)

In [38]:
X_ncr_undersample[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,sintpkt,dintpkt,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm
0,33,68854,24,47344,120,2,0.001055,132,164,31,...,0.017,0.013,0.0,0.0,0.0,0,0.0,3,7,1
1,33,90600,27,253,120,2,0.036133,528,304,31,...,7.005,7.564333,0.0,0.0,0.0,0,0.0,2,4,2
2,39,69669,25,47344,120,2,0.001119,146,178,31,...,0.017,0.013,0.0,0.0,0.0,0,0.0,12,8,1
3,38,93095,23,47344,120,2,0.001209,132,164,31,...,0.043,0.014,0.0,0.0,0.0,0,0.0,6,9,1
4,36,108205,8,47344,120,2,0.001169,146,178,31,...,0.005,0.003,0.0,0.0,0.0,0,0.0,7,9,1


In [39]:
y_ncr_undersample[0:5]

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [40]:
y_ncr_undersample_normal = []
X_ncr_undersample_normal = []
for i, val in enumerate(y_ncr_undersample):
    if (val == 0):
        y_ncr_undersample_normal.append(val)
        X_ncr_undersample_normal.append(X_ncr_undersample.loc[i])

In [163]:
'''y_undersample_normal = []
X_undersample_normal = []
for i, val in enumerate(y_undersample):
    if (val == 0):
        y_undersample_normal.append(val)
        X_undersample_normal.append(X_undersample.loc[i])'''

In [41]:
print(len(y_ncr_undersample_normal))
print(len(X_ncr_undersample_normal))

2141782
2141782


In [43]:
X_ncr_undersample_normal_pd = pd.DataFrame(X_ncr_undersample_normal)
X_ncr_undersample_normal_pd.shape

(2141782, 36)

In [44]:
X_ncr_undersample_normal_pd[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,sintpkt,dintpkt,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm
0,33.0,68854.0,24.0,47344.0,120.0,2.0,0.001055,132.0,164.0,31.0,...,0.017,0.013,0.0,0.0,0.0,0.0,0.0,3.0,7.0,1.0
1,33.0,90600.0,27.0,253.0,120.0,2.0,0.036133,528.0,304.0,31.0,...,7.005,7.564333,0.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0
2,39.0,69669.0,25.0,47344.0,120.0,2.0,0.001119,146.0,178.0,31.0,...,0.017,0.013,0.0,0.0,0.0,0.0,0.0,12.0,8.0,1.0
3,38.0,93095.0,23.0,47344.0,120.0,2.0,0.001209,132.0,164.0,31.0,...,0.043,0.014,0.0,0.0,0.0,0.0,0.0,6.0,9.0,1.0
4,36.0,108205.0,8.0,47344.0,120.0,2.0,0.001169,146.0,178.0,31.0,...,0.005,0.003,0.0,0.0,0.0,0.0,0.0,7.0,9.0,1.0


In [95]:
y_ncr_undersampled_normal_pd = pd.DataFrame(y_ncr_undersample_normal)
y_ncr_undersampled_normal_pd.columns = ['label']

In [101]:
dataset_normal = X_ncr_undersample_normal_pd
dataset_normal['attack_cat'] = 0
#dataset_normal['label'] = y_ncr_undersampled_normal_pd['label']
dataset_normal['label'] = 0
print(dataset_normal.shape)
dataset_normal[0:5]

(2141782, 38)


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm,attack_cat,label
0,33.0,68854.0,24.0,47344.0,120.0,2.0,0.001055,132.0,164.0,31.0,...,0.0,0.0,0.0,0.0,0.0,3.0,7.0,1.0,0,0
1,33.0,90600.0,27.0,253.0,120.0,2.0,0.036133,528.0,304.0,31.0,...,0.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,0,0
2,39.0,69669.0,25.0,47344.0,120.0,2.0,0.001119,146.0,178.0,31.0,...,0.0,0.0,0.0,0.0,0.0,12.0,8.0,1.0,0,0
3,38.0,93095.0,23.0,47344.0,120.0,2.0,0.001209,132.0,164.0,31.0,...,0.0,0.0,0.0,0.0,0.0,6.0,9.0,1.0,0,0
4,36.0,108205.0,8.0,47344.0,120.0,2.0,0.001169,146.0,178.0,31.0,...,0.0,0.0,0.0,0.0,0.0,7.0,9.0,1.0,0,0


In [102]:
print(dataset_normal.loc[:,'label'])
dataset_normal.shape

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
30         0
31         0
32         0
          ..
2463019    0
2463020    0
2463021    0
2463022    0
2463023    0
2463024    0
2463025    0
2463026    0
2463027    0
2463028    0
2463029    0
2463030    0
2463031    0
2463032    0
2463033    0
2463049    0
2463050    0
2463051    0
2463052    0
2463053    0
2463054    0
2463055    0
2463056    0
2463057    0
2463058    0
2463059    0
2463060    0
2463061    0
2463062    0
2463063    0
Name: label, Length: 2141782, dtype: int64


(2141782, 38)

In [46]:
'''dataset_normal.to_csv(r'..\ncr_undersample_normal.csv', index = False)'''

In [49]:
Counter(Y['attack_cat'])

Counter({0: 2218764,
         1: 44525,
         2: 13987,
         3: 16353,
         4: 215481,
         5: 1511,
         6: 24246,
         7: 174,
         8: 2329,
         9: 2677})

In [50]:
sm = SMOTE(sampling_strategy = {1:100000, 2:80000, 3:80000, 5:60000, 6:80000, 7:60000,8:70000, 9:70000},random_state=42)
X_sm_oversample_attack_, Y_sm_oversample_attack_ = sm.fit_sample(X,Y['attack_cat'])

In [52]:
Counter(Y_sm_oversample_attack_)

Counter({0: 2218764,
         1: 100000,
         2: 80000,
         3: 80000,
         4: 215481,
         5: 60000,
         6: 80000,
         7: 60000,
         8: 70000,
         9: 70000})

In [53]:
X_sm_oversample_attack_.shape

(3034245, 36)

In [54]:
Y_sm_oversample_attack_.shape

(3034245,)

In [56]:
count = 0
X_sm_os_without_normal = []
y_sm_os_without_normal = []
for i in Y_sm_oversample_attack_:
    if (i != 0):
        X_sm_os_without_normal.append(X_sm_oversample_attack_.loc[count])
        y_sm_os_without_normal.append(i)
    count = count + 1
len(X_sm_os_without_normal)

815481

In [59]:
print(X_sm_os_without_normal[0:5])

[srcip               3.100000e+01
sport               7.691300e+04
dstip               1.800000e+01
dsport              2.507600e+04
proto               1.200000e+02
state               6.000000e+00
dur                 2.100000e-05
sbytes              7.280000e+02
dbytes              0.000000e+00
sttl                2.540000e+02
dttl                0.000000e+00
sloss               0.000000e+00
dloss               0.000000e+00
service             0.000000e+00
sload               1.386667e+08
dload               0.000000e+00
spkts               2.000000e+00
dpkts               0.000000e+00
swin                0.000000e+00
stcpb               0.000000e+00
dtcpb               0.000000e+00
smeansz             3.640000e+02
dmeansz             0.000000e+00
res_bdy_len         0.000000e+00
sjit                0.000000e+00
djit                0.000000e+00
sintpkt             2.100000e-02
dintpkt             0.000000e+00
tcprtt              0.000000e+00
synack              0.000000e+00
ackdat   

In [57]:
pd.DataFrame(X_sm_os_without_normal).shape

(815481, 36)

In [62]:
X_sm_os_without_normal_pd=pd.DataFrame(X_sm_os_without_normal)
X_sm_os_without_normal_pd[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,sintpkt,dintpkt,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm
20,31.0,76913.0,18.0,25076.0,120.0,6.0,2.1e-05,728.0,0.0,254.0,...,0.021,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0
21,30.0,79263.0,16.0,62606.0,114.0,5.0,0.240139,918.0,25552.0,62.0,...,21.830818,9.570304,0.051475,0.006528,0.044947,1.0,1.0,3.0,2.0,1.0
22,28.0,68176.0,16.0,62606.0,114.0,5.0,2.39039,1362.0,268.0,254.0,...,183.579303,474.259406,0.066088,0.017959,0.048129,1.0,1.0,5.0,2.0,1.0
39,30.0,68736.0,16.0,50155.0,114.0,5.0,0.17519,8168.0,268.0,254.0,...,11.837692,33.287,0.054878,0.008744,0.046134,1.0,0.0,1.0,1.0,1.0
40,30.0,83203.0,10.0,62606.0,114.0,5.0,0.1906,844.0,268.0,254.0,...,18.573778,36.845602,0.050675,0.006354,0.044321,1.0,1.0,3.0,1.0,1.0


In [63]:
X_sm_os_without_normal_pd.reset_index(drop=True, inplace=True)
X_sm_os_without_normal_pd[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,sintpkt,dintpkt,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm
0,31.0,76913.0,18.0,25076.0,120.0,6.0,2.1e-05,728.0,0.0,254.0,...,0.021,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0
1,30.0,79263.0,16.0,62606.0,114.0,5.0,0.240139,918.0,25552.0,62.0,...,21.830818,9.570304,0.051475,0.006528,0.044947,1.0,1.0,3.0,2.0,1.0
2,28.0,68176.0,16.0,62606.0,114.0,5.0,2.39039,1362.0,268.0,254.0,...,183.579303,474.259406,0.066088,0.017959,0.048129,1.0,1.0,5.0,2.0,1.0
3,30.0,68736.0,16.0,50155.0,114.0,5.0,0.17519,8168.0,268.0,254.0,...,11.837692,33.287,0.054878,0.008744,0.046134,1.0,0.0,1.0,1.0,1.0
4,30.0,83203.0,10.0,62606.0,114.0,5.0,0.1906,844.0,268.0,254.0,...,18.573778,36.845602,0.050675,0.006354,0.044321,1.0,1.0,3.0,1.0,1.0


In [64]:
y_sm_os_without_normal_pd = pd.DataFrame(y_sm_os_without_normal)

In [67]:
y_sm_os_without_normal_pd[0:5]

Unnamed: 0,attack_cat
0,1
1,1
2,2
3,1
4,1


In [65]:
print(y_sm_os_without_normal_pd.shape)
print(y_sm_os_without_normal_pd.columns)

(815481, 1)
RangeIndex(start=0, stop=1, step=1)


In [None]:
X_sm_os_without_normal_pd = X_sm_os_without_normal_pd.Drop(['index'], axis=1)

In [68]:
y_sm_os_without_normal_pd.columns = ['attack_cat']
print(y_sm_os_without_normal_pd.loc[:,'attack_cat'].value_counts())
print(y_sm_os_without_normal_pd.columns)

4    215481
1    100000
6     80000
3     80000
2     80000
9     70000
8     70000
7     60000
5     60000
Name: attack_cat, dtype: int64
Index(['attack_cat'], dtype='object')


In [69]:
y_sm_os_without_normal_pd['label'] = 1
y_sm_os_without_normal_pd.shape

(815481, 2)

In [70]:
y_sm_os_without_normal_pd[0:5]

Unnamed: 0,attack_cat,label
0,1,1
1,1,1
2,2,1
3,1,1
4,1,1


In [71]:
dataset_attack = X_sm_os_without_normal_pd

In [72]:
print(dataset_attack.shape)
dataset_attack['attack_cat'] = y_sm_os_without_normal_pd['attack_cat']
dataset_attack['label'] = y_sm_os_without_normal_pd['label']
print(dataset_attack.shape)

(815481, 36)
(815481, 38)


In [73]:
dataset_attack.to_csv(r'..\sm_oversample_attack.csv', index = False)

In [76]:
dataset_attack[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm,attack_cat,label
0,31.0,76913.0,18.0,25076.0,120.0,6.0,2.1e-05,728.0,0.0,254.0,...,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,1,1
1,30.0,79263.0,16.0,62606.0,114.0,5.0,0.240139,918.0,25552.0,62.0,...,0.051475,0.006528,0.044947,1.0,1.0,3.0,2.0,1.0,1,1
2,28.0,68176.0,16.0,62606.0,114.0,5.0,2.39039,1362.0,268.0,254.0,...,0.066088,0.017959,0.048129,1.0,1.0,5.0,2.0,1.0,2,1
3,30.0,68736.0,16.0,50155.0,114.0,5.0,0.17519,8168.0,268.0,254.0,...,0.054878,0.008744,0.046134,1.0,0.0,1.0,1.0,1.0,1,1
4,30.0,83203.0,10.0,62606.0,114.0,5.0,0.1906,844.0,268.0,254.0,...,0.050675,0.006354,0.044321,1.0,1.0,3.0,1.0,1.0,1,1


In [77]:
dataset_normal[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm,label,attack_cat
0,33.0,68854.0,24.0,47344.0,120.0,2.0,0.001055,132.0,164.0,31.0,...,0.0,0.0,0.0,0.0,0.0,3.0,7.0,1.0,0.0,0
1,33.0,90600.0,27.0,253.0,120.0,2.0,0.036133,528.0,304.0,31.0,...,0.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,0.0,0
2,39.0,69669.0,25.0,47344.0,120.0,2.0,0.001119,146.0,178.0,31.0,...,0.0,0.0,0.0,0.0,0.0,12.0,8.0,1.0,0.0,0
3,38.0,93095.0,23.0,47344.0,120.0,2.0,0.001209,132.0,164.0,31.0,...,0.0,0.0,0.0,0.0,0.0,6.0,9.0,1.0,0.0,0
4,36.0,108205.0,8.0,47344.0,120.0,2.0,0.001169,146.0,178.0,31.0,...,0.0,0.0,0.0,0.0,0.0,7.0,9.0,1.0,0.0,0


In [103]:
dataset_all = pd.concat([dataset_attack, dataset_normal])
from sklearn.utils import shuffle
dataset_final = shuffle(dataset_all).reset_index(drop=True)
print(dataset_attack.shape)
print(dataset_normal.shape)
dataset_final.shape

(815481, 38)
(2141782, 38)


(2957263, 38)

In [104]:
dataset_final[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,ct_state_ttl,ct_flw_http_mthd,ct_srv_src,ct_srv_dst,ct_dst_src_ltm,attack_cat,label
0,30.0,44760.0,15.0,50060.0,114.0,5.0,0.563389,917.0,354.0,254.0,...,0.103581,0.060203,0.043379,1.0,0.795682,1.0,1.0,1.0,2,1
1,29.0,1578.0,14.0,62606.0,114.0,5.0,8.658444,5117.0,906106.0,254.0,...,0.132516,0.077684,0.054832,1.0,1.0,1.0,1.0,1.0,7,1
2,37.0,101326.0,24.0,46132.0,114.0,5.0,0.299911,1470.0,1728.0,31.0,...,0.000713,0.000587,0.000126,0.0,0.0,4.0,19.0,4.0,0,0
3,31.0,16307.0,12.0,30252.0,120.0,6.0,2e-06,252.0,0.0,254.0,...,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,5,1
4,36.0,87088.0,27.0,61478.0,114.0,5.0,0.018423,1540.0,1644.0,31.0,...,0.0006,0.00047,0.00013,0.0,0.0,15.0,14.0,7.0,0,0


In [105]:
dataset_final.to_csv(r'..\dataset_final_ncr_sm.csv', index = False)