In [23]:
import pandas as pd
import numpy as np

import copy
import warnings
warnings.filterwarnings('ignore')

path = "./UNSW-NB15 - CSV Files/"

# 1. Combine Data and Preprocessing

## 1.1 Import Datasets

In [36]:
# Start from an empty frame; the dataset parts are collected into it later
df = pd.DataFrame()

# Load the feature code book and discard its running-number column 'No.'
codebook = pd.read_csv(path+'NUSW-NB15_features.csv', encoding='cp1252').drop(columns='No.')
codebook

Unnamed: 0,Name,Type,Description
0,srcip,nominal,Source IP address
1,sport,integer,Source port number
2,dstip,nominal,Destination IP address
3,dsport,integer,Destination port number
4,proto,nominal,Transaction protocol
5,state,nominal,Indicates to the state and its dependent proto...
6,dur,Float,Record total duration
7,sbytes,Integer,Source to destination transaction bytes
8,dbytes,Integer,Destination to source transaction bytes
9,sttl,Integer,Source to destination time to live value


In [38]:
# Read each dataset part from disk and combine them with a single concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on
# every iteration; building the frame in one step also makes this cell safe to
# re-run (the old version kept appending to whatever `df` already held).
# NOTE(review): range(1, 4) loads parts 1-3 only -- confirm that any further
# part is left out on purpose.
parts = [
    pd.read_csv(path+'UNSW-NB15_%d.csv'%i, names = codebook['Name'])
    for i in range(1,4)
]
df = pd.concat(parts, ignore_index=True)

## 1.2 Preprocessing

### 1.2.1 attack_cat
- fill na as normal
- clean attack labels 

In [66]:
# During the preliminary data exploration,
# we found that some labels in attack_cat need to be merged
# eg:("backdoor" and "backdoors"; " Fuzzers" & " Fuzzers " )
def attack_cat_processing(s):
    """Normalise one attack-category label.

    Surrounding spaces are stripped and a trailing lowercase "s" is dropped so
    that singular and plural spellings collapse to one label. 'Analysis' is
    kept as is because its final "s" is not a plural marker.

    input:  s -- the raw label (string)

    return: the cleaned label; an empty or blank label yields ''
    """
    s = s.strip(' ')
    # endswith() is safe on an empty string, whereas s[-1] raised IndexError
    if s.endswith('s') and s != 'Analysis':
        s = s[:-1]
    return s
    
# Missing attack categories become 'Normal'; every label is then passed
# through attack_cat_processing to unify its spelling
df['attack_cat'] = df['attack_cat'].fillna('Normal').apply(attack_cat_processing)

#### Points to note:
- IP addresses (srcip & dstip), proto, and time service need to be recoded
- Stime & Ltime?

- Extreme values need to be clamped
- The data needs balancing

In [70]:
print("Samples in total:", df.shape[0])

# Label is 0 for normal records and 1 for attack records;
# summing the boolean mask counts the matching rows
print("Normal samples:", (df.Label==0).sum())
print("Attack samples:", (df.Label==1).sum())

Samples in total: 2100003
Normal samples: 1867614
Attack samples: 232389


In [68]:
# Data description: summary statistics for every column, numeric and non-numeric
df.describe(include='all')

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,2100003,2100003.0,2100003,2100003.0,2100003,2100003,2100003.0,2100003.0,2100003.0,2100003.0,...,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003,2100003.0
unique,43,100325.0,47,127484.0,135,16,,,,,...,13.0,,,,,,,,10,
top,59.166.0.4,1043.0,149.171.126.2,53.0,tcp,FIN,,,,,...,0.0,,,,,,,,Normal,
freq,170743,149468.0,170658,436149.0,1280203,1266257,,,,,...,1056339.0,,,,,,,,1867614,
mean,,,,,,,0.6751155,4451.429,38417.03,58.64869,...,,8.723637,8.492137,6.029005,6.504727,4.142399,3.210584,6.096303,,0.1106613
std,,,,,,,15.22083,54365.47,165482.3,70.68249,...,,10.3558,10.32129,7.790589,7.868388,8.022338,5.840231,10.67967,,0.3137123
min,,,,,,,0.0,0.0,0.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,0.0
25%,,,,,,,0.00106,264.0,178.0,31.0,...,,2.0,2.0,2.0,2.0,1.0,1.0,1.0,,0.0
50%,,,,,,,0.017921,1540.0,2260.0,31.0,...,,5.0,5.0,3.0,4.0,1.0,1.0,2.0,,0.0
75%,,,,,,,0.223476,3390.0,15258.0,31.0,...,,10.0,9.0,6.0,7.0,2.0,1.0,4.0,,0.0


# 2. Data Processing

## 2.1 Clamping

To reduce the skewness of some distributions, it is recommended to limit extreme values. A common approach is to take each feature whose maximum value is more than ten times its median value and set the values above its 95th percentile to that percentile. However, if the 95th percentile is very close to the maximum value, it may indicate that the tail of the distribution contains valuable information, and should not be pruned.

It's important to note that this approach should only be used on features with a maximum value exceeding ten times the median. This avoids over-pruning of bimodal and small value distributions.

In [69]:
# Select the numeric columns (the candidates for clamping) and inspect their
# summary statistics; no values are modified in this cell
df_numeric = df.select_dtypes(include=[np.number])
df_numeric.describe(include='all')

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,ct_flw_http_mthd,is_ftp_login,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
count,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,...,1158342.0,1103536.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0
mean,0.6751155,4451.429,38417.03,58.64869,31.04114,5.373348,17.18504,30672840.0,2555565.0,34.77248,...,0.2091153,0.03380678,8.723637,8.492137,6.029005,6.504727,4.142399,3.210584,6.096303,0.1106613
std,15.22083,54365.47,165482.3,70.68249,40.90932,21.84343,58.06628,108963400.0,4279519.0,76.79977,...,0.7839408,0.184267,10.3558,10.32129,7.790589,7.868388,8.022338,5.840231,10.67967,0.3137123
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,0.00106,264.0,178.0,31.0,29.0,0.0,0.0,118366.2,34590.92,2.0,...,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0
50%,0.017921,1540.0,2260.0,31.0,29.0,3.0,4.0,571988.2,618320.6,14.0,...,0.0,0.0,5.0,5.0,3.0,4.0,1.0,1.0,2.0,0.0
75%,0.223476,3390.0,15258.0,31.0,29.0,7.0,14.0,1824163.0,3237420.0,46.0,...,0.0,0.0,10.0,9.0,6.0,7.0,2.0,1.0,4.0,0.0
max,8786.638,14355770.0,14657530.0,255.0,254.0,5319.0,5507.0,5988000000.0,128761900.0,10646.0,...,36.0,4.0,67.0,67.0,67.0,67.0,67.0,60.0,67.0,1.0


In [75]:
# Clamp the data directly, in place on the original frame.
# A column is clamped only when its maximum exceeds both ten times its median
# and the absolute value 10; values above the 95th percentile are then set to
# that percentile.
for col in df_numeric.columns:
    col_max = df_numeric[col].max()
    if col_max>10*df_numeric[col].median() and col_max>10:
        # Compute the cap once. clip() leaves NaN as NaN, whereas the former
        # np.where(df[col] < cap, df[col], cap) replaced every missing value
        # with the cap because NaN < cap evaluates to False.
        cap = df[col].quantile(0.95)
        df[col] = df[col].clip(upper=cap)

In [73]:
# Summary statistics for every column, to review the effect of clamping
df.describe(include='all')

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,2100003,2100003.0,2100003,2100003.0,2100003,2100003,2100003.0,2100003.0,2100003.0,2100003.0,...,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003,2100003.0
unique,43,100325.0,47,127484.0,135,16,,,,,...,13.0,,,,,,,,10,
top,59.166.0.4,1043.0,149.171.126.2,53.0,tcp,FIN,,,,,...,0.0,,,,,,,,Normal,
freq,170743,149468.0,170658,436149.0,1280203,1266257,,,,,...,1056339.0,,,,,,,,1867614,
mean,,,,,,,0.2608372,3056.101,12825.4,58.64869,...,,8.351984,8.12186,5.531559,6.042557,3.659807,3.033895,5.724219,,0.1106613
std,,,,,,,0.4792901,4715.772,21096.89,70.68249,...,,9.122398,9.078506,5.892815,6.118841,6.103987,5.150554,9.38271,,0.3137123
min,,,,,,,0.0,0.0,0.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,0.0
25%,,,,,,,0.00106,264.0,178.0,31.0,...,,2.0,2.0,2.0,2.0,1.0,1.0,1.0,,0.0
50%,,,,,,,0.017921,1540.0,2260.0,31.0,...,,5.0,5.0,3.0,4.0,1.0,1.0,2.0,,0.0
75%,,,,,,,0.223476,3390.0,15258.0,31.0,...,,10.0,9.0,6.0,7.0,2.0,1.0,4.0,,0.0


In [76]:
print("Samples in total:", df.shape[0])

# Label is 0 for normal records and 1 for attack records;
# summing the boolean mask counts the matching rows
print("Normal samples:", (df.Label==0).sum())
print("Attack samples:", (df.Label==1).sum())

Samples in total: 2100003
Normal samples: 1867614
Attack samples: 232389


## 2.2 Log

In [77]:
# Log-transform the numeric features that have more than 50 distinct values.
df_numeric = df.select_dtypes(include=[np.number])
df_before = df_numeric.copy()  # snapshot of the numeric columns before the transform
DEBUG = 0

for feature in df_numeric.columns:
    if df_numeric[feature].nunique()<=50:
        continue
    lowest = df_numeric[feature].min()
    if lowest==0:
        # log1p(x) equals log(x + 1) but stays accurate for values close to 0
        df[feature] = np.log1p(df[feature])
    elif lowest>0:
        df[feature] = np.log(df[feature])
    # Columns containing negative values are left untransformed: np.log would
    # silently turn those entries into NaN.

# Refresh the numeric view so it reflects the transformed values
df_numeric = df.select_dtypes(include=[np.number])

In [78]:
# Summary statistics for every column, to review the effect of the log transform
df.describe(include='all')

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,2100003,2100003.0,2100003,2100003.0,2100003,2100003,2100003.0,2100003.0,2100003.0,2100003.0,...,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003.0,2100003,2100003.0
unique,43,100325.0,47,127484.0,135,16,,,,,...,13.0,,,,,,,,10,
top,59.166.0.4,1043.0,149.171.126.2,53.0,tcp,FIN,,,,,...,0.0,,,,,,,,Normal,
freq,170743,149468.0,170658,436149.0,1280203,1266257,,,,,...,1056339.0,,,,,,,,1867614,
mean,,,,,,,0.178714,6.950937,6.731545,58.64869,...,,8.351984,8.12186,5.531559,6.042557,3.659807,3.033895,5.724219,,0.1106613
std,,,,,,,0.3013446,1.577997,3.577663,70.68249,...,,9.122398,9.078506,5.892815,6.118841,6.103987,5.150554,9.38271,,0.3137123
min,,,,,,,0.0,0.0,0.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,0.0
25%,,,,,,,0.001059439,5.57973,5.187386,31.0,...,,2.0,2.0,2.0,2.0,1.0,1.0,1.0,,0.0
50%,,,,,,,0.01776231,7.340187,7.723562,31.0,...,,5.0,5.0,3.0,4.0,1.0,1.0,2.0,,0.0
75%,,,,,,,0.201696,8.12888,9.632925,31.0,...,,10.0,9.0,6.0,7.0,2.0,1.0,4.0,,0.0


## 2.3 Reducing cardinalities

We are going to reduce the cardinality of the high-cardinality categorical features to at most 6 by keeping their 5 most frequent labels and replacing every remaining, seldom-used label with a single '-' label. This prevents the encoding process from causing a large increase in dimensionality.

In [95]:
# Select the non-numeric columns and summarise them (count, unique, top, freq)
df_cat = df.select_dtypes(exclude=[np.number])
df_cat.describe(include='all')

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,service,ct_ftp_cmd,attack_cat
count,2100003,2100003,2100003,2100003,2100003,2100003,2100003,2100003,2100003
unique,6,6,6,6,6,6,5,6,6
top,-,-,-,-,tcp,FIN,-,0,Normal
freq,1247592,1770611,1248296,1228064,1280203,1266257,1152172,1056339,1867614


In [80]:
# For every non-numeric column with more than 6 distinct values, keep the five
# most frequent values and replace everything else with '-'.
# NOTE(review): df_cat also contains attack_cat, so its rarer categories are
# collapsed into '-' as well -- confirm this is intended.
for feature in df_cat.columns:
    if df_cat[feature].nunique()<=6:
        continue
    column = df[feature]
    # value_counts() sorts by frequency, so head() yields the top five values
    most_frequent = column.value_counts().head().index
    df[feature] = np.where(column.isin(most_frequent), column, '-')

In [82]:
print("Samples in total:", df.shape[0])

# Label is 0 for normal records and 1 for attack records;
# summing the boolean mask counts the matching rows
print("Normal samples:", (df.Label==0).sum())
print("Attack samples:", (df.Label==1).sum())

Samples in total: 2100003
Normal samples: 1867614
Attack samples: 232389


## 2.? Balancing (optional)

In [94]:

from imblearn.combine import SMOTEENN

def balancing(x,y,target):
    '''
    Balance the data with SMOTEENN and return features and target in one frame.

    input:  x -- the x values (dataframe); every column must be numeric, the
                 resampler cannot convert strings such as IP addresses
            y -- the y values, one per row of x
            target -- the name of the target column (a string, or a
                      one-element list holding that name)

    return: balanced dataframe with the columns of x followed by the target
    '''
    sme = SMOTEENN(random_state=42)
    x_new, Y_new = sme.fit_resample(x,y)

    x_new = pd.DataFrame(x_new,columns = x.columns.tolist() )

    # `columns` must be a collection, so a bare string name is wrapped in a list.
    target_columns = [target] if isinstance(target, str) else list(target)
    # Build the target frame from the raw values: handing a named Series to
    # pd.DataFrame(..., columns=...) selects by label and yields an empty
    # column whenever the Series name differs from `target`.
    Y_new = pd.DataFrame(
        np.asarray(Y_new).reshape(len(x_new), -1),
        columns = target_columns,
        index = x_new.index,
    )
    temp_df = pd.concat([x_new,Y_new],axis = 1)
    return temp_df

ValueError: could not convert string to float: '59.166.0.0'