In [1]:

import warnings; warnings.simplefilter('ignore')
from sklearn.preprocessing import StandardScaler
import numpy as np 
import pandas as pd 



In [3]:
# loading data
train = pd.read_csv('../data/raw/UNSW_NB15_training-set.csv')
test = pd.read_csv('../data/raw/UNSW_NB15_testing-set.csv')
if train.shape[0]<100000:
    print("Train test sets are reversed. Fixing. ")
    train, test = test, train

# dropping columns with output message
drop_columns = ['attack_cat', 'id'] + ['response_body_len', 'is_sm_ips_ports', 'ct_flw_http_mthd', 'trans_depth', 'dwin', 'ct_ftp_cmd', 'is_ftp_login']
for df in [train, test]:
    for col in drop_columns:
        if col in df.columns:
            print('Dropping '+col)
            df.drop([col], axis=1, inplace=True)

Train test sets are reversed. Fixing. 
Dropping attack_cat
Dropping id
Dropping response_body_len
Dropping is_sm_ips_ports
Dropping ct_flw_http_mthd
Dropping trans_depth
Dropping dwin
Dropping ct_ftp_cmd
Dropping is_ftp_login
Dropping attack_cat
Dropping id
Dropping response_body_len
Dropping is_sm_ips_ports
Dropping ct_flw_http_mthd
Dropping trans_depth
Dropping dwin
Dropping ct_ftp_cmd
Dropping is_ftp_login


In [4]:
# Cleaning categories and subcategories

def feature_engineer(df):
    df.loc[~df['state'].isin(['FIN', 'INT', 'CON', 'REQ', 'RST']), 'state'] = 'others'
    df.loc[~df['service'].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']), 'service'] = 'others'
    df.loc[df['proto'].isin(['igmp', 'icmp', 'rtp']), 'proto'] = 'igmp_icmp_rtp'
    df.loc[~df['proto'].isin(['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp']), 'proto'] = 'others'
    return df

def get_cat_columns(train):
    categorical = []
    for col in train.columns:
        if train[col].dtype == 'object':
            categorical.append(col)
    return categorical



In [5]:
# preprocesing
x_train, y_train = train.drop(['label'], axis=1), train['label']
x_test, y_test = test.drop(['label'], axis=1), test['label']

x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

categorical_columns = get_cat_columns(x_train)
non_categorical_columns = [x for x in x_train.columns if x not in categorical_columns]

scaler = StandardScaler()
x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])


x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
print("Column mismatch {0}, {1}".format(set(x_train.columns)- set(x_test.columns),  set(x_test.columns)- set(x_train.columns)))
features = list(set(x_train.columns) & set(x_test.columns))

print(f"Number of features {len(features)}")
x_train = x_train[features]
x_test = x_test[features]

Column mismatch set(), set()
Number of features 53


In [6]:

# naming y
x_train['label'] = y_train
x_test['label'] = y_test


In [11]:

x_train.to_csv('../data/processed/train.csv', index=False)
x_test.to_csv('../data/processed/test.csv', index=False)



In [7]:
pd.set_option('display.max.columns', None)

In [8]:
x_train

Unnamed: 0,service_dns,dpkts,state_CON,service_ftp,sttl,tcprtt,ct_dst_src_ltm,ct_src_ltm,synack,sload,ct_srv_dst,service_pop3,rate,proto_udp,state_others,sinpkt,proto_ospf,ct_dst_ltm,proto_others,ackdat,djit,proto_tcp,dtcpb,state_FIN,ct_dst_sport_ltm,dttl,proto_igmp_icmp_rtp,sloss,dload,dmean,state_RST,smean,dur,proto_arp,service_smtp,state_REQ,service_ftp-data,dbytes,dloss,swin,ct_src_dport_ltm,service_-,dinpkt,ct_srv_src,sbytes,service_http,service_others,stcpb,ct_state_ttl,service_ssh,state_INT,sjit,spkts,label
0,0,-0.135769,0,0,0.703839,-0.521660,-0.705529,-0.715714,-0.484346,-0.389897,-0.753074,0,-0.576371,0,0,-0.132788,0,-0.645013,0,-0.503014,-0.145905,1,0.911123,1,-0.554373,1.578100,0,-0.075040,-0.273700,-0.314240,0,-0.458048,-0.191029,0,0,0,0,-0.102726,-0.131759,1.092456,-0.544736,1,-0.080885,-0.775991,-0.049134,0,0,-0.256392,-1.366486,0,0,-0.109997,-0.104456,0
1,0,0.172599,0,0,-1.141901,-0.521660,-0.614256,-0.715714,-0.484346,-0.389928,-0.288257,0,-0.576345,0,0,-0.129251,0,-0.645013,0,-0.503014,0.192913,1,1.557251,1,-0.554373,1.560002,0,-0.044739,-0.069233,3.800869,0,-0.414076,-0.109485,0,0,0,0,0.188544,0.190621,1.092456,-0.544736,1,-0.073735,3.147666,-0.046410,0,0,0.331031,-0.318711,0,0,-0.109302,-0.046014,0
2,0,-0.026933,0,0,-1.141901,0.888444,-0.522983,-0.595543,0.931748,-0.389964,-0.288257,0,-0.576734,0,0,-0.104126,0,-0.520827,0,0.742202,2.663504,1,1.472854,1,-0.554373,1.560002,0,-0.059890,-0.252044,2.709185,0,-0.443391,0.040699,0,0,0,0,-0.012133,-0.017978,1.092456,-0.544736,1,0.014711,-0.215468,-0.048527,0,0,0.846258,-0.318711,0,0,0.271392,-0.089845,0
3,0,-0.063212,0,1,-1.141901,-0.521660,-0.522983,-0.595543,-0.484346,-0.389958,-0.753074,0,-0.576737,0,0,-0.115034,0,-0.520827,0,-0.503014,1.080373,1,0.058025,1,-0.554373,1.560002,0,-0.059890,-0.275821,-0.232945,0,-0.414076,0.049729,0,0,0,0,-0.098563,-0.074868,1.092456,-0.544736,0,0.002046,-0.775991,-0.047016,0,0,0.101729,-0.318711,0,0,-0.104906,-0.060624,0
4,0,-0.117630,0,0,0.723268,1.096172,2.854115,-0.595543,1.154999,-0.389927,2.779535,0,-0.576617,0,0,-0.129549,0,-0.520827,0,0.909954,-0.120301,1,0.744668,1,-0.554373,1.560002,0,-0.044739,-0.275561,-0.306498,0,-0.409190,-0.140417,0,0,0,0,-0.102057,-0.112795,1.092456,-0.420468,1,-0.012721,3.147666,-0.047554,0,0,1.082366,-0.318711,0,0,-0.056942,-0.075235,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,1,-0.172047,0,0,0.723268,-0.521660,1.393748,2.048221,-0.484346,-0.120980,1.385084,0,0.094951,1,0,-0.136142,0,2.211259,0,-0.503014,-0.148818,0,-0.715569,0,1.520470,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.389647,-0.209773,0,0,0,0,-0.103923,-0.131759,-0.915407,2.313443,0,-0.089370,1.372678,-0.049958,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,1
175337,0,-0.099490,0,0,0.723268,0.731463,-0.614256,-0.715714,0.365776,-0.389926,-0.753074,0,-0.576616,0,0,-0.128631,0,-0.645013,0,1.041069,-0.119225,1,1.834729,1,-0.554373,1.560002,0,-0.044739,-0.275183,-0.310369,0,-0.365219,-0.131728,0,0,0,0,-0.101459,-0.112795,1.092456,-0.544736,1,-0.021513,-0.775991,-0.047062,0,0,1.881207,-0.318711,0,0,-0.027914,-0.075235,1
175338,1,-0.172047,0,0,0.723268,-0.521660,0.389746,-0.475371,-0.484346,-0.120980,0.269523,0,0.094951,1,0,-0.136142,0,-0.396641,0,-0.503014,-0.148818,0,-0.715569,0,-0.208566,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.389647,-0.209773,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.296199,0,-0.089370,0.251634,-0.049958,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,1
175339,1,-0.172047,0,0,0.723268,-0.521660,1.941386,2.769248,-0.484346,-0.120980,1.942865,0,0.094951,1,0,-0.136142,0,2.956374,0,-0.503014,-0.148818,0,-0.715569,0,1.693374,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.389647,-0.209773,0,0,0,0,-0.103923,-0.131759,-0.915407,3.059055,0,-0.089370,1.933201,-0.049958,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,1


In [12]:
train

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,tcprtt,synack,ackdat,smean,dmean,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_src_ltm,ct_srv_dst,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087490,252,254,1.415894e+04,8495.365234,0,0,24.295600,8.375000,30.177547,11.830604,255,621772692,2202533631,0.000000,0.000000,0.000000,43,43,1,0,1,1,1,1,1,1,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,252,8.395112e+03,503571.312500,2,17,49.915000,15.432865,61.426934,1387.778330,255,1417884146,3077387971,0.000000,0.000000,0.000000,52,1106,43,1,1,1,1,2,1,6,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,252,1.572272e+03,60929.230470,1,6,231.875571,102.737203,17179.586860,11420.926230,255,2116150707,2963114973,0.111897,0.061458,0.050439,46,824,7,1,2,1,1,3,2,6,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,252,2.740179e+03,3358.622070,1,3,152.876547,90.235726,259.080172,4991.784669,255,1107119177,1047442890,0.000000,0.000000,0.000000,52,64,1,1,2,1,1,3,2,1,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,252,8.561499e+03,3987.059814,2,1,47.750333,75.659602,2415.837634,115.807000,255,2436137549,1977154190,0.128381,0.071147,0.057234,53,45,43,1,2,2,1,40,2,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,0,5.066666e+07,0.000000,0,0,0.009000,0.000000,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,57,0,24,2,24,24,13,24,24,24,1
175337,0.505762,tcp,-,FIN,10,8,620,354,33.612649,254,252,8.826286e+03,4903.492188,2,1,54.400111,66.980570,3721.068786,120.177727,255,3518776216,3453092386,0.099440,0.036895,0.062545,62,44,1,1,1,1,1,2,1,1,1
175338,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,0,5.066666e+07,0.000000,0,0,0.009000,0.000000,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,57,0,12,2,3,3,3,13,3,12,1
175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,0,5.066666e+07,0.000000,0,0,0.009000,0.000000,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,57,0,30,2,30,30,14,30,30,30,1


In [13]:
x_test

Unnamed: 0,service_dns,dpkts,state_CON,service_ftp,sttl,tcprtt,ct_dst_src_ltm,ct_src_ltm,synack,sload,ct_srv_dst,service_pop3,rate,proto_udp,state_others,sinpkt,proto_ospf,ct_dst_ltm,proto_others,ackdat,djit,proto_tcp,dtcpb,state_FIN,ct_dst_sport_ltm,dttl,proto_igmp_icmp_rtp,sloss,dload,dmean,state_RST,smean,dur,proto_arp,service_smtp,state_REQ,service_ftp-data,dbytes,dloss,swin,ct_src_dport_ltm,service_-,dinpkt,ct_srv_src,sbytes,service_http,service_others,stcpb,ct_state_ttl,service_ssh,state_INT,sjit,spkts,label
0,0,-0.172047,0,0,0.723268,-0.521660,-0.614256,-0.715714,-0.484346,0.567591,-0.660111,0,-0.027189,1,0,-0.136141,0,-0.645013,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,0.543531,-0.209773,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.682570,-0.047772,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
1,0,-0.172047,0,0,0.723268,-0.521660,-0.614256,-0.715714,-0.484346,4.287318,-0.660111,0,0.178922,1,0,-0.136142,0,-0.645013,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,3.636212,-0.209774,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.682570,-0.040528,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
2,0,-0.172047,0,0,0.723268,-0.521660,-0.522983,-0.715714,-0.484346,4.146097,-0.567147,0,0.632367,1,0,-0.136142,0,-0.645013,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,1.940856,-0.209774,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.589150,-0.044499,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
3,0,-0.172047,0,0,0.723268,-0.521660,-0.522983,-0.595543,-0.484346,2.795469,-0.567147,0,0.430836,1,0,-0.136142,0,-0.520827,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,1.530453,-0.209774,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.420468,1,-0.089370,-0.589150,-0.045460,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
4,0,-0.172047,0,0,0.723268,-0.521660,-0.522983,-0.595543,-0.484346,4.124860,-0.567147,0,0.027774,1,0,-0.136141,0,-0.520827,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,4.525419,-0.209773,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.420468,1,-0.089370,-0.589150,-0.038445,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0,-0.172047,0,0,0.723268,-0.521660,-0.614256,-0.595543,-0.484346,0.051742,-0.753074,0,0.632367,1,0,-0.136142,0,-0.520827,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.414076,-0.209774,0,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.775991,-0.050015,0,0,-0.715177,0.729064,0,1,-0.110668,-0.133677,0
82328,0,-0.099490,0,0,0.723268,1.661073,-0.705529,-0.475371,1.824221,-0.389314,-0.660111,0,-0.576672,0,0,-0.128427,0,-0.520827,0,1.299597,-0.101790,1,1.710060,1,-0.554373,1.560002,0,0.031013,-0.276282,-0.310369,0,3.743699,-0.039086,0,0,0,0,-0.101459,-0.112795,1.092456,-0.544736,1,0.056209,-0.775991,0.052740,0,0,0.076210,-0.318711,0,0,-0.003961,-0.002182,0
82329,0,-0.172047,0,0,-1.744196,-0.521660,-0.705529,-0.715714,-0.484346,-0.389973,-0.753074,0,-0.576819,0,0,8.148703,0,-0.645013,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.443391,-0.209775,1,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.775991,-0.050347,0,0,-0.715177,0.729064,0,1,-0.110668,-0.140982,0
82330,0,-0.172047,0,0,-1.744196,-0.521660,-0.705529,-0.715714,-0.484346,-0.389973,-0.753074,0,-0.576819,0,0,8.148705,0,-0.645013,0,-0.503014,-0.148818,0,-0.715569,0,-0.554373,-0.720406,0,-0.075040,-0.277208,-0.480703,0,-0.443391,-0.209775,1,0,0,0,-0.103923,-0.131759,-0.915407,-0.544736,1,-0.089370,-0.775991,-0.050347,0,0,-0.715177,0.729064,0,1,-0.110424,-0.140982,0
