In [1]:

import warnings; warnings.simplefilter('ignore')
from sklearn.preprocessing import StandardScaler
import numpy as np 
import pandas as pd 



In [2]:
# loading data
train = pd.read_csv('../data/UNSW_NB15_training-set.csv')
test = pd.read_csv('../data/UNSW_NB15_training-set.csv')
if train.shape[0]<100000:
    print("Train test sets are reversed. Fixing. ")
    train, test = test, train

# dropping columns with output message
drop_columns = ['attack_cat', 'id'] + ['response_body_len', 'is_sm_ips_ports', 'ct_flw_http_mthd', 'trans_depth', 'dwin', 'ct_ftp_cmd', 'is_ftp_login']
for df in [train, test]:
    for col in drop_columns:
        if col in df.columns:
            print('Dropping '+col)
            df.drop([col], axis=1, inplace=True)

Train test sets are reversed. Fixing. 
Dropping attack_cat
Dropping id
Dropping response_body_len
Dropping is_sm_ips_ports
Dropping ct_flw_http_mthd
Dropping trans_depth
Dropping dwin
Dropping ct_ftp_cmd
Dropping is_ftp_login
Dropping attack_cat
Dropping id
Dropping response_body_len
Dropping is_sm_ips_ports
Dropping ct_flw_http_mthd
Dropping trans_depth
Dropping dwin
Dropping ct_ftp_cmd
Dropping is_ftp_login


In [3]:
# Cleaning categories and subcategories

def feature_engineer(df):
    df.loc[~df['state'].isin(['FIN', 'INT', 'CON', 'REQ', 'RST']), 'state'] = 'others'
    df.loc[~df['service'].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']), 'service'] = 'others'
    df.loc[df['proto'].isin(['igmp', 'icmp', 'rtp']), 'proto'] = 'igmp_icmp_rtp'
    df.loc[~df['proto'].isin(['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp']), 'proto'] = 'others'
    return df

def get_cat_columns(train):
    categorical = []
    for col in train.columns:
        if train[col].dtype == 'object':
            categorical.append(col)
    return categorical



In [4]:
# preprocesing
x_train, y_train = train.drop(['label'], axis=1), train['label']
x_test, y_test = test.drop(['label'], axis=1), test['label']

x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

categorical_columns = get_cat_columns(x_train)
non_categorical_columns = [x for x in x_train.columns if x not in categorical_columns]

scaler = StandardScaler()
x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])


x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
print("Column mismatch {0}, {1}".format(set(x_train.columns)- set(x_test.columns),  set(x_test.columns)- set(x_train.columns)))
features = list(set(x_train.columns) & set(x_test.columns))

print(f"Number of features {len(features)}")
x_train = x_train[features]
x_test = x_test[features]

Column mismatch set(), set()
Number of features 53


In [5]:

# naming y
x_train['label'] = y_train
x_test['label'] = y_test
#x_train.to_csv('../data/train.csv', index=False)
#x_test.to_csv('../data/test.csv', index=False)



In [8]:
pd.set_option('display.max.columns', None)

In [9]:
x_train

Unnamed: 0,service_pop3,ct_srv_dst,spkts,dtcpb,service_smtp,state_FIN,state_RST,proto_others,synack,service_others,state_INT,dbytes,proto_ospf,state_CON,dpkts,sjit,dur,service_http,proto_tcp,rate,sload,proto_udp,dinpkt,service_ssh,smean,tcprtt,sbytes,ct_dst_ltm,proto_igmp_icmp_rtp,ct_state_ttl,dttl,service_-,state_REQ,service_ftp,service_dns,stcpb,ct_src_ltm,dmean,djit,ackdat,sinpkt,dload,ct_dst_sport_ltm,dloss,swin,service_ftp-data,ct_srv_src,sttl,sloss,state_others,ct_src_dport_ltm,ct_dst_src_ltm,proto_arp,label
0,0,-0.644190,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213727,0,0,0.057181,0.643913,1,-0.094169,0,0.520319,-0.482025,-0.043684,-0.563660,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.640033,-0.475371,-0.147218,-0.484073,-0.122179,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.680474,0.719440,-0.073531,0,-0.468312,-0.477994,0,0
1,0,-0.644190,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213728,0,0,0.286565,4.539351,1,-0.094169,0,3.556716,-0.482025,-0.036308,-0.563660,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.640033,-0.475371,-0.147218,-0.484073,-0.122180,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.680474,0.719440,-0.073531,0,-0.468312,-0.477994,0,0
2,0,-0.554273,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213729,0,0,0.791209,4.391459,1,-0.094169,0,1.892214,-0.482025,-0.040351,-0.563660,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.640033,-0.475371,-0.147218,-0.484073,-0.122180,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.590304,0.719440,-0.073531,0,-0.468312,-0.390391,0,0
3,0,-0.554273,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213729,0,0,0.566923,2.977031,1,-0.094169,0,1.489280,-0.482025,-0.041330,-0.444868,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.522990,-0.475371,-0.147218,-0.484073,-0.122180,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.590304,0.719440,-0.073531,0,-0.349115,-0.390391,0,0
4,0,-0.554273,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213728,0,0,0.118350,4.369219,1,-0.094169,0,4.429740,-0.482025,-0.034187,-0.444868,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.522990,-0.475371,-0.147218,-0.484073,-0.122179,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.590304,0.719440,-0.073531,0,-0.349115,-0.390391,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0,-0.734107,-0.124455,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213729,0,0,0.791209,0.103697,1,-0.094169,0,-0.419860,-0.482025,-0.045967,-0.444868,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.522990,-0.475371,-0.147218,-0.484073,-0.122180,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.770643,0.719440,-0.073531,0,-0.468312,-0.477994,0,0
82328,0,-0.644190,0.009958,1.599744,0,1,0,0,1.001157,0,0,-0.085032,0,0,-0.082596,-0.027589,0.021090,0,1,-0.554345,-0.358193,0,0.017022,0,3.662246,1.010876,0.058658,-0.444868,0,-0.346027,1.339599,1,0,0,0,-0.008704,-0.405947,-0.295484,-0.094683,0.841263,-0.113143,-0.262561,-0.450186,-0.095293,0.954338,0,-0.770643,0.719440,0.034746,0,-0.468312,-0.565597,0,0
82329,0,-0.734107,-0.131922,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.112177,-0.213730,0,0,-0.554509,-0.358883,0,-0.094169,0,-0.448641,-0.482025,-0.046305,-0.563660,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.640033,-0.475371,-0.147218,-0.484073,9.582625,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.770643,-1.782709,-0.073531,0,-0.468312,-0.565597,1,0
82330,0,-0.734107,-0.131922,-0.776754,0,0,0,0,-0.412910,0,1,-0.087369,0,0,-0.151816,-0.111984,-0.213730,0,0,-0.554509,-0.358883,0,-0.094169,0,-0.448641,-0.482025,-0.046305,-0.563660,0,0.591021,-0.820395,1,0,0,0,-0.779840,-0.640033,-0.475371,-0.147218,-0.484073,9.582627,-0.263498,-0.450186,-0.113244,-1.047920,0,-0.770643,-1.782709,-0.073531,0,-0.468312,-0.565597,1,0


In [10]:
train

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,tcprtt,synack,ackdat,smean,dmean,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_src_ltm,ct_srv_dst,label
0,0.000011,udp,-,INT,2,0,496,0,90909.090200,254,0,1.803636e+08,0.000000,0,0,0.011000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,248,0,2,2,1,1,1,2,1,2,0
1,0.000008,udp,-,INT,2,0,1762,0,125000.000300,254,0,8.810000e+08,0.000000,0,0,0.008000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,881,0,2,2,1,1,1,2,1,2,0
2,0.000005,udp,-,INT,2,0,1068,0,200000.005100,254,0,8.544000e+08,0.000000,0,0,0.005000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,534,0,3,2,1,1,1,3,1,3,0
3,0.000006,udp,-,INT,2,0,900,0,166666.660800,254,0,6.000000e+08,0.000000,0,0,0.006000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,450,0,3,2,2,2,1,3,2,3,0
4,0.000010,udp,-,INT,2,0,2126,0,100000.002500,254,0,8.504000e+08,0.000000,0,0,0.010000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,1063,0,3,2,2,2,1,3,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0.000005,udp,-,INT,2,0,104,0,200000.005100,254,0,8.320000e+07,0.000000,0,0,0.005000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,52,0,1,2,2,1,1,2,2,1,0
82328,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,254,252,1.241044e+05,2242.109863,7,1,55.880051,143.7,4798.130981,190.980813,255,1072535109,3284291478,0.173208,0.100191,0.073017,903,44,1,1,2,1,1,1,3,2,0
82329,0.000000,arp,-,INT,1,0,46,0,0.000000,0,0,0.000000e+00,0.000000,0,0,60000.720000,0.0,0.000000,0.000000,0,0,0,0.000000,0.000000,0.000000,46,0,1,2,1,1,1,1,1,1,0
82330,0.000000,arp,-,INT,1,0,46,0,0.000000,0,0,0.000000e+00,0.000000,0,0,60000.732000,0.0,10.954518,0.000000,0,0,0,0.000000,0.000000,0.000000,46,0,1,2,1,1,1,1,1,1,0
