# UNSW-NB15: Feature Engineering

In [None]:
# Loading data from disk
train = pd.read_csv('./train_alldata_EDA.csv')
test = pd.read_csv('./test_alldata_EDA.csv')

In [None]:
# Utility function
def multi_corr(col1, col2="label", df=train):
    '''
    Print the correlation between two features, before and after log1p.

    Two values are reported for the pair (`col1`, `col2`):
      * the correlation of the raw `col1` with `col2`
      * the correlation of log1p(`col1`) with `col2`

    Nothing is returned; both numbers are written to stdout.
    `df` defaults to the `train` frame that existed when this cell was run.
    '''
    # Entry [0, 1] of the 2x2 correlation matrix is the col1-vs-col2 value.
    raw_corr = df[[col1, col2]].corr().iat[0, 1]

    log_values = df[col1].apply(np.log1p)
    log_corr = log_values.corr(df[col2])

    print("Correlation : {}\nlog_Correlation: {}".format(raw_corr, log_corr))

In [None]:
def corr(col1, col2="label", df=train):
    """
    Return the correlation coefficient between `col1` and `col2` of `df`.

    `col2` defaults to the target column "label"; `df` defaults to the
    `train` frame that existed when this cell was run.
    """
    pairwise = df[[col1, col2]].corr()
    # Row 0 / column 1 of the 2x2 matrix holds the col1-vs-col2 entry.
    return pairwise.iat[0, 1]

## Removing highly correlated features

In [None]:
# Selecting all the features with high correlation values with other features
# Refer: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
# Only numeric/bool columns can be correlated. Selecting them explicitly keeps this cell
# working on pandas >= 2.0, where DataFrame.corr() no longer silently skips string columns.
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr().abs()

# Select upper triangle of correlation matrix (k=1 leaves out the diagonal).
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [None]:
# We don't want to use these features for plotting because these are having high corr
# And most likely have same kind of plots with already plotted feature
print(to_drop)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']


In [None]:
saved_dict['corr_col'] = to_drop

In [None]:
# removing the features from train and test data
train.drop(columns=to_drop, inplace=True)

In [None]:
train.shape, test.shape

((1778032, 41), (762015, 49))

In [None]:
# creating new features
train['network_bytes'] = train['sbytes'] + train['dbytes']

In [None]:
train.shape, test.shape

((1778032, 42), (762015, 49))

In [None]:
# Dropping columns which are not useful for the classification
# attack_cat is for multiclass classification
# all the other columns are address related and not present in sample train data
train.drop(['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'], axis=1, inplace=True)

In [None]:
# To use during test data transformation
saved_dict['to_drop'] = ['srcip', 'sport', 'dstip', 'dsport', 'attack_cat']

In [None]:
train.shape, test.shape

((1778032, 37), (762015, 49))

## Applying log1p on Numerical columns

During EDA we found that a few numerical columns give much clearer PDF curves when log1p is applied to them.

So I decided to try log1p on all the continuous columns and compare the correlation of the original column and of its log1p version with the target column, i.e. "label".

In [None]:
# Getting number of unique values of all the columns
# If the unique values are high that means it has continuous set of values
col_unique_values = train.nunique()

In [None]:
# If the number of unique values is greater than some threshold then we will check its corr
col = col_unique_values[col_unique_values>200].index

In [None]:
# Checking corr value of original col and log1p applied col
# Taking those columns whose number of unique values is greater than some threshold
for column in col:
    print("{:-^30}".format(column))
    multi_corr(column)

-------------dur--------------
Correlation : 0.001927402870127572
log_Correlation: -0.032544137564606314
------------sbytes------------
Correlation : 0.010344749695328405
log_Correlation: -0.356163155589846
------------dbytes------------
Correlation : -0.07641408324455988
log_Correlation: -0.5193868283741504
------------sload-------------
Correlation : 0.19211948100191475
log_Correlation: 0.34746601450349446
------------dload-------------
Correlation : -0.21978094390335498
log_Correlation: -0.6033545881626384
------------spkts-------------
Correlation : -0.12200425437272483
log_Correlation: -0.31635338269675845
------------stcpb-------------
Correlation : -0.23365153315631276
log_Correlation: -0.3135563222142899
------------dtcpb-------------
Correlation : -0.2334607177357848
log_Correlation: -0.31340064798120926
-----------smeansz------------
Correlation : -0.0651799037902329
log_Correlation: -0.15111450989648403
-----------dmeansz------------
Correlation : -0.2723060560764758
log_Cor

In [None]:
# Will apply log1p on these columns and remove the original columns
log1p_col = ['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit', 'network_bytes']

In [None]:
saved_dict['log1p_col'] = log1p_col

In [None]:
# mode values of every features, will use to fill Null values of test
mode_dict = train.mode().iloc[0].to_dict()

In [None]:
def log1p_transform(col, df=train):
    '''
    Replace column `col` of `df` with its log1p version, in place.

    A new column named "<col>_log1p" is added to `df` and the original
    column is removed. Nothing is returned.
    '''
    df['{}_log1p'.format(col)] = df[col].apply(np.log1p)
    # The raw column is no longer needed once the transformed one exists.
    del df[col]

In [None]:
# Transforming columns with log1p
for col in log1p_col:
    log1p_transform(col, df=train)

In [None]:
train.shape

(1778032, 37)

In [None]:
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'label', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
train.shape, test.shape

((1778032, 37), (762015, 49))

In [None]:
# creating x and y set from the dataset
x_train, y_train = train.drop(columns=['label']), train['label']
x_test, y_test = test.drop(columns=['label']), test['label']

In [None]:
print(x_train.shape, y_train.shape)
print()
print(x_test.shape, y_test.shape)

(1778032, 36) (1778032,)

(762015, 48) (762015,)


In [None]:
# Saving all the files to disk to use later
# At this point x_train / x_test are not yet standardized or one-hot encoded.
# NOTE(review): this writes to '.final_ipynb/' (leading dot) while the rest of the
# notebook uses 'final_ipynb/' - confirm which directory is intended.
# `with` makes sure every file handle is flushed and closed after the dump.
with open('.final_ipynb/final_train.pkl', 'wb') as fh:
    pickle.dump((x_train, y_train), fh)
with open('.final_ipynb/final_test.pkl', 'wb') as fh:
    pickle.dump((x_test, y_test), fh)

In [None]:
# getting categorical and numerical columns in 2 diff lists
cat_col = ['proto', 'service', 'state']
# Keep the numerical columns in the frame's own column order.
# list(set(...)) would give an order that can change between kernel restarts
# (string hashing is randomized per process), and this list is saved and later
# used to select the columns handed to the scaler, so its order must be reproducible.
num_col = [c for c in x_train.columns if c not in cat_col]

In [None]:
# To use later, during test data cleaning
saved_dict['cat_col'] = cat_col
saved_dict['num_col'] = num_col

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_dst_ltm,ct_src_ltm,ct_dst_sport_ltm,dur_log1p,sbytes_log1p,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,254,0,,0,0,0,1421930643,33.479,0.0,0.0,0.0,0.0,0,2,0.0,0,0,2,1,1,1,0.095707,5.187386,0.0,9.2756,0.0,1.609438,0.0,0.0,3.828641,0.0,3.878042,0.0,5.187386
1,udp,INT,60,0,dns,0,0,0,1424246229,0.008,0.0,0.0,0.0,0.0,0,0,0.0,0,0,20,16,16,8,8e-06,5.57973,0.0,18.698312,0.0,1.098612,0.0,0.0,4.890349,0.0,0.0,0.0,5.57973
2,tcp,FIN,31,29,,255,0,0,1421948071,0.372205,0.348561,0.000681,0.000547,0.000134,0,0,0.0,0,0,18,3,4,1,0.014733,7.8411,10.048583,14.105347,16.314201,3.713572,20.196135,21.733479,4.174387,6.313548,0.0,3.01207,10.152883
3,tcp,FIN,31,29,ftp,255,0,0,1421971944,16.14474,15.226283,0.000706,0.000553,0.000153,0,0,0.0,1,1,1,5,2,1,0.591974,7.984463,8.227108,10.258074,10.501435,3.970292,21.803017,20.49442,4.043051,4.248495,7.264606,3.984562,8.806124
4,tcp,FIN,31,29,,255,0,0,1421963050,1.2188,1.065941,0.000592,0.00047,0.000122,0,0,0.0,0,0,8,10,10,1,0.018427,7.340187,7.405496,13.339317,13.412088,2.833213,20.673269,21.855078,4.574711,4.521789,4.309533,1.138118,8.066208


## Standardizing
    
As we have seen, the range of a few features in this dataset is very large. So we will bring all numerical features onto a common scale by applying StandardScaler. After this, every numerical feature of the train data will have mean 0 and std 1.

In [None]:
# Standardizing the data
scaler = StandardScaler()
scaler = scaler.fit(x_train[num_col])

In [None]:
x_train[num_col] = scaler.transform(x_train[num_col])

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_dst_ltm,ct_src_ltm,ct_dst_sport_ltm,dur_log1p,sbytes_log1p,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,2.561444,-0.71776,,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,-0.123848,-0.04053,2.541751,-0.198264,-0.13202,-0.111699,-0.665012,-0.666629,-0.719387,-0.419757,-0.251062,-1.052109,-1.703404,-1.383776,-1.879995,-0.704801,-1.190007,-1.189697,-1.137341,-1.850553,0.20427,-0.899657,-1.229918
1,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,-0.123848,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,0.996846,1.172325,1.109976,0.714379,-0.447582,-0.813189,-1.703404,1.614925,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092
2,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,-0.11811,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,0.812195,-0.421435,-0.353515,-0.419757,-0.417344,0.563887,0.911063,0.153252,0.988351,0.843701,0.741629,0.888925,-0.582651,0.881301,-0.995343,0.170283,0.894089
3,tcp,FIN,-0.426051,-0.041365,ftp,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,-0.117296,-0.04053,-0.382468,-0.198264,7.574602,5.349858,-0.757337,-0.176241,-0.59743,-0.419757,0.76803,0.651189,0.437147,-1.071111,-0.033644,1.03263,0.895318,0.77042,-0.793357,-0.012242,1.251851,0.51573,0.318009
4,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,-0.118624,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,-0.111059,0.436744,0.378231,-0.419757,-0.409758,0.258853,0.223377,-0.090531,0.478104,0.195816,0.787264,0.900555,0.059601,0.106012,0.337745,-0.495378,0.001507


## Onehot Encoding

In our dataset we have a few categorical columns with text data.
But ML models can't process text data; they can only process numbers.

So we have to convert the categorical columns to numerical columns in some way.
We will use OneHotEncoder, which creates one column per category and assigns 1 to the column matching the row's value and 0 to all the other columns.

In [None]:
# Onehot Encoding
# One independent encoder per categorical column, each fitted on the train values.
service_, proto_, state_ = OneHotEncoder(), OneHotEncoder(), OneHotEncoder()

# Each column is passed as a single-column 2-D array, hence reshape(-1, 1).
ohe_service = service_.fit(x_train['service'].values.reshape(-1, 1))
ohe_proto = proto_.fit(x_train['proto'].values.reshape(-1, 1))
ohe_state = state_.fit(x_train['state'].values.reshape(-1, 1))

In [None]:
# We are onehot encoding the given column
# Remove the original categorical column
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    encoded = ohe.transform(x_train[col].values.reshape(-1, 1))
    # Reuse x_train's index so concat(axis=1) lines the rows up one-to-one
    # even if x_train does not carry a default 0..n-1 index.
    # toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
    tmp_df = pd.DataFrame(
        encoded.toarray(),
        columns=[col + '_' + i for i in ohe.categories_[0]],
        index=x_train.index,
    )
    x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [None]:
x_train.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_dst_ltm,ct_src_ltm,ct_dst_sport_ltm,dur_log1p,sbytes_log1p,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p,proto_3pc,proto_a/n,proto_aes-sp3-d,proto_any,proto_argus,proto_aris,proto_arp,...,proto_vines,proto_visa,proto_vmtp,proto_vrrp,proto_wb-expak,proto_wb-mon,proto_wsn,proto_xnet,proto_xns-idp,proto_xtp,proto_zero,service_None,service_dhcp,service_dns,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,state_ACC,state_CLO,state_CON,state_ECO,state_ECR,state_FIN,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,2.561444,-0.71776,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,-0.123848,-0.04053,2.541751,-0.198264,-0.13202,-0.111699,-0.665012,-0.666629,-0.719387,-0.419757,-0.251062,-1.052109,-1.703404,-1.383776,-1.879995,-0.704801,-1.190007,-1.189697,-1.137341,-1.850553,0.20427,-0.899657,-1.229918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,-0.123848,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,0.996846,1.172325,1.109976,0.714379,-0.447582,-0.813189,-1.703404,1.614925,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,-0.11811,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,0.812195,-0.421435,-0.353515,-0.419757,-0.417344,0.563887,0.911063,0.153252,0.988351,0.843701,0.741629,0.888925,-0.582651,0.881301,-0.995343,0.170283,0.894089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,-0.117296,-0.04053,-0.382468,-0.198264,7.574602,5.349858,-0.757337,-0.176241,-0.59743,-0.419757,0.76803,0.651189,0.437147,-1.071111,-0.033644,1.03263,0.895318,0.77042,-0.793357,-0.012242,1.251851,0.51573,0.318009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,-0.118624,-0.04053,-0.382468,-0.198264,-0.13202,-0.111699,-0.111059,0.436744,0.378231,-0.419757,-0.409758,0.258853,0.223377,-0.090531,0.478104,0.195816,0.787264,0.900555,0.059601,0.106012,0.337745,-0.495378,0.001507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Saving all the important parameters and objects to disk so that we can apply same process on test data

In [None]:
file_path = 'final_ipynb/'

In [None]:
# `with` closes (and flushes) each file as soon as its object has been written.
with open(file_path+'scaler.pkl', 'wb') as fh:  # Standard scaler
    pickle.dump(scaler, fh)
with open(file_path+'saved_dict.pkl', 'wb') as fh:  # Dictionary with important parameters
    pickle.dump(saved_dict, fh)
with open(file_path+'mode_dict.pkl', 'wb') as fh:  #  Dictionary with most frequent values of columns
    pickle.dump(mode_dict, fh)

In [None]:
# Onehot encoder for categorical columns
# `with` closes (and flushes) each file as soon as its encoder has been written.
with open(file_path+'ohe_proto.pkl', 'wb') as fh:
    pickle.dump(ohe_proto, fh)
with open(file_path+'ohe_service.pkl', 'wb') as fh:
    pickle.dump(ohe_service, fh)
with open(file_path+'ohe_state.pkl', 'wb') as fh:
    pickle.dump(ohe_state, fh)

In [None]:
# Cleaned and processed train data
# `with` closes (and flushes) the file once the dump is complete.
with open(file_path+'final_train.pkl', 'wb') as fh:
    pickle.dump((x_train, y_train), fh)

## Pipeline functions

In [None]:
def clean_data(data):
    '''
    Cleans given raw data. Fills Null and wrong values and fixes column datatypes.

    For every column of `data`:
      * Null values and single-space strings are replaced by the column's
        mode taken from `mode_dict` (computed on train data).
      * The placeholder "-" is replaced by the string "None".
      * For columns listed in saved_dict['binary_col'], values greater than 1
        are replaced by the mode.
    Afterwards every column that was non-numeric on entry and is not listed in
    saved_dict['cat_col'] is cast to float.

    The frame is modified in place and also returned.
    Raises KeyError if a column of `data` has no entry in `mode_dict`.
    '''
    # Non-numeric columns as they arrive; used below to find badly typed columns.
    categorical_col = data.select_dtypes(exclude=np.number).columns

    # Cleaning the data
    for col in data.columns:
        val = mode_dict[col]  # Mode value of the column in train data
        data[col] = data[col].fillna(value=val)
        data[col] = data[col].replace(' ', value=val)
        data[col] = data[col].apply(lambda x: "None" if x == "-" else x)

        # Fixing binary columns
        if col in saved_dict['binary_col']:
            data[col] = np.where(data[col] > 1, val, data[col])

    # Fixing datatype of columns
    bad_dtypes = list(set(categorical_col) - set(saved_dict['cat_col']))
    for bad_col in bad_dtypes:
        # Cast the badly typed column itself (the loop variable `bad_col`),
        # not the leftover `col` from the cleaning loop above.
        data[bad_col] = data[bad_col].astype(float)

    return data

In [None]:
def apply_log1p(data):
    '''
    Feature engineering step: log1p-transform the configured columns.

    For each column named in saved_dict['log1p_col'] a "<col>_log1p" column is
    added and the original column is removed. `data` is changed in place and
    returned.
    '''
    for source_col in saved_dict['log1p_col']:
        data['{}_log1p'.format(source_col)] = data[source_col].apply(np.log1p)
        del data[source_col]  # only the transformed version is kept
    return data

In [None]:
def standardize(data):
    '''
    Standardize the numerical columns of the given data.
    Performs mean centering and variance scaling using the StandardScaler
    object trained on train data.

    The columns are passed to the scaler in the order it was fitted on
    (`scaler.feature_names_in_`, which scikit-learn sets when it is fitted on a
    DataFrame). If the scaler does not carry that attribute, the saved list
    saved_dict['num_col'] is used as before. The scaler works by position, so a
    column order different from the one used at fit time would scale every
    column with another column's mean and variance.

    `data` is modified in place and returned.
    '''
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        cols = list(fitted_names)
    else:
        cols = list(saved_dict['num_col'])

    data[cols] = scaler.transform(data[cols])
    return data

In [None]:
def ohencoding(data):
    '''
    Onehot encoding the categorical columns.
    Adds the ohe columns to the data and removes the categorical columns.
    Uses the Onehotencoder objects trained on train data.

    The encoded columns are appended in the order proto, service, state and are
    named "<column>_<category>". A new DataFrame is returned; the rows keep the
    index of `data`, so the input does not need a default 0..n-1 index.
    '''
    cat_encoders = (('proto', ohe_proto), ('service', ohe_service), ('state', ohe_state))

    # Start from the data without the categorical columns
    pieces = [data.drop(['proto', 'service', 'state'], axis=1)]

    for name, encoder in cat_encoders:
        encoded = encoder.transform(data[name].values.reshape(-1, 1))
        pieces.append(
            pd.DataFrame(
                encoded.toarray(),
                columns=[name + '_' + i for i in encoder.categories_[0]],
                # Same index as the input, otherwise concat(axis=1) would align a fresh
                # 0..n-1 index against data's index and produce NaN-padded rows.
                index=data.index,
            )
        )

    return pd.concat(pieces, axis=1)

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
# `with` closes each file as soon as its object has been read.
# Parameters
with open(file_path+'saved_dict.pkl', 'rb') as fh:
    saved_dict = pickle.load(fh)
# Mode value of all the columns
with open(file_path+'mode_dict.pkl', 'rb') as fh:
    mode_dict = pickle.load(fh)
# Standardscaler object
with open(file_path+'scaler.pkl', 'rb') as fh:
    scaler = pickle.load(fh)

In [None]:
# One hot encoder objects
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
# `with` closes each file as soon as its encoder has been read.
with open(file_path+'ohe_proto.pkl', 'rb') as fh:
    ohe_proto = pickle.load(fh)
with open(file_path+'ohe_service.pkl', 'rb') as fh:
    ohe_service = pickle.load(fh)
with open(file_path+'ohe_state.pkl', 'rb') as fh:
    ohe_state = pickle.load(fh)

In [None]:
x_test.shape

(762015, 48)

In [None]:
# Resetting index of test data
x_test.reset_index(drop=True, inplace=True)

In [None]:
x_test.shape

(762015, 48)

In [None]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat'],
      dtype='object')

In [None]:
# Adding column names
x_test.columns = saved_dict['columns']

In [None]:
# Creating new Feature
x_test['network_bytes'] = x_test['dbytes'] + x_test['sbytes']

In [None]:
# Droping all the unwanted columns
dropable_col = saved_dict['to_drop'] + saved_dict['corr_col']
x_test.drop(columns=dropable_col, inplace=True)

In [None]:
x_test.shape

(762015, 36)

In [None]:
# Cleaning data using clean_data()
x_test = clean_data(x_test)

In [None]:
x_test.shape

(762015, 36)

In [None]:
# FE: applying log1p using apply_log1p()
x_test = apply_log1p(x_test)

In [None]:
x_test.shape

(762015, 36)

In [None]:
x_test.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'dur_log1p', 'sbytes_log1p', 'dbytes_log1p',
       'sload_log1p', 'dload_log1p', 'spkts_log1p', 'stcpb_log1p',
       'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p', 'sjit_log1p',
       'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
# Standardscaling using stanardize()
x_test = standardize(x_test)

In [None]:
x_test.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_dst_ltm,ct_src_ltm,ct_dst_sport_ltm,dur_log1p,sbytes_log1p,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,1.775189,-0.964938,dns,-0.789226,-0.581776,-0.382468,35141420000.0,-1.878061,-7.270119,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,-0.417373,-0.089072,128.349675,39.174197,-0.44547,-0.067822,-0.596769,292.602505,36.097275,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486
1,udp,CON,1.775189,-0.964938,dns,-0.789226,-0.581776,-0.382468,35140990000.0,-1.879116,-7.265306,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,1.790426,-0.088988,128.349675,39.174197,-0.445529,-0.067822,-0.596769,293.229567,36.173933,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486
2,udp,INT,4.548792,-1.196045,dns,-0.789226,-0.581776,-0.382468,35197610000.0,-1.878413,-7.279745,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,20.924684,-0.08859,1070.488842,982.44836,-0.44758,-0.06761,-0.71776,410.641491,-0.225343,1.778657,-0.841345,-0.055099,-0.722275,-0.849663,-4.211001,-3.448839,1.082362
3,tcp,FIN,1.775189,-0.964938,,30.473001,-0.581776,-0.382468,35141150000.0,-1.830755,-6.84837,-1.703222,-4.335486,-0.995296,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,3.262292,-0.088988,256.823197,39.174197,-0.437109,-0.06664,-0.535677,344.101262,40.593628,5.10017,1.821291,-0.039918,-0.674575,-0.395423,-2.574204,-3.321848,2.212211
4,udp,CON,1.775189,-0.964938,dns,-0.789226,-0.581776,-0.382468,35197860000.0,-1.878588,-7.270119,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,0.31856,-0.089093,85.525167,39.174197,-0.445271,-0.067822,-0.596769,290.625628,35.855604,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486


In [None]:
# Onehot encoding categorical columns using ohencoding()
x_test = ohencoding(x_test)

In [None]:
x_test.shape

(762015, 197)

In [None]:
# Final test data
x_test.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_dst_ltm,ct_src_ltm,ct_dst_sport_ltm,dur_log1p,sbytes_log1p,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p,proto_3pc,proto_a/n,proto_aes-sp3-d,proto_any,proto_argus,proto_aris,proto_arp,...,proto_vines,proto_visa,proto_vmtp,proto_vrrp,proto_wb-expak,proto_wb-mon,proto_wsn,proto_xnet,proto_xns-idp,proto_xtp,proto_zero,service_None,service_dhcp,service_dns,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,state_ACC,state_CLO,state_CON,state_ECO,state_ECR,state_FIN,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,1.775189,-0.964938,-0.789226,-0.581776,-0.382468,35141420000.0,-1.878061,-7.270119,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,-0.417373,-0.089072,128.349675,39.174197,-0.44547,-0.067822,-0.596769,292.602505,36.097275,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.775189,-0.964938,-0.789226,-0.581776,-0.382468,35140990000.0,-1.879116,-7.265306,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,1.790426,-0.088988,128.349675,39.174197,-0.445529,-0.067822,-0.596769,293.229567,36.173933,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.548792,-1.196045,-0.789226,-0.581776,-0.382468,35197610000.0,-1.878413,-7.279745,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,20.924684,-0.08859,1070.488842,982.44836,-0.44758,-0.06761,-0.71776,410.641491,-0.225343,1.778657,-0.841345,-0.055099,-0.722275,-0.849663,-4.211001,-3.448839,1.082362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.775189,-0.964938,30.473001,-0.581776,-0.382468,35141150000.0,-1.830755,-6.84837,-1.703222,-4.335486,-0.995296,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,3.262292,-0.088988,256.823197,39.174197,-0.437109,-0.06664,-0.535677,344.101262,40.593628,5.10017,1.821291,-0.039918,-0.674575,-0.395423,-2.574204,-3.321848,2.212211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.775189,-0.964938,-0.789226,-0.581776,-0.382468,35197860000.0,-1.878588,-7.270119,-1.703404,-4.33566,-0.995343,-0.111699,-0.13202,-1.850553,-0.841353,-1254.632022,0.31856,-0.089093,85.525167,39.174197,-0.445271,-0.067822,-0.596769,290.625628,35.855604,1.778657,-0.841345,-0.055099,-0.77835,-0.434216,-4.211001,-3.448839,1.15486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Matching test data columns with train data columns
# Index.equals checks for the same labels in the same order and simply returns False when the
# two frames have a different number of columns (an elementwise == raises in that case).
x_train.columns.equals(x_test.columns)

True