In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from lightgbm import plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

In [3]:
from metrics import binary_evaluate

In [4]:
# Path of the CSV analysed by this notebook, relative to the working directory.
# NOTE(review): presumably the preprocessed 10% KDD Cup 99 subset, judging by the file name — confirm.
read_file = './data/kdd99/kddcup_10p_preprocessing.csv'
# Disabled: alternative NSL-KDD test file (not loaded).
# test_file = './data/nslkdd/KDDTest_binary.csv'
# Load the whole file into `df`; the cells below read and modify this frame.
df = pd.read_csv(read_file)
# Disabled: loading of the separate test frame.
# test_df = pd.read_csv(test_file)

In [6]:
# Partition the columns of `df` by dtype:
#   float64 / int64 -> numerical_features
#   object          -> categorical_features
# Columns of any other dtype end up in neither list.
column_dtypes = df.dtypes
numerical_features = [
    name for name, dtype in column_dtypes.items()
    if dtype == np.float64 or dtype == np.int64
]
categorical_features = [
    name for name, dtype in column_dtypes.items()
    if dtype == object
]

In [7]:
def discretization(x):
    """Build an integer encoder from the distinct values of a Series.

    Each distinct value of ``x`` is assigned a code ``0, 1, 2, ...`` in order
    of first appearance.

    Parameters
    ----------
    x : pandas.Series
        Series whose distinct values define the vocabulary.

    Returns
    -------
    callable
        Function taking a single value and returning its integer code, or
        ``-1`` when the value was not present in ``x``.
    """
    # value -> code, numbered in order of first appearance in `x`
    codes = {value: code for code, value in enumerate(x.unique().tolist())}

    def encode(y):
        # Values outside the vocabulary fall back to the sentinel -1.
        return codes.get(y, -1)

    return encode

In [8]:
# Replace every object-dtype column of `df` (including 'label' when it is one)
# with integer codes assigned in order of first appearance.
for i in categorical_features:
    df[i] = df[i].apply(discretization(df[i]))
    # test_df[i] = test_df[i].apply(discretization(test_df[i]))

# 'label' is the prediction target, so it must not be handed to LightGBM as a
# categorical *feature*. The removal is guarded so that the cell can be re-run,
# and so that it also works when 'label' is not object-dtype and was therefore
# never in the list (an unconditional list.remove raises ValueError then).
if 'label' in categorical_features:
    categorical_features.remove('label')

In [59]:
# train_df = shuffle(train_df)
# x_train = train_df.copy()
# y_train = x_train.pop('label')

In [60]:
# test_df = shuffle(test_df)
# x_test = test_df.copy()
# y_test = x_test.pop('label')

In [132]:
def preprocessing(file_path):
    """Load a CSV and return ``(features, target)`` prepared for modelling.

    Steps, in order:

    1. Read ``file_path`` with ``pd.read_csv`` and split off the ``'label'``
       column as the target.
    2. Classify the remaining columns by dtype (float64/int64 vs. object).
    3. Replace every object-dtype column, and the target, with integer codes
       assigned in order of first appearance.
    4. Rescale the columns that were float64/int64 in step 2 with a freshly
       fitted ``MinMaxScaler``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain a ``'label'`` column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The transformed feature frame and the integer-encoded target.
    """
    def encode(series):
        # value -> code, numbered in order of first appearance; -1 for unknowns
        codes = {value: code for code, value in enumerate(series.unique().tolist())}
        return series.apply(lambda value: codes.get(value, -1))

    frame = pd.read_csv(file_path)
    labels = frame.pop('label')

    # Classify columns *before* encoding so that the freshly encoded
    # (now integer) categorical columns are not min-max scaled below.
    dtypes = frame.dtypes
    numeric_cols = [
        name for name, dtype in dtypes.items()
        if dtype == np.float64 or dtype == np.int64
    ]
    object_cols = [name for name, dtype in dtypes.items() if dtype == object]

    for col in object_cols:
        frame[col] = encode(frame[col])
    labels = encode(labels)

    scaler = MinMaxScaler()
    frame[numeric_cols] = scaler.fit_transform(frame[numeric_cols])

    return frame, labels

In [9]:
# Baseline model: one LightGBM fit on an 80/20 split, then rank the features
# by the importances of that single model.

# NOTE: `features` is an alias of `df`, not a copy, so the pop below also
# removes 'label' from `df`. Later cells that use `df.columns` as the feature
# index rely on this. Re-running this cell raises KeyError because 'label' is
# already gone.
features = df
target = features.pop('label')

# Fixed seed so the split, and therefore the reported metrics and ranking,
# can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

clf = LGBMClassifier(subsample_freq=1, subsample=1, colsample_bytree=1)
# Passing eval_set makes LightGBM log the validation metrics per boosting round.
clf.fit(x_train, y_train, categorical_feature=categorical_features, eval_set=(x_test, y_test), eval_metric=['auc'])

# Index by the columns the model was actually trained on.
importance = pd.Series(clf.feature_importances_, index=features.columns)
importance = importance.sort_values(ascending=False)
importance

New categorical_feature is ['flag', 'protocol_type', 'service']
[1]	valid_0's auc: 0.999635	valid_0's binary_logloss: 0.40294
[2]	valid_0's auc: 0.999663	valid_0's binary_logloss: 0.340625
[3]	valid_0's auc: 0.999665	valid_0's binary_logloss: 0.293588
[4]	valid_0's auc: 0.999665	valid_0's binary_logloss: 0.255946
[5]	valid_0's auc: 0.999665	valid_0's binary_logloss: 0.224835
[6]	valid_0's auc: 0.999667	valid_0's binary_logloss: 0.198584
[7]	valid_0's auc: 0.999667	valid_0's binary_logloss: 0.176149
[8]	valid_0's auc: 0.999686	valid_0's binary_logloss: 0.156793
[9]	valid_0's auc: 0.999851	valid_0's binary_logloss: 0.139872
[10]	valid_0's auc: 0.999852	valid_0's binary_logloss: 0.125052
[11]	valid_0's auc: 0.999873	valid_0's binary_logloss: 0.112032
[12]	valid_0's auc: 0.999886	valid_0's binary_logloss: 0.100511
[13]	valid_0's auc: 0.999886	valid_0's binary_logloss: 0.0903113
[14]	valid_0's auc: 0.999892	valid_0's binary_logloss: 0.0812541
[15]	valid_0's auc: 0.999921	valid_0's binary_lo

src_bytes                      642
dst_bytes                      222
protocol_type                  167
count                          166
dst_host_srv_count             165
dst_host_count                 159
duration                       155
dst_host_diff_srv_rate         126
dst_host_srv_diff_host_rate    113
service                        110
dst_host_same_srv_rate         108
hot                            103
dst_host_same_src_port_rate    101
logged_in                       93
dst_host_serror_rate            90
dst_host_srv_serror_rate        59
dst_host_rerror_rate            53
srv_count                       40
num_root                        39
wrong_fragment                  35
same_srv_rate                   35
num_file_creations              31
dst_host_srv_rerror_rate        29
root_shell                      26
flag                            26
srv_diff_host_rate              19
num_compromised                 16
num_access_files                14
serror_rate         

In [10]:
# Feature-importance study over the bagging fraction.
#
# For every `subsample` value, fit N_TRIALS LightGBM classifiers, each on a
# different random split of (features, target), and record the per-feature
# importances of every fit. This single loop replaces ten cells that were
# identical except for the `subsample` argument.

N_TRIALS = 100             # fits per subsample setting
HOLDOUT_FRACTION = 0.0001  # share of rows held out by each split

# (percent used in the variable / file names, value passed as `subsample`).
# 100 % comes last so that the trailing `clf` / split variables end up being
# those of the subsample=1 run.
SUBSAMPLE_SETTINGS = (
    (10, 0.1),
    (20, 0.2),
    (30, 0.3),
    (40, 0.4),
    (50, 0.5),
    (60, 0.6),
    (70, 0.7),
    (80, 0.8),
    (90, 0.9),
    (100, 1),
)

# percent -> {trial index -> array of feature importances}
importance_by_percent = {}
for percent, subsample in SUBSAMPLE_SETTINGS:
    trial_importances = dict()
    for i in range(N_TRIALS):
        # random_state=i: a different split for every trial, but the same
        # sequence of splits on every re-run of the notebook.
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=HOLDOUT_FRACTION, random_state=i)

        clf = LGBMClassifier(subsample_freq=1, subsample=subsample, colsample_bytree=1)
        clf.fit(x_train, y_train, categorical_feature=categorical_features)
        trial_importances[i] = clf.feature_importances_
    importance_by_percent[percent] = trial_importances

# Keep the original per-setting names; the export cells below read them.
importance_10 = importance_by_percent[10]
importance_20 = importance_by_percent[20]
importance_30 = importance_by_percent[30]
importance_40 = importance_by_percent[40]
importance_50 = importance_by_percent[50]
importance_60 = importance_by_percent[60]
importance_70 = importance_by_percent[70]
importance_80 = importance_by_percent[80]
importance_90 = importance_by_percent[90]
importance_100 = importance_by_percent[100]



In [20]:
# For every subsample setting: sum each feature's importance over all trials,
# sort the totals in descending order and write them to their own CSV file.
total_exports = (
    ('importance_10.csv', importance_10),
    ('importance_20.csv', importance_20),
    ('importance_30.csv', importance_30),
    ('importance_40.csv', importance_40),
    ('importance_50.csv', importance_50),
    ('importance_60.csv', importance_60),
    ('importance_70.csv', importance_70),
    ('importance_80.csv', importance_80),
    ('importance_90.csv', importance_90),
    ('importance_100.csv', importance_100),
)
for totals_csv, trial_importances in total_exports:
    # rows: columns of `df`; columns: trial index
    per_trial = pd.DataFrame(trial_importances, index=df.columns)
    totals = per_trial.sum(axis=1)
    totals.sort_values(ascending=False).to_csv(totals_csv)

In [21]:
# Persist the unaggregated importances for every subsample setting:
# one row per column of `df`, one CSV column per trial.
raw_exports = (
    ('importance_raw_10.csv', importance_10),
    ('importance_raw_20.csv', importance_20),
    ('importance_raw_30.csv', importance_30),
    ('importance_raw_40.csv', importance_40),
    ('importance_raw_50.csv', importance_50),
    ('importance_raw_60.csv', importance_60),
    ('importance_raw_70.csv', importance_70),
    ('importance_raw_80.csv', importance_80),
    ('importance_raw_90.csv', importance_90),
    ('importance_raw_100.csv', importance_100),
)
for raw_csv, trial_importances in raw_exports:
    pd.DataFrame(trial_importances, index=df.columns).to_csv(raw_csv)