In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from lightgbm import plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

In [None]:
from metrics import multi_evaluate

In [None]:
# Path of the input CSV, relative to the notebook's working directory.
# NOTE(review): the file name suggests a preprocessed 10% KDD Cup 99 subset
# with five classes -- confirm against the preprocessing step.
train_file = './data/kdd99/kddcup_10p_preprocessing_five.csv'

# Load the whole table; later cells expect it to contain a 'label' column.
df = pd.read_csv(filepath_or_buffer=train_file)

In [None]:
# Split the column names by storage dtype: float64/int64 columns are treated
# as numerical, object columns as categorical.
numerical_features = [
    name for name, kind in zip(df.columns, df.dtypes)
    if kind == np.float64 or kind == np.int64
]
categorical_features = [
    name for name, kind in zip(df.columns, df.dtypes)
    if kind == object
]

In [None]:
def discretization(x):
    """Build an integer encoder for the distinct values of ``x``.

    The distinct values are numbered from 0 in the order in which
    ``x.unique()`` returns them.

    Parameters
    ----------
    x : object exposing ``unique()`` whose result supports ``tolist()``
        (this notebook passes DataFrame columns).

    Returns
    -------
    callable
        A one-argument function that returns the integer code assigned to
        a value, or -1 when the value was not among the distinct values
        of ``x``.
    """
    codes = {value: position for position, value in enumerate(x.unique().tolist())}

    def mapfunction(y):
        # Values that were never seen in ``x`` fall back to -1.
        return codes.get(y, -1)

    return mapfunction

In [None]:
# Replace every object-typed column (this includes 'label' at this point)
# with the integer codes produced by discretization().
for column in categorical_features:
    encoder = discretization(df[column])
    df[column] = df[column].apply(encoder)

# Separate the target from the feature matrix and stop treating it as a
# categorical input feature. Note that this mutates df and
# categorical_features in place, so the cell cannot be re-run without
# reloading the data first.
target = df.pop('label')
categorical_features.remove('label')

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# fresh "Restart & Run All" reproduces the same train/test partition instead
# of drawing a new one on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=42
)

In [None]:
# Baseline multiclass LightGBM model with no row subsampling (subsample=1)
# and no column subsampling (colsample_bytree=1).
clf = LGBMClassifier(
    objective='multiclass',
    colsample_bytree=1,
    subsample=1,
    subsample_freq=1,
)

# Fit on the training split, declaring the encoded categorical columns, and
# pass the held-out split as the evaluation set.
clf.fit(
    x_train,
    y_train,
    eval_set=(x_test, y_test),
    categorical_feature=categorical_features,
)

In [None]:
# Feature importances from 100 repeated fits with subsample=0.1, keyed by
# run index.
importance_10 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.1, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_10[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.2, keyed by
# run index.
importance_20 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.2, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_20[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.3, keyed by
# run index.
importance_30 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.3, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_30[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.4, keyed by
# run index.
importance_40 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.4, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_40[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.5, keyed by
# run index.
importance_50 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.5, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_50[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.6, keyed by
# run index.
importance_60 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.6, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_60[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.7, keyed by
# run index.
importance_70 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.7, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_70[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.8, keyed by
# run index.
importance_80 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.8, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_80[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=0.9, keyed by
# run index.
importance_90 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=0.9, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_90[i] = clf.feature_importances_

In [None]:
# Feature importances from 100 repeated fits with subsample=1 (no row
# subsampling), keyed by run index.
importance_100 = dict()
for i in range(100):
    # Seed each split with the run index: the 100 runs still use different
    # partitions, but re-executing the cell reproduces exactly the same ones.
    # The tiny test part (test_size=0.0001) is not used inside this loop.
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.0001, random_state=i
    )

    clf = LGBMClassifier(subsample_freq=1, subsample=1, colsample_bytree=1, objective='multiclass')
    clf.fit(x_train, y_train, categorical_feature=categorical_features)
    importance_100[i] = clf.feature_importances_

In [None]:
# For every subsample setting: build a features-by-runs table, add up each
# feature's importance over all runs, rank the features from most to least
# important, and write the ranking to its own CSV file.
summed_outputs = [
    ('importance_10.csv', importance_10),
    ('importance_20.csv', importance_20),
    ('importance_30.csv', importance_30),
    ('importance_40.csv', importance_40),
    ('importance_50.csv', importance_50),
    ('importance_60.csv', importance_60),
    ('importance_70.csv', importance_70),
    ('importance_80.csv', importance_80),
    ('importance_90.csv', importance_90),
    ('importance_100.csv', importance_100),
]
for csv_name, runs in summed_outputs:
    per_run = pd.DataFrame(runs, index=df.columns)
    ranked = per_run.sum(axis=1).sort_values(ascending=False)
    ranked.to_csv(csv_name)

In [None]:
# For every subsample setting: write the unaggregated importances (one row
# per feature, one column per run) to its own CSV file.
raw_outputs = [
    ('importance_raw_10.csv', importance_10),
    ('importance_raw_20.csv', importance_20),
    ('importance_raw_30.csv', importance_30),
    ('importance_raw_40.csv', importance_40),
    ('importance_raw_50.csv', importance_50),
    ('importance_raw_60.csv', importance_60),
    ('importance_raw_70.csv', importance_70),
    ('importance_raw_80.csv', importance_80),
    ('importance_raw_90.csv', importance_90),
    ('importance_raw_100.csv', importance_100),
]
for csv_name, runs in raw_outputs:
    per_run = pd.DataFrame(runs, index=df.columns)
    per_run.to_csv(csv_name)