In [2]:
from keras.layers import Input, Dense, InputLayer
from keras.models import Model, Sequential, load_model
from keras.datasets import mnist
from keras import optimizers, initializers
from keras.utils import to_categorical
from keras import regularizers
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

In [3]:
# for Decision Tree
from sklearn import tree
from sklearn.metrics import accuracy_score

In [4]:
import sys, os
sys.path.append(os.pardir) 
import h5py # for model file

In [16]:
dataset_list = ["australian", "cancer", "german", "BurstHeaderPacket", "statlog", "wine"]
#dataset_list = ["australian"]
#dataset_list = ["wine"]

In [8]:
model_dir = "./model_AE/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)

In [17]:
train_result_each_data = []
test_result_each_data = []
tree_train_each_data = []
tree_result_each_data = []
tree_size_each_data = []

for data in dataset_list:
    print("----- " + str(data) + " -----")
    result_train_each_k = []
    result_test_each_k = []
    result_tree_train = []
    result_tree_test = []
    size_tree = []
    
    df = pd.read_csv("../dataset/" + str(data) + "_m0s1.csv", header=None)
    
    K = 10
    features_length = len(df.columns) - 1
    block_size = math.floor(len(df.index) / K)  # test data size for one validation
    num_class = max(df.iloc[:, 0]) + 1

    for k in range(K):
        print("----- k = " + str(k+1) + " -----")

        # rearrange dataset
        dataset = df
        test_range = range(k * block_size, (k + 1) * block_size) # e.g. iris 0~14, 15~29 ...
        test = dataset[test_range[0]:test_range[-1] + 1]
        dataset = dataset.drop(test_range)
        train = dataset

        x_train = train.iloc[:, 1:]
        y_train = train.iloc[:, 0]
        x_test = test.iloc[:, 1:]
        y_test = test.iloc[:, 0]

        y_train = to_categorical(y_train, num_classes=num_class)
        y_test = to_categorical(y_test, num_classes=num_class)

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)
        
        # 1st encoding
        encoding_dim = 100

        autoencoder1 = Sequential()
        autoencoder1.add(InputLayer(input_shape=(features_length,)))
        autoencoder1.add(Dense(encoding_dim, activation='relu',\
                       activity_regularizer=regularizers.l1(1e-2),\
                       kernel_initializer='he_uniform', name='encoded1'
                       ))
        autoencoder1.add(Dense(features_length, activation='sigmoid', name='decoded'))

        autoencoder1.compile(optimizer='adam', loss='mean_squared_error', metrics=["acc"])
        #autoencoder1.summary()
        autoencoder1.fit(x_train, x_train,\
                        epochs=50,\
                        batch_size=128,\
                        shuffle=True,\
                        #validation_data=(x_test, x_test),\
                        validation_split=0.1,\
                        verbose=0)
        result_train1 = autoencoder1.evaluate(x_train, x_train, batch_size=32,verbose=0)
        result_test1 = autoencoder1.evaluate(x_train, x_train, batch_size=32,verbose=0)
        print("encoder1 train: " + str(result_train1))
        print("encoder1 test: " + str(result_test1))

        model_name1 = str(model_dir) + "/encoder1.h5"
        autoencoder1.save(model_name1)

        model_weights1 = str(model_dir) + "/encoder1_weights.h5"
        autoencoder1.save_weights(model_weights1)


        # extracting hidden output
        encoder1 = Model(input=autoencoder1.input, output=autoencoder1.get_layer('encoded1').output)
        encoded1_train = encoder1.predict(x_train)
        encoded1_test = encoder1.predict(x_test)


        # 2nd encoding
        encoding_dim2 = encoding_dim
        autoencoder2 = Sequential()
        autoencoder2.add(Dense(encoding_dim2, input_dim=encoding_dim, activation='relu',\
                       activity_regularizer=regularizers.l1(1e-2), \
                       kernel_initializer='he_uniform', name='encoded2'
                       ))
        autoencoder2.add(Dense(encoding_dim, activation='sigmoid', name='decoded2'))

        autoencoder2.compile(optimizer='adam', loss='mean_squared_error', metrics=["acc"])
        #autoencoder2.summary()
        autoencoder2.fit(encoded1_train, encoded1_train,\
                       epochs=50,\
                       batch_size=128,\
                       shuffle=True,\
                       #validation_data=(encoded1_test, encoded1_test),\
                       validation_split=0.1,\
                       verbose=0)
        result_train2 = autoencoder2.evaluate(encoded1_train, encoded1_train, batch_size=32,verbose=0)
        result_test2 = autoencoder2.evaluate(encoded1_test, encoded1_test, batch_size=32,verbose=0)
        print("encoder2 train: " + str(result_train2))
        print("encoder2 test: " + str(result_test2))

        model_name2 = str(model_dir) + "/encoder2.h5"
        autoencoder2.save(model_name2)

        model_weights2 = str(model_dir) + "/encoder2_weights.h5"
        autoencoder2.save_weights(model_weights2)


        # extracting hidden output
        encoder2 = Model(input=autoencoder2.input, output=autoencoder2.get_layer('encoded2').output)
        encoded2_train = encoder2.predict(encoded1_train)
        encoded2_test = encoder2.predict(encoded1_test)


        encoding_dim3 = encoding_dim2

        autoencoder3 = Sequential()
        autoencoder3.add(Dense(encoding_dim3, input_dim=encoding_dim2, activation='relu',\
                       activity_regularizer=regularizers.l1(1e-2), \
                       kernel_initializer='he_uniform', name='encoded3'
                       ))
        autoencoder3.add(Dense(encoding_dim2, activation='sigmoid', name='decoded3'))

        autoencoder3.compile(optimizer='adam', loss='mean_squared_error', metrics=["acc"])
        #autoencoder3.summary()
        autoencoder3.fit(encoded2_train, encoded2_train,\
                       epochs=50,\
                       batch_size=128,\
                       shuffle=True,\
                       #validation_data=(encoded2_test, encoded2_test),\
                       validation_split=0.1,\
                       verbose=0)
        result_train3 = autoencoder3.evaluate(encoded2_train, encoded2_train, batch_size=32,verbose=0)
        result_test3 = autoencoder3.evaluate(encoded2_test, encoded2_test, batch_size=32,verbose=0)
        print("encoder3 train: " + str(result_train3))
        print("encoder3 test: " + str(result_test3))

        model_name3 = str(model_dir) + "/encoder3.h5"
        autoencoder3.save(model_name3)

        model_weights3 = str(model_dir) + "/encoder3_weights.h5"
        autoencoder3.save_weights(model_weights3)

        encoder3 = Model(input=autoencoder3.input, output=autoencoder3.get_layer('encoded3').output)
        encoded_data3 = encoder3.predict(encoded2_test)

        #print(encoded_data3)
        #print("encoded3 data mean: ", encoded_data3.mean())

        encoded3_train = encoder3.predict(encoded2_train)
        encoded3_test = encoder3.predict(encoded2_test)


        # Normal network
        autoencoder_main = Sequential()
        autoencoder_main.add(InputLayer(input_shape=(features_length,)))
        autoencoder_main.add(Dense(encoding_dim, activation='relu', name='encoded1'))
        autoencoder_main.add(Dense(encoding_dim2, activation='relu', name='encoded2'))
        autoencoder_main.add(Dense(encoding_dim3, activation='relu', name='encoded3'))

        autoencoder_main.add(Dense(num_class, activation='softmax', name='decoded_main'))
        autoencoder_main.load_weights(model_weights1, by_name=True)
        autoencoder_main.load_weights(model_weights2, by_name=True)
        autoencoder_main.load_weights(model_weights3, by_name=True)

        autoencoder_main.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["acc"])
        autoencoder_main.summary()
        autoencoder_main.fit(x_train, y_train,\
                       epochs=200,\
                       batch_size=128,\
                       shuffle=True,\
                       #validation_data=(x_test, y_test),\
                       validation_split=0.1, \
                       verbose=0)
        result_train_main = autoencoder_main.evaluate(x_train, y_train, batch_size=32,verbose=0)
        result_test_main = autoencoder_main.evaluate(x_test, y_test, batch_size=32,verbose=0)
        print("encoder_main result(train): " + str(result_train_main))
        print("encoder_main result: " + str(result_test_main))
        result_train_each_k.append(np.round(result_train_main[1], 3))
        result_test_each_k.append(np.round(result_test_main[1], 3))


        model3_name = str(model_dir) + "/encoder3.h5"
        autoencoder_main.save(model3_name)

        encoder_main = Model(input=autoencoder_main.input, output=autoencoder_main.get_layer('encoded3').output)
        encoded_main_train = encoder_main.predict(x_train)
        encoded_main_test = encoder_main.predict(x_test)


        # make Decision Tree from hidden layer output
        clf = tree.DecisionTreeClassifier(presort=True)
        clf = clf.fit(encoded_main_train, y_train)
        predict_tree_train = clf.predict(encoded_main_train)
        predict_tree_test = clf.predict(encoded_main_test)
        acc_tree_train = accuracy_score(y_train, predict_tree_train)
        acc_tree_test = accuracy_score(y_test, predict_tree_test)
        print("Decision Tree(train): "+ str(acc_tree_train))
        print("Decision Tree: "+ str(acc_tree_test))
        result_tree_train.append(np.round(acc_tree_train, 3))
        result_tree_test.append(np.round(acc_tree_test, 3))

        node = clf.tree_.feature
        node_num = len(node)
        print(node_num)
        size_tree.append(node_num)

        # make Decision Tree from raw data
        clf_raw = tree.DecisionTreeClassifier(presort=True)
        clf_raw = clf_raw.fit(x_train, y_train)
        predict_raw_train = clf_raw.predict(x_train)
        predict_raw_test = clf_raw.predict(x_test)
        acc_raw_train = accuracy_score(predict_raw_train, y_train)
        acc_raw_test = accuracy_score(predict_raw_test, y_test)
        #print(acc_raw_train)
        #print(acc_raw_test)

        node_raw = clf_raw.tree_.feature
        node_raw_num = len(node_raw)
        print(node_raw_num)
    train_result_each_data.append(np.round(np.mean(result_train_each_k), 3))
    test_result_each_data.append(np.round(np.mean(result_test_each_k), 3))
    tree_train_each_data.append(np.round(np.mean(result_tree_train), 3))
    tree_result_each_data.append(np.round(np.mean(result_tree_test), 3))
    tree_size_each_data.append(np.round(np.mean(size_tree), 3))


----- australian -----
----- k = 1 -----
encoder1 train: [6.874256457683545, 0.4863123994038685]
encoder1 test: [6.874256457683545, 0.4863123994038685]




encoder2 train: [0.22943937821664673, 0.04830917874396135]
encoder2 test: [0.2048462303220362, 0.028985507246376812]




encoder3 train: [0.1931925710176692, 0.001610305958132045]
encoder3 test: [0.19535134106442548, 0.0]




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_130 (InputLayer)       (None, 14)                0         
_________________________________________________________________
encoded1 (Dense)             (None, 100)               1500      
_________________________________________________________________
encoded2 (Dense)             (None, 100)               10100     
_________________________________________________________________
encoded3 (Dense)             (None, 100)               10100     
_________________________________________________________________
decoded_main (Dense)         (None, 2)                 202       
Total params: 21,902
Trainable params: 21,902
Non-trainable params: 0
_________________________________________________________________
encoder_main result(train): [0.14661665635757845, 0.9790660228322285]
encoder_main result: [0.8839833684638454, 0.8985507246376812]




Decision Tree(train): 1.0
Decision Tree: 0.8840579710144928
29
157
----- k = 2 -----
encoder1 train: [6.515817667551087, 0.5088566828177171]
encoder1 test: [6.515817667551087, 0.5088566828177171]
encoder2 train: [0.22987695647802522, 0.014492753623188406]
encoder2 test: [0.20374397264010663, 0.04347826108552408]
encoder3 train: [0.19304172956233248, 0.00322061191626409]
encoder3 test: [0.20742030700911646, 0.0]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_132 (InputLayer)       (None, 14)                0         
_________________________________________________________________
encoded1 (Dense)             (None, 100)               1500      
_________________________________________________________________
encoded2 (Dense)             (None, 100)               10100     
_________________________________________________________________
encoded3 (Dense)             (None, 100)               10

In [18]:
print(train_result_each_data)
print(test_result_each_data)
print(tree_train_each_data)
print(tree_result_each_data)
print(tree_size_each_data)

[0.937, 0.962, 0.63, 0.464, 0.146, 1.0]
[0.802, 0.928, 0.63, 0.466, 0.117, 0.947]
[0.956, 0.964, 0.633, 0.0, 0.002, 1.0]
[0.776, 0.925, 0.629, 0.0, 0.007, 0.894]
[29.2, 4.4, 9.0, 1.0, 7.8, 5.0]


In [51]:
print(np.std(result_train_each_k))
print(np.std(result_test_each_k))
print(np.std(result_tree_train))
print(np.std(result_tree_test))
print(np.std(size_tree))

0.00825832912882504
0.04598869426282942
0.0039000000000000037
0.052289195824759056
5.3814496188294845
