### 堆栈式自编码器
#### 1 优点
1. 以分类为目的的微调，惯用做法是丢掉堆栈式自编码网络的解码层，直接将隐藏层的输出作为分类器的输入即可
2. 经过预训练再微调得到的模型，效果远远好于纯粹用监督学习得到的结果

#### 2 步骤
- step1:训练分类器找到分类问题较优的网络结构
- step2:以分类问题的网络结构训练堆栈式自编码器的底层
- step3:由step2获取的底层自编码器微调各个目标领域的分类器

#### 3 相关数据
- yhj_fine_tune_data = pd.read_hdf('yhj_fine_tune_data.hdf',key='fine_tune')  
- yhj_encoding_data = pd.read_hdf('yhj_encoding_data.hdf',key='encoding')

In [None]:
import time
import pandas as pd
import tensorflow as tf
import tensorlayer as tl
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# #数据读取
# reader = pd.read_csv('./yhj_index_20181112.csv',encoding='gbk',index_col= False,chunksize=100000)

# #获取微调的目标数据（正负样本比例1:1），并存储
# target_y =  [' inv_desire', ' current_fin', ' load_desire', ' fund_desire', ' bank_desire', ' risk_desire', ' card_desire']

# def get_df(file):
#     data_splits = pd.DataFrame()
#     for chunk in  file:
#         for y in target_y:
# #             cols = [col for col in chunk.columns if col not in list(set(target_y)-set(y))]
# #             chunk = chunk.loc[:,cols]
#             label = chunk.loc[chunk[y]==1,:]
#             y_num = len(label)
#             unlabel = chunk.loc[chunk[y]==0,:].sample(n=y_num,axis=0,random_state=123)
# #             print(len(label),len(unlabel))
#             if (len(label)!=0):
#                 a = pd.concat([label,unlabel],axis=0).sample(frac=1) #数据打散,取数时需要做数据的标准化
#                 a['type'] = y
#                 data_splits = data_splits.append(a)
#             else:
#                 continue
#     return data_splits

# data = get_df(reader)
# data = data.drop(['dt'],axis=0)
# data.to_hdf('yhj_fine_tune_data.hdf',key='fine_tune')

# #堆栈式自编码器数据获取
# data = pd.DataFrame()
# i = 0
# for chunk in reader:
#     target_y =  [' inv_desire', ' current_fin', ' load_desire', ' fund_desire', ' bank_desire', ' risk_desire', ' card_desire']
#     cols = [col for col in chunk.columns if col not in target_y]
#     chunk = chunk.loc[:,cols]
#     if (i <=20):
#         data = data.append(chunk)
#     else:
#         pass
#     i = i + 1
    
# # data.to_hdf('yhj_encoding_data.hdf',key='encoding') 

#### step2:自编码器网络设计

In [None]:
# Load the unlabeled feature table used to pre-train the stacked autoencoder
# and drop the ' dt' column (its name starts with a space).
yhj_encoding_data = pd.read_hdf('yhj_encoding_data.hdf', key='encoding').drop([' dt'], axis=1)

# Replace missing values with 0, then standardize with preprocessing.scale.
X_data = preprocessing.scale(yhj_encoding_data.fillna(0))

# The first 1,500,000 rows are the pre-training set, the remainder is the
# validation set (the original author recorded len(X_data) == 2100000).
n_pretrain_rows = 1500000
X_train, X_val = X_data[:n_pretrain_rows], X_data[n_pretrain_rows:]

In [None]:
# Stacked autoencoder: shared settings for building the network.
model = 'relu'  # prefix of the dense-layer names (model + '1', model + '2')
# General training settings. The pretrain() calls further down pass their own
# literal n_epoch / batch_size / print_freq values rather than these.
n_epoch = 200
batch_size = 128
learning_rate = 0.0001
print_freq = 10
# No session is opened here. The pre-training cell opens its own
# tf.InteractiveSession() immediately before initializing the variables, so a
# session created at this point would never be used and never be closed.
act = tf.nn.relu  # activation of the encoder (dense) layers
act_recon = tf.nn.softplus  # activation of the reconstruction layers

In [None]:
# Define the model: two stacked denoising autoencoders followed by a
# single-unit output layer.
# x: 72 input features per row; y_: one target value per row.
x = tf.placeholder(tf.float32, shape=[None, 72], name='x')
y_ = tf.placeholder(tf.float32, shape=[None, 1], name='y_')

network = tl.layers.InputLayer(x, name='input_layer')
# Denoising layer (a dropout layer on the raw input; it is the layer passed as
# denoise_name to the pretrain() calls).
network = tl.layers.DropoutLayer(network, keep=0.5, name='denoising1')
# First denoising autoencoder: 1st layer (72 -> 50)
network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
network = tl.layers.DenseLayer(network, n_units=50, act=act, name=model + '1')
# Keep the 50-unit activations: they are the reconstruction target of layer 2.
x_recon1 = network.outputs
# NOTE(review): act_recon is softplus, whose output is always positive, while
# the pre-training input is standardized and so presumably contains negative
# values -- confirm the reconstruction can actually reach its target.
recon_layer1 = tl.layers.ReconLayer(network, x_recon=x, n_units=72, act=act_recon, name='recon_layer1')
# Second denoising autoencoder: 2nd layer (50 -> 30)
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop2')
network = tl.layers.DenseLayer(network, n_units=30, act=act, name=model + '2')
# The second autoencoder takes the first one's output as its input, so its
# reconstruction target is that previous-layer output (x_recon1), not x.
recon_layer2 = tl.layers.ReconLayer(network, x_recon=x_recon1, n_units=50, act=act_recon, name='recon_layer2')
# Classifier: 3rd layer (single linear output unit)
# network = tl.layers.DropoutLayer(network, keep=0.5, name='drop3')
network = tl.layers.DenseLayer(network,n_units=1, act=tf.identity, name='output')


In [None]:
## Greedy layer-wise pre-training
# Open the session used for pre-training and initialize every variable.
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)
# Pre-train
print("\nAll Network Params before pre-train")
network.print_params()
print("\nPre-train Layer 1")
# Original author's note: during pre-training only the 'denoising1' layer is
# switched on (it is passed as denoise_name below). The remark about the
# dropout layers inside each autoencoder is cut short in the original;
# presumably they are disabled during pre-training -- confirm against
# ReconLayer.pretrain.
# pretrain() is the TensorLayer helper for greedy layer-wise pre-training.
recon_layer1.pretrain(
    sess, x=x, X_train=X_train, X_val=X_val, denoise_name='denoising1', n_epoch=100, batch_size=320, print_freq=10, save=False, save_name='w1pre_')
# Only announces layer 2; the recon_layer2.pretrain call is in the next cell.
print("\nPre-train Layer 2")


In [None]:
# Pre-train the second autoencoder (60 epochs, batch size 320) on the same
# train/validation arrays, again with 'denoising1' as the denoising layer.
recon_layer2.pretrain(sess, x=x, X_train=X_train, X_val=X_val, denoise_name='denoising1', n_epoch=60, batch_size=320, print_freq=10, save=False)
# The header below is printed, but the parameter dump itself is commented out.
print("\nAll Network Params after pre-train")
# network.print_params()

In [None]:
# Save the pre-trained model in ckpt format; save_path receives the value
# returned by saver.save.
saver = tf.train.Saver()
save_path = saver.save(sess,'./stack_encoder.ckpt')
# sess.close()

In [None]:
# Rebuild the encoder part of the autoencoder (no reconstruction layers and no
# output layer) so the saved checkpoint can be loaded into it.
# Layer names and sizes repeat those used when the checkpoint was written:
# 'relu1' / 'relu2' are model + '1' / model + '2' with model == 'relu'.
# NOTE(review): this re-creates placeholders and layers under names already
# used by the pre-training cells; presumably it is meant to run in a fresh
# kernel/graph -- confirm.
act = tf.nn.relu
x = tf.placeholder(tf.float32, shape=[None, 72], name='x')
y_ = tf.placeholder(tf.float32, shape=[None, 1], name='y_')
network = tl.layers.InputLayer(x, name='input_layer')
network = tl.layers.DropoutLayer(network, keep=0.5, name='denoising1')
network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
network = tl.layers.DenseLayer(network, n_units=50, act=act, name= 'relu1')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop2')
network = tl.layers.DenseLayer(network, n_units=30, act=act, name= 'relu2')

In [None]:
# Reload the network weights.
# The Saver is created with no var_list, then a plain tf.Session is opened and
# the values stored in './stack_encoder.ckpt' are restored into it.
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, save_path = './stack_encoder.ckpt')

In [None]:
# Fine-tuning
# Define the loss function and the evaluation metric.
# tl.cost.cross_entropy uses tf.nn.sparse_softmax_cross_entropy_with_logits()
# internally to implement softmax (kept for reference; not used below).
#cost = tl.cost.cross_entropy(y, y_, name = 'cost')

# Define the classification layers stacked on top of the encoder.

##### Build NetworkStructure.loss
# Two linear layers (30 -> 6 -> 1) are appended to the 30-unit encoder output.
network = tl.layers.DenseLayer(network,n_units=6, act=tf.identity, name='class_1')
network = tl.layers.DenseLayer(network,n_units=1, act=tf.identity, name='output')
y = network.outputs
# The active loss is mean squared error between the linear output y and the
# target y_; the binary cross-entropy alternative is commented out.
loss = tl.cost.mean_squared_error( y,y_)
# loss = tl.cost.binary_cross_entropy(y,y_,name='entropy')
##### Build NetworkStructure.acc (disabled: no accuracy / AUC op is built)
# correct_prediction = tf.equal(tf.arg_max(y,1),y_)
# acc = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
#auc = compute_auc(y,y_,500)

# Define the optimizer.
# train_params is collected but not passed to minimize(); the step size is the
# literal 0.005, not a learning_rate variable.
train_params = network.all_params
train_op = tf.train.GradientDescentOptimizer(0.005).minimize(loss)

In [None]:
#读数
yhj_fine_tune_data = pd.read_hdf('yhj_fine_tune_data.hdf',key='fine_tune')
yhj_fine_tune_data = yhj_fine_tune_data.drop([' dt'],axis=1)
yhj_fine_tune_data = yhj_fine_tune_data.fillna(0)
# yhj_fine_tune_data = yhj_fine_tune_data[:100000]

In [None]:
def get_data(data,y):
    """Build scaled train / validation / test arrays for one target column.

    Rows whose 'type' column equals ``y`` are selected; the other target
    columns and the 'type' column are removed and missing values are set to 0.
    The rows are cut 90/10, in stored order, into a training part and a
    held-out test part, and the training part is split again 90/10 into train
    and validation sets with ``train_test_split`` (random_state=42).

    Features are standardized with a single ``StandardScaler`` fitted on the
    90% training part and reused for the test part, so the test features are
    on the same scale as the data the model is trained on.

    Args:
        data: DataFrame containing the feature columns, the target columns
            listed in ``target_y`` and a 'type' column.
        y: name of the target column to model, spelled exactly as in
            ``target_y`` (the names start with a space).

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val, x_test, y_test)``. The feature
        entries are the scaled arrays; the label entries are the ``.values`` of
        single-column frames.
    """
    target_y =  [' inv_desire', ' current_fin', ' load_desire', ' fund_desire', ' bank_desire', ' risk_desire', ' card_desire']
    # Every target column except the requested one is excluded from the frame.
    other_targets = set(target_y) - {y}
    cols = [col for col in data.columns if col not in other_targets]
    target_data = data.loc[data['type'] == y, cols]
    target_data = target_data.drop(['type'], axis=1)
    target_data = target_data.fillna(0)

    # 90% of the rows (in stored order) for training, the last 10% for testing.
    split_at = round(len(target_data) * 0.9)
    train = target_data[:split_at]
    test = target_data[split_at:]
    new_cols = [col for col in target_data if col not in [y]]

    # Fit the scaling statistics on the training part only ...
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(train.loc[:, new_cols])
    y_train = train[[y]]
    X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

    # ... and apply those same statistics to the test part, instead of
    # standardizing the test rows with their own mean and variance.
    x_test = scaler.transform(test.loc[:, new_cols])
    y_test = test[[y]]

    return X_train, X_val, y_train.values, y_val.values, x_test, y_test.values


In [None]:
# Build the train / validation / test arrays for the ' inv_desire' target and
# print the three set sizes.
target_name = ' inv_desire'
X_train, X_val, y_train, y_val, x_test, y_test = get_data(yhj_fine_tune_data, target_name)
n_train, n_val, n_test = len(X_train), len(X_val), len(x_test)
print(n_train, n_val, n_test)
# Output recorded by the original author: 44487 4943 5492

In [None]:
# Define fine-tune process
n_epoch = 1000
batch_size = 320
learning_rate = 0.0001  # not consumed by train_op, which was built with a literal 0.005
print_freq = 100

# Initialize ONLY the variables that do not hold a value yet (the layers added
# for fine-tuning). Initializing all global variables at this point would also
# re-initialize the encoder weights that saver.restore() loaded from the
# checkpoint, which would discard the pre-training.
uninitialized_names = {name.decode() for name in sess.run(tf.report_uninitialized_variables())}
new_variables = [var for var in tf.global_variables() if var.op.name in uninitialized_names]
sess.run(tf.variables_initializer(new_variables))

In [None]:
# Mean losses recorded at every logged epoch of the fine-tuning loop.
train_loss_list, val_loss_list = [], []

In [None]:
def _mean_loss(features, targets):
    """Return the mean of `loss` over mini-batches with all dropout disabled.

    Batches are drawn with tl.iterate.minibatches(..., shuffle=True) using the
    module-level batch_size. The result is the sum of the per-batch losses
    divided by the number of batches.
    """
    # Keep-probabilities of every dropout layer set to 1 (noise layers off);
    # the dict does not depend on the batch, so it is built once.
    dp_dict = tl.utils.dict_to_one(network.all_drop)
    total, n_batch = 0, 0
    for X_a, y_a in tl.iterate.minibatches(features, targets, batch_size, shuffle=True):
        feed_dict = {x: X_a, y_: y_a}
        feed_dict.update(dp_dict)
        total += sess.run([loss], feed_dict=feed_dict)[0]
        n_batch += 1
    return total / n_batch


for epoch in range(n_epoch):
    start_time = time.time()
    for X_train_a, y_train_a in tl.iterate.minibatches(X_train, y_train, batch_size, shuffle=True):
        feed_dict = {x: X_train_a, y_: y_train_a}
        # During fine-tuning the dropout layers inside the autoencoders stay on ...
        feed_dict.update(network.all_drop)  # enable noise layers
        # ... while 'denoising1' is only used for pre-training and is switched off here.
        feed_dict[tl.layers.LayersConfig.set_keep['denoising1']] = 1  # disable denoising layer
        sess.run(train_op, feed_dict=feed_dict)
    # At epoch 1 and at every print_freq-th epoch, measure the loss on the
    # training set and on the validation set and record both.
    if epoch + 1 == 1 or (epoch + 1) % print_freq == 0:
        print("Epoch %d of %d took %fs" % (epoch + 1, n_epoch, time.time() - start_time))
        train_loss = _mean_loss(X_train, y_train)
        print("   train loss: %f" % train_loss)
        train_loss_list.append(train_loss)
        val_loss = _mean_loss(X_val, y_val)
        print("   val loss: %f" % val_loss)
        val_loss_list.append(val_loss)

# Final loss on the held-out test set (only the loss is computed; no accuracy
# op exists in this graph).
print('Evaluation')
test_loss = _mean_loss(x_test, y_test)
print("   test loss: %f" % test_loss)

# Predictions on the test set, used by the AUC / ROC cells.
y_pred = tl.utils.predict(sess, network, x_test, x, y)

# Add ops to save and restore all the variables.
# ref: https://www.tensorflow.org/versions/r0.8/how_tos/variables/index.html
saver = tf.train.Saver()
# you may want to save the model
# NOTE(review): the checkpoint is named "bank_desire" although the data cell
# selects the ' inv_desire' target -- confirm the intended file name.
save_path = saver.save(sess, "./stack_encoder_bank_desire.ckpt")
print("Model saved in file: %s" % save_path)
# sess.close()


In [None]:
# Plot the training / validation loss recorded during fine-tuning.
import matplotlib.pyplot as plt

# Epochs at which the fine-tuning loop records a loss: epoch 1 and every
# multiple of print_freq (same condition as in the loop). The list has its own
# name so the TensorFlow placeholder `x` is not overwritten by the plot.
logged_epochs = [e for e in range(1, n_epoch + 1) if e == 1 or e % print_freq == 0]
# print(train_loss_list,val_loss_list)
assert len(logged_epochs) == len(train_loss_list) and len(logged_epochs) == len(val_loss_list), 'not in the same length'
plt.plot(logged_epochs, train_loss_list, 'r', label='train')
plt.plot(logged_epochs, train_loss_list, 'ro')
plt.plot(logged_epochs, val_loss_list, 'b', label='validate')
plt.plot(logged_epochs, val_loss_list, 'bo')
# The plotted series are losses, so the title and axis label say so.
plt.title('change of loss during training and validation')
plt.xlabel('number of epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
def predictive_Accu(df, dep, score):
    """Compute the ROC curve and its AUC for the scores stored in a DataFrame.

    Args:
        df: DataFrame holding the true labels and the predicted scores.
        dep: name of the column with the true labels.
        score: name of the column with the predicted scores.

    Returns:
        Tuple ``(curve, area)``: ``curve`` is a DataFrame with the columns
        'fpr_dev' and 'tpr_dev' as returned by ``roc_curve``; ``area`` is the
        value of ``auc`` computed from those two arrays.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(df[dep], df[score])
    area = auc(false_pos_rate, true_pos_rate)

    curve = pd.DataFrame(
        {"fpr_dev": false_pos_rate, "tpr_dev": true_pos_rate},
        columns=["fpr_dev", "tpr_dev"],
    )
    return curve, area

In [None]:
# AUC of the model on the test data.

# Collect predictions and true labels side by side in one frame.
dev_pred = pd.DataFrame()

# y_pred is flattened to a plain list of scores, one per test row.
dev_pred['score'] = y_pred.ravel().tolist()
# y_test is assigned as-is as the label column.
dev_pred['dep'] = y_test

# dev_roc: frame with the ROC points; dev_auc: area under that curve.
dev_roc, dev_auc = predictive_Accu(dev_pred, "dep", "score")

In [None]:
import matplotlib.pyplot as plt

# Draw the test-set ROC curve together with the diagonal reference line.
line_width = 2
fig, ax = plt.subplots()
ax.plot(
    dev_roc['fpr_dev'],
    dev_roc['tpr_dev'],
    color='darkorange',
    lw=line_width,
    label='ROC curve(Dev)(area = %0.3f)' % dev_auc,
)
ax.plot([0, 1], [0, 1], color='black', lw=line_width, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic curve')
ax.legend(loc="lower right")
plt.show()