
### 7.8.1 时序数据的导入和处理

In [1]:
# Load the training and test sets from CSV files into DataFrames.
import numpy as np
import pandas as pd
df_train = pd.read_csv('./dataset/exoTrain.csv')
df_test = pd.read_csv('./dataset/exoTest.csv')
print(df_train.head()) # show the first few rows
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_train.info() # summary of the training frame

   LABEL   FLUX.1   FLUX.2   FLUX.3   FLUX.4   FLUX.5   FLUX.6  FLUX.7  \
0      2    93.85    83.81    20.10   -26.98   -39.56  -124.71 -135.18   
1      2   -38.88   -33.83   -58.54   -40.09   -79.31   -72.81  -86.55   
2      2   532.64   535.92   513.73   496.92   456.45   466.00  464.50   
3      2   326.52   347.39   302.35   298.13   317.74   312.70  322.33   
4      2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34   

    FLUX.8  FLUX.9  ...  FLUX.3188  FLUX.3189  FLUX.3190  FLUX.3191  \
0   -96.27  -79.89  ...     -78.07    -102.15    -102.15      25.13   
1   -85.33  -83.97  ...      -3.28     -32.21     -32.21     -24.89   
2   486.39  436.56  ...     -71.69      13.31      13.31     -29.89   
3   311.31  312.42  ...       5.71      -3.73      -3.73      30.05   
4 -1022.71 -989.57  ...    -594.37    -401.66    -401.66    -357.24   

   FLUX.3192  FLUX.3193  FLUX.3194  FLUX.3195  FLUX.3196  FLUX.3197  
0      48.57      92.54      39.32      61.42       5.08  

In [2]:
# The dataset files are pre-sorted, so the rows are shuffled before use.
# A fixed seed keeps the resulting row order (and therefore the validation
# split that model.fit carves out of the training data) reproducible after a
# kernel restart.
from sklearn.utils import shuffle # shuffling helper
SHUFFLE_SEED = 42
df_train = shuffle(df_train, random_state=SHUFFLE_SEED)
df_test = shuffle(df_test, random_state=SHUFFLE_SEED)

In [3]:
# The first column holds the label; every column after it is a feature.
# Subtracting 1 from the labels moves them to the conventional (0, 1) encoding.
X_train = df_train.iloc[:, 1:].to_numpy() # feature matrix (train)
X_test = df_test.iloc[:, 1:].to_numpy() # feature matrix (test)
y_train = df_train.iloc[:, 0].to_numpy() - 1 # label vector (train)
y_test = df_test.iloc[:, 0].to_numpy() - 1 # label vector (test)
print(X_train) # show the training features
print(y_train) # show the training labels

[[   9.17  -29.55  -49.19 ...   22.86   55.46   58.34]
 [  -5.63  -11.97   -7.33 ...    1.64    2.09    4.24]
 [ -91.24  -88.34  -89.36 ...  -12.44   -5.5    -7.75]
 ...
 [-112.91 -108.05 -149.54 ...   52.41   23.74   56.08]
 [  -3.27   -8.6    -9.61 ...    2.81   -3.27    5.53]
 [ -53.26  -27.11  -25.67 ...   17.07   -7.19   11.14]]
[0 0 0 ... 0 0 0]


In [4]:
# Insert a new length-1 axis at position 2 so the feature arrays gain one
# extra dimension, which is what the sequence model's input expects.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
print(X_train.shape)

(5087, 3197, 1)


In [None]:
from keras.models import Sequential # sequential model
from keras import layers # all layer types
from keras.optimizers import Adam # optimizer
# Conv1D -> MaxPooling1D -> GRU -> Flatten -> Dropout -> BatchNorm -> sigmoid.
model = Sequential() # layers are applied in the order they are added
model.add(layers.Conv1D(32, kernel_size = 10, strides = 4,
                    input_shape = (3197, 1))) # 1D convolution layer
model.add(layers.MaxPooling1D(pool_size = 4, strides = 2)) # pooling layer
model.add(layers.GRU(256, return_sequences=True)) # key point: the GRU layer must be large enough
model.add(layers.Flatten()) # flatten
model.add(layers.Dropout(0.5)) # dropout
model.add(layers.BatchNormalization()) # batch normalization
model.add(layers.Dense(1, activation='sigmoid')) # single-unit classification output
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
# NOTE(review): `decay` is a legacy argument too — newer optimizers expect a
# learning-rate schedule instead; confirm against the installed Keras version.
opt = Adam(learning_rate = 0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(optimizer=opt, # optimizer
              loss = 'binary_crossentropy', # binary cross-entropy loss
              metrics = ['accuracy']) # track accuracy

In [None]:
# Train the model; `history` keeps the object returned by fit().
history = model.fit(
    x=X_train, # training features
    y=y_train, # training labels
    batch_size=128, # batch size
    epochs=4, # number of training epochs
    validation_split=0.2, # hold out part of the training data for validation
    shuffle=True, # shuffle the training data
)

### 7.8.3 输出阈值的调整

In [None]:
from sklearn.metrics import classification_report # classification report
from sklearn.metrics import confusion_matrix # confusion matrix
y_prob = model.predict(X_test) # predicted probabilities for the test set
y_pred =  np.where(y_prob > 0.5, 1, 0) # threshold the probabilities into 0/1 labels
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round transposes the confusion matrix and swaps precision with
# recall in the report.
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', cm, '\n')
print(classification_report(y_test, y_pred))

阈值调整

In [None]:
# Lower the decision threshold from 0.5 to 0.15 so that more samples are
# labelled as class 1.
y_pred =  np.where(y_prob > 0.15, 1, 0) # apply the adjusted threshold
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round transposes the confusion matrix and swaps precision with
# recall in the report.
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', cm, '\n')
print(classification_report(y_test, y_pred))

### 7.8.4 使用函数式API

In [None]:
from keras import layers # layer types
from keras.models import Model # functional-API model class
from keras.optimizers import Adam # Adam optimizer
# Named `inputs` rather than `input` so the built-in input() is not shadowed
# for the rest of the notebook session.
inputs = layers.Input(shape=(3197, 1)) # model input
# Build the network with the functional API: each layer is called on the
# tensor produced by the previous one.
x = layers.Conv1D(32, kernel_size=10, strides=4)(inputs)
x = layers.MaxPooling1D(pool_size=4, strides=2)(x)
x = layers.GRU(256, return_sequences=True)(x)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
x = layers.BatchNormalization()(x)
output = layers.Dense(1, activation='sigmoid')(x) # model output
model = Model(inputs, output)
model.summary() # print the layer-by-layer summary
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
# NOTE(review): `decay` is a legacy argument too — newer optimizers expect a
# learning-rate schedule instead; confirm against the installed Keras version.
opt = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01) # configure the optimizer
model.compile(optimizer=opt, # optimizer
              loss = 'binary_crossentropy', # binary cross-entropy loss
              metrics=['accuracy']) # track accuracy

双向RNN模型:

In [None]:
# 首先在给输入数据集升维之前数据集进行逆序：
# Prepare the inputs for the two-branch model: the original sequences plus a
# copy of each sequence reversed along the time axis (axis 1).
# The model inputs are 3-D (samples, 3197, 1). X_train / X_test may already
# have been expanded to 3-D by an earlier cell, so the trailing axis is only
# added when it is still missing; expanding unconditionally would produce 4-D
# arrays that do not match Input(shape=(3197, 1)), and would add yet another
# axis every time this cell is re-run.
if X_train.ndim == 2:
    X_train = np.expand_dims(X_train, axis=2)
if X_test.ndim == 2:
    X_test = np.expand_dims(X_test, axis=2)
# .copy() materialises the reversed view as an ordinary contiguous array.
X_train_rev = X_train[:, ::-1].copy()
X_test_rev = X_test[:, ::-1].copy()

In [None]:
# Build the multi-head network: two GRU branches whose flattened outputs are
# concatenated and passed to a single sigmoid unit.
# Forward branch
input_1 = layers.Input(shape=(3197, 1))
x = layers.GRU(32, return_sequences=True)(input_1)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
# Backward branch (receives the reversed sequences at fit time)
input_2 = layers.Input(shape=(3197, 1))
y = layers.GRU(32, return_sequences=True)(input_2)
y = layers.Flatten()(y)
y = layers.Dropout(0.5)(y)
# Join the two branches
z = layers.concatenate([x, y])
output = layers.Dense(1, activation='sigmoid')(z)
model = Model([input_1,input_2], output)
model.summary()
# `model` now refers to a brand-new, uncompiled Model, and calling fit() on an
# uncompiled model raises an error, so compile it here. The optimizer values,
# loss and metric are the ones this notebook uses for its other models.
# NOTE(review): `decay` is a legacy optimizer argument — confirm it is accepted
# by the installed Keras version.
opt = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(optimizer=opt, # optimizer
              loss='binary_crossentropy', # binary cross-entropy loss
              metrics=['accuracy']) # track accuracy

In [None]:
# Train the two-input model: the first input gets the original sequences and
# the second input gets the reversed ones.
history = model.fit(
    x=[X_train, X_train_rev], # one array per model input
    y=y_train, # training labels
    batch_size=128, # batch size
    epochs=1, # number of training epochs
    validation_split=0.2, # hold out part of the training data for validation
    shuffle=True, # shuffle the training data
)