In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This only works when the notebook runs on Google Colab (google.colab is not
# available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Switch the working directory to the Drive root so that the relative CSV path
# read later in the notebook resolves inside the mounted drive.
os.chdir("/content/drive/MyDrive")
# IPython shell escape: list the directory as a quick visual check.
!ls

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Data import configuration.
# Row index used further down as the boundary between training windows
# (end_index) and validation windows (start_index).
TRAIN_SPLIT = 30000

# Load the network-traffic data.
# Goal: learn normal and abnormal behaviour from network traffic and apply a neural
# network to intrusion detection in order to reduce the high false-alarm rate.
# Dataset: CICIDS2017, released by the Canadian Institute for Cybersecurity in 2017.
# pandas is used to preprocess the data set: cleaning and standardisation.
import datetime

# Timestamp taken before loading. NOTE(review): not read again in the visible cells.
start_time = datetime.datetime.now()
# Relative path, resolved against the current working directory.
CSV_FILE_PATH = 'binary_classification.csv'
df = pd.read_csv(CSV_FILE_PATH)
# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
# Normalise dtypes so that every column ends up as float64.
# 'Label' is an object (string) column: replace each distinct value by its integer
# category code first (missing values become -1).
df['Label'] = pd.Categorical(df['Label']).codes
# Cast the whole frame in one vectorised call and rebind the result.
# The previous per-column `df.iloc[:, i] = df.iloc[:, i].astype(float)` writes into the
# existing column; since pandas 2.0 that assignment keeps the old dtype whenever the
# values fit, so integer columns silently stayed integer.
df = df.astype('float64')
columns_counts = df.shape[1]  # number of columns

In [None]:
# Select the 11 input features and the target column.
features_considered = [
    'Bwd_Packet_Length_Min',
    'Subflow_Fwd_Bytes',
    'Total_Length_of_Fwd_Packets',
    'Fwd_Packet_Length_Mean',
    'Bwd_Packet_Length_Std',
    'Flow_Duration',
    'Flow_IAT_Std',
    'Init_Win_bytes_forward',
    'Bwd_Packets/s',
    'PSH_Flag_Count',
    'Average_Packet_Size',
]
# Take an explicit copy: a later cell adds a column to `features`, and writing into a
# selection of `df` is what triggers pandas' SettingWithCopyWarning.
features = df[features_considered].copy()
# NOTE(review): the column encoded earlier is 'Label', but the target is read from
# 'Target' — confirm that the CSV really contains both columns.
data_result = df['Target']

In [None]:
# Cluster the samples into 5 groups with k-means.
from sklearn.cluster import KMeans

# Fixed seed: k-means starts from random centroids, so without it the cluster ids
# (which become a model input later) change from run to run.
kmeans = KMeans(n_clusters=5, random_state=0)
# Ignore an 'lb' column left over from a previous run of the next cell, so that
# re-running this cell never clusters on earlier cluster ids.
cluster_input = features.drop(columns=['lb'], errors='ignore')
kmeans.fit(cluster_input)                # fit the centroids
label = kmeans.predict(cluster_input)    # cluster id of every row

In [None]:
# Append the cluster id as an extra feature column named 'lb'.
import warnings  # kept in scope in case later cells reference it
# assign() builds a new frame instead of writing into a selection of `df`, so there is
# no SettingWithCopyWarning to silence; the former blanket
# warnings.filterwarnings("ignore") also hid every later warning in the notebook.
features = features.assign(lb=label)
# Guarded append keeps the cell idempotent: re-running it must not add 'lb' twice,
# otherwise the name list no longer matches the number of feature columns.
if "lb" not in features_considered:
  features_considered.append("lb")

In [None]:
# Standardise the features (zero mean, unit variance) and prepend the target column.
feature_values = features.to_numpy(dtype='float64')
# Fit the statistics on the training rows only, so nothing about the validation rows
# leaks into the scaling applied to the training data.
feature_mean = feature_values[:TRAIN_SPLIT].mean(axis=0)
feature_std = feature_values[:TRAIN_SPLIT].std(axis=0)
# A constant column has std 0; dividing by it would fill the column with NaN/inf and
# poison the loss. Scale such columns by 1 so they simply become 0.
feature_std = np.where(feature_std == 0, 1.0, feature_std)
standardized = (feature_values - feature_mean) / feature_std
# Final layout: column 0 = Target, remaining columns = standardised features.
# Stacking raw arrays avoids any dependence on the pandas index alignment.
dataset = np.column_stack([data_result.to_numpy(dtype='float64'), standardized])

In [None]:
# Build sliding time windows: sample past observations with a given stride.
# history_size is the length of the look-back window, target_size is how far ahead
# the model has to predict.
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
  """Cut `dataset` into look-back windows and collect the matching labels.

  For every position ``i`` in ``[start_index + history_size, end_index)`` one
  window is produced from rows ``i - history_size`` up to (but excluding) ``i``,
  keeping every ``step``-th row.

  Args:
    dataset: array indexed by row; the windows are taken from it.
    target: sequence the labels are read from.
    start_index: first row that may appear in a window.
    end_index: exclusive upper bound for ``i``; ``None`` means
      ``len(dataset) - target_size``.
    history_size: number of past rows spanned by each window.
    target_size: label offset (single step) or label length (multi step).
    step: stride between the rows kept inside a window.
    single_step: if true the label is the single value ``target[i + target_size]``,
      otherwise the slice ``target[i:i + target_size]``.

  Returns:
    Tuple ``(windows, labels)`` of NumPy arrays.
  """
  first = start_index + history_size
  # Without an explicit end, stop early enough that the last label still exists.
  last = len(dataset) - target_size if end_index is None else end_index
  positions = range(first, last)

  windows = [dataset[range(pos - history_size, pos, step)] for pos in positions]

  if single_step:
    # Predict one single point in the future.
    targets = [target[pos + target_size] for pos in positions]
  else:
    targets = [target[pos:pos + target_size] for pos in positions]

  return np.array(windows), np.array(targets)

In [None]:
# Windowing hyper-parameters passed to multivariate_data in the next cell.
past_history = 10000  # history_size: number of past rows spanned by each window
future_target = 100  # target_size: the label is read this many rows past the window end
STEP = 6  # stride inside a window: every 6th row is kept
# NOTE(review): each window therefore holds ceil(10000 / 6) = 1667 rows and one window is
# materialised per start position — confirm that the resulting arrays fit in memory.

In [None]:
# Build the single-step training and validation windows.
# dataset[:, 0] is the FIRST column of `dataset` (the Target column); it is used as
# the label series. The windows themselves are cut from all columns of `dataset`.
x_train_single, y_train_single = multivariate_data(
    dataset=dataset,
    target=dataset[:, 0],
    start_index=0,
    end_index=TRAIN_SPLIT,
    history_size=past_history,
    target_size=future_target,
    step=STEP,
    single_step=True,
)
# Validation windows start at TRAIN_SPLIT and run to the end of the data
# (end_index=None lets the helper pick the last usable position).
x_val_single, y_val_single = multivariate_data(
    dataset=dataset,
    target=dataset[:, 0],
    start_index=TRAIN_SPLIT,
    end_index=None,
    history_size=past_history,
    target_size=future_target,
    step=STEP,
    single_step=True,
)

In [None]:
# Training and validation input pipelines for the LSTM.
BATCH_SIZE = 256
BUFFER_SIZE = 10000

# Training pipeline: cache -> shuffle -> batch -> repeat indefinitely.
train_data_single = (
    tf.data.Dataset
    .from_tensor_slices((x_train_single, y_train_single))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
)

# Validation pipeline: batched and repeated, deliberately not shuffled.
val_data_single = (
    tf.data.Dataset
    .from_tensor_slices((x_val_single, y_val_single))
    .batch(BATCH_SIZE)
    .repeat()
)

In [None]:
# Build the model: LSTM(32) -> Dense(32) -> Dense(1, sigmoid).
model = tf.keras.Sequential()
# Each sample has the shape of one window: the last two dimensions of x_train_single.
model.add(layers.LSTM(32, input_shape=x_train_single.shape[-2:]))
# NOTE(review): no activation is given, so this layer is purely linear — confirm
# that this is intended.
model.add(layers.Dense(32))
# Single sigmoid unit: one output in (0, 1) for the binary target.
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Write TensorBoard logs, including weight histograms every epoch, below log_dir.
log_dir = "graph/log_fit/7"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Evaluate on the held-out windows after every epoch. They were built above but never
# passed to fit(), so no validation metrics were reported. Skip validation when the
# data set is too short to yield any validation window.
validation_data = (x_val_single, y_val_single) if len(x_val_single) else None

# Train the model; batch size comes from the shared BATCH_SIZE constant.
model.fit(x_train_single, y_train_single,
          epochs=10,
          batch_size=BATCH_SIZE,
          validation_data=validation_data,
          callbacks=[tensorboard_callback])
