In [1]:
import os
import shutil
import pandas as pd

def mkdir_if_not_exist(dirPath):
    """Create ``dirPath`` as a fresh, empty directory.

    Surrounding whitespace is stripped from the path first. Despite the
    name, a path that already exists is deleted recursively before being
    recreated, so any previous contents are lost.

    Args:
        dirPath: Directory path to (re)create; intermediate directories
            are created as needed.
    """
    target = dirPath.strip()
    already_there = os.path.exists(target)
    if already_there:
        # Start from a clean slate: drop the whole tree, not just its files.
        shutil.rmtree(target)
    os.makedirs(target)
    
# Input folder with the raw per-day CSVs and output folder for the filtered copies.
data_folder_path = '../input/backblaze-2018-data-clean-1'
result_folder_path = './data'

# Columns kept from every raw CSV: five metadata/label columns followed by
# the normalized value of six SMART attributes:
#   1   Raw_Read_Error_Rate      3   Spin_Up_Time
#   5   Reallocated_Sector_Ct    7   Seek_Error_Rate
#   9   Power_On_Hours           187 Reported_Uncorrect
# Deliberately left out: the matching `smart_<id>_raw` columns, and SMART
# 188, 189 (High_Fly_Writes), 193, 194 (Temperature_Celsius),
# 195 (Hardware_ECC_Recovered), 197 (Current_Pending_Sector), 198, 241, 242.
selected_columns = (
    ['time_interval', 'date', 'model', 'capacity_bytes', 'failure']
    + ['smart_%d_normalized' % smart_id for smart_id in (1, 3, 5, 7, 9, 187)]
)

# Read every CSV found in the sub-folders of the input folder, keep only the
# files that belong to the chosen drive model, and write the selected columns
# into a freshly created result folder.
mkdir_if_not_exist(result_folder_path)
for entry in os.listdir(data_folder_path):
    fPath = data_folder_path + '/' + entry
    # Only sub-folders hold data files. Skipping everything else matters:
    # iterating anyway would either hit an undefined file list (first entry)
    # or re-use the previous folder's file names under the wrong path.
    if not os.path.isdir(fPath):
        continue
    for file in os.listdir(fPath):
        filePath = fPath + '/' + file
        all_df = pd.read_csv(filePath)
        if all_df.empty:
            # No rows means there is no first record to read the model from.
            continue
        # The drive model is taken from the first row of the file.
        model_type = all_df['model'].iloc[0]
        if model_type == 'ST4000DM000':  # drive model to keep
            filtered_df = all_df.loc[:, selected_columns]
            clean_csv_file = result_folder_path + '/' + file
            # The default index column is written on purpose: the positional
            # column offsets used when the files are read back (failure at
            # position 5, features from position 6) count it as column 0.
            filtered_df.to_csv(clean_csv_file)

shutil.make_archive("filtered_data", 'zip', result_folder_path)

In [None]:
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import LSTM, Dense

# 将时间序列输出为带标签的监督学习样本
def series_to_supervised(features, target, n_in=1, n_out=1, dropnan=True):
    """Turn a time series into a labelled supervised-learning table.

    Args:
        features: 2-D array-like of per-step feature values (rows are time
            steps). A plain ``list`` is treated as a single variable.
        target: Per-step label values, appended as the ``failure`` column.
        n_in: Number of lagged steps to include, (t-n_in) .. (t-1).
        n_out: Number of steps to include from t onward, (t) .. (t+n_out-1).
        dropnan: Drop the rows that contain NaN (the rows at the edges that
            the shifts leave incomplete).

    Returns:
        DataFrame whose columns are ``var<k>(t-i)`` for the oldest lag
        first, then ``var<k>(t)`` / ``var<k>(t+i)``, then ``failure``.
    """
    if type(features) is list:
        n_vars = 1
    else:
        n_vars = features.shape[1]
    feature_frame = pd.DataFrame(features)
    label_frame = pd.DataFrame(target)
    var_ids = range(1, n_vars + 1)

    pieces = []
    labels = []
    # Past observations: shifting down by `lag` rows aligns step (t-lag)
    # with row t; the oldest lag comes first.
    for lag in reversed(range(1, n_in + 1)):
        pieces.append(feature_frame.shift(lag))
        labels.extend('var%d(t-%d)' % (v, lag) for v in var_ids)
    # Current and future observations: shifting up aligns step (t+step).
    for step in range(n_out):
        pieces.append(feature_frame.shift(-step))
        if step:
            labels.extend('var%d(t+%d)' % (v, step) for v in var_ids)
        else:
            labels.extend('var%d(t)' % v for v in var_ids)
    pieces.append(label_frame)
    labels.append('failure')

    # Side-by-side layout: n_vars * (n_in + n_out) feature columns + 1 label.
    agg = pd.concat(pieces, axis=1)
    agg.columns = labels
    return agg.dropna() if dropnan else agg

# n_vars feature numbers
def prepare_data(filepath, n_in, n_out=1, n_vars=6):
    """Load one filtered drive CSV and build its lagged training table.

    The file layout is positional: column 5 is the ``failure`` label and
    columns 6 onward are the feature columns (column 0 being the index
    column written when the filtered file was saved). Missing values are
    replaced by 0 before conversion to float32.

    Args:
        filepath: Path of the CSV file to read.
        n_in: Number of lagged steps (t-1 .. t-n_in) kept per sample.
        n_out: Passed to ``series_to_supervised``; future steps beyond t
            are generated there but not selected here.
        n_vars: Number of feature variables per time step.

    Returns:
        DataFrame with columns ``Y(t)var1..n_vars`` (features at t), then
        ``X1..X<n_vars>(t-1)`` through ``(t-n_in)``, then ``target``.
    """
    # Use the path that was passed in (not a module-level variable) and
    # parse the file only once for both features and label.
    raw = pd.read_csv(filepath).fillna(0)
    feature_values = raw.iloc[:, 6:].values.astype('float32')  # feature columns only
    target = raw.iloc[:, 5].values.astype('float32')           # failure label
    reframed = series_to_supervised(feature_values, target, n_in, n_out)

    var_ids = range(1, n_vars + 1)

    # Column order: var1(t)..varN(t), then var1(t-1)..varN(t-1), ...,
    # var1(t-n_in)..varN(t-n_in), then the label.
    contain_vars = ['var%d(t)' % j for j in var_ids]
    for lag in range(1, n_in + 1):
        contain_vars += ['var%d(t-%d)' % (j, lag) for j in var_ids]
    data = reframed[contain_vars + ['failure']]

    # Rename the columns. The X names are generated rather than looked up in
    # a fixed six-entry list, so n_vars larger than 6 works as well.
    new_names = ['Y(t)var%d' % j for j in var_ids]
    for lag in range(1, n_in + 1):
        new_names += ['X%d(t-%d)' % (j, lag) for j in var_ids]
    data.columns = new_names + ['target']
    return data

# 划分数据集
def train_test_split(data, n_vars=6, train_proportion=0.8, n_in=None):
    """Split the sample table positionally and shape the inputs for an LSTM.

    The first ``round(rows * train_proportion)`` rows become the training
    set and the remaining rows the test set (no shuffling). In each row the
    leading ``(n_in + 1) * n_vars`` columns are the inputs and everything
    after them is the output.

    Args:
        data: DataFrame of samples; input columns first, outputs last.
        n_vars: Number of feature variables per time step.
        train_proportion: Fraction of rows used for training.
        n_in: Number of lagged steps per sample. When omitted it is inferred
            from the column count, assuming between 1 and ``n_vars`` trailing
            output columns, so the function no longer depends on a
            module-level ``n_in`` being defined.

    Returns:
        ``(train_X, train_y, test_X, test_y)`` where the X arrays have shape
        ``[samples, n_in + 1, n_vars]``.
    """
    values = data.values
    if n_in is None:
        # With k output columns (1 <= k <= n_vars) the table has
        # timesteps * n_vars + k columns, so this recovers `timesteps`.
        timesteps = (values.shape[1] - 1) // n_vars
    else:
        timesteps = n_in + 1
    n_inputs = timesteps * n_vars

    n_train = round(data.shape[0] * train_proportion)
    train = values[:n_train, :]
    test = values[n_train:, :]
    # Separate inputs X from outputs y.
    train_X, train_y = train[:, :n_inputs], train[:, n_inputs:]
    test_X, test_y = test[:, :n_inputs], test[:, n_inputs:]
    # Reshape X into the LSTM input format [samples, timesteps, features].
    train_X = train_X.reshape((train_X.shape[0], timesteps, n_vars))
    test_X = test_X.reshape((test_X.shape[0], timesteps, n_vars))
    return train_X, train_y, test_X, test_y

def fit_lstm(data_splited, n_neurons=50, n_batch=72, n_epoch=100, loss='mae', optimizer='adam', repeats=1):
    """Train ``repeats`` independent LSTM models and plot their loss curves.

    Args:
        data_splited: ``(train_X, train_y, test_X, test_y)``; the X arrays
            are 3-D ``[samples, timesteps, features]`` and the y arrays 2-D.
        n_neurons: Number of units in the LSTM layer.
        n_batch: Batch size used for fitting.
        n_epoch: Number of training epochs.
        loss: Loss passed to ``compile``.
        optimizer: Optimizer passed to ``compile``.
        repeats: How many models to build and train.

    Returns:
        List with the trained models, one per repeat.
    """
    train_X, train_y, test_X, test_y = data_splited
    trained_models = []
    for _ in range(repeats):
        # Network: one LSTM layer followed by a dense layer with one unit
        # per output column.
        net = Sequential()
        net.add(LSTM(n_neurons, input_shape=(train_X.shape[1], train_X.shape[2])))
        net.add(Dense(train_y.shape[1]))
        net.compile(loss=loss, optimizer=optimizer)
        # Fit silently and in order (no shuffling); the test split doubles
        # as validation data.
        curves = net.fit(
            train_X,
            train_y,
            epochs=n_epoch,
            batch_size=n_batch,
            validation_data=(test_X, test_y),
            verbose=0,
            shuffle=False,
        ).history
        # Every repeat draws its learning curves onto the same figure.
        pyplot.plot(curves['loss'], color='blue', label='train')
        pyplot.plot(curves['val_loss'], color='yellow', label='test')
        trained_models.append(net)
    pyplot.legend(["train", "test"])
    pyplot.show()
    return trained_models

def lstm_predict(model, data_prepare):
    """Predict on the test inputs and map predictions back to original units.

    Args:
        model: Fitted model exposing ``predict``.
        data_prepare: Sequence whose item 0 is a fitted min-max scaler
            (exposing ``min_`` and ``scale_``), item 4 the test inputs and
            item 5 the scaled test targets. Other positions are not read.
            NOTE(review): ``train_test_split`` in this notebook returns a
            4-tuple without a scaler -- confirm which helper is meant to
            produce ``data_prepare``.

    Returns:
        ``(inv_yhat, inv_y)``: predictions and true targets, both un-scaled
        with the parameters of the scaler's first column.
    """
    import numpy as np

    scaler = data_prepare[0]
    test_X = data_prepare[4]
    test_y = data_prepare[5]
    # Make the prediction.
    yhat = model.predict(test_X)
    # Min-max scaling maps x to x * scale_ + min_, so the inverse is
    # (x - min_) / scale_. It is computed directly instead of through a
    # temporary MinMaxScaler, a name the visible notebook never imports.
    # Plain Python floats keep the dtype of the arrays unchanged.
    offset = float(scaler.min_[0])
    factor = float(scaler.scale_[0])
    # Predictions back in the original data range.
    inv_yhat = (np.asarray(yhat) - offset) / factor
    # True values back in the original data range.
    inv_y = (np.asarray(test_y) - offset) / factor
    return inv_yhat, inv_y

# Training configuration.
n_in = 7       # lagged steps per sample: (t-1) .. (t-7)
n_out = 1      # steps from t onward: target(t) only
n_neuron = 5   # units in the LSTM layer
n_batch = 16   # batch size
n_epoch = 200  # training epochs
repeats = 5    # number of models trained

# os.listdir returns names in arbitrary order. The train/test split below is
# positional, so the files are sorted to make the split reproducible.
dataFiles = sorted(os.listdir(result_folder_path))
frames = []
for data_file in dataFiles:
    dataPath = result_folder_path + '/' + data_file
    data_prepared = prepare_data(dataPath, n_in, n_out)
    frames.append(data_prepared)
# Stack the per-file sample tables into one table.
data = pd.concat(frames)
data_splited = train_test_split(data)
model_list = fit_lstm(data_splited, n_neuron, n_batch, n_epoch, repeats=repeats)
