### 销量预测-----lstm添加滚动预测
说明：按时间序列滚动预测，每次只预测下一个月的数据；尚未发生的月份用此前的预测结果填充，模型本身只用真实值训练一次。
    例如，已知201401~201811月数据，需要预测201812~201902的值：
    先根据201401~201811月数据训练出模型model，预测出201812的值；
    然后将201812预测值当作真实值放进预测模型，再次利用已训练出的model预测出201901的值；
    最后将201812、201901预测值当作真实值，利用上述model预测出201902的值。

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import copy
import datetime


import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# Display floats with 4 decimal places. The option key is written in full
# because the bare pattern 'precision' also matches 'styler.format.precision'
# on newer pandas releases, and set_option rejects ambiguous patterns.
pd.set_option('display.precision', 4)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
# keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# Reframe a series as a supervised-learning table: lagged inputs (X) and
# outputs (y) are stored side by side in one wide frame.
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Build a wide table of shifted copies of ``data``.

    Args:
        data: Observations with one row per time step. Anything accepted by
            ``pd.DataFrame`` works (2-D array, DataFrame, list of rows, or a
            flat list / 1-D array for a single variable).
        n_in: Number of lag steps ``t-n_in .. t-1`` placed first in each row.
        n_out: Number of steps ``t .. t+n_out-1`` placed after the lags.
        dropnan: When true, drop every row that contains a NaN; this removes
            the incomplete rows created by shifting.

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``, ordered from the oldest lag to the furthest step.
    """
    df = pd.DataFrame(data)
    # Read the width from the frame, not from ``data``: lists and 1-D arrays
    # have no usable ``shape[1]``.
    n_vars = df.shape[1]
    cols, names = [], []

    # Input sequence (t-n_in, ..., t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]

    # Output sequence (t, t+1, ..., t+n_out-1)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()

    return agg

In [3]:
# Model training routine
def model_train(data_train,n_hours,n_batch, nb_epoch, n_neurons, dropout,n_features):
    """Fit a one-layer LSTM on ``data_train`` and return its in-sample output.

    The final row of ``data_train`` is left out of fitting. The validation
    window is the trailing ``n_hours + 1`` rows of the fitting rows, so it
    overlaps the training data.

    Args:
        data_train: DataFrame with one row per time step. The last column of
            the supervised table is used as the target, so the target is
            presumably the last column of this frame -- verify against caller.
        n_hours: Number of lagged time steps per sample.
        n_batch: Batch size passed to ``fit``.
        nb_epoch: Number of epochs passed to ``fit``.
        n_neurons: Number of LSTM units.
        dropout: Dropout rate passed to the LSTM layer.
        n_features: Number of columns per time step (used for reshaping).

    Returns:
        Tuple ``(train_Y, scaler, model)``: the de-normalised in-sample
        predictions as a DataFrame, the fitted ``MinMaxScaler`` and the
        trained network.
    """
    raw = data_train.values.astype('float32')

    # Everything except the final row is used for fitting.
    n_fit = data_train.shape[0] - 1
    fit_rows = raw[:n_fit]
    # Validation window: the tail of the fitting rows, n_hours + 1 rows long.
    val_rows = fit_rows[(n_fit - n_hours - 1):n_fit, :]

    # Scale every column to [0, 1]; the scaler is fitted on the fitting rows only.
    scaler = MinMaxScaler(feature_range=(0, 1))
    fit_scaled = scaler.fit_transform(fit_rows)
    val_scaled = scaler.transform(val_rows)

    # Turn both series into supervised (lags -> current step) tables.
    fit_table = series_to_supervised(fit_scaled, n_hours).values
    val_table = series_to_supervised(val_scaled, n_hours).values

    # The first n_obs columns are the lagged inputs; the last column is the target.
    n_obs = n_hours * n_features
    train_X, train_y = fit_table[:, :n_obs], fit_table[:, -1]
    test_X, test_y = val_table[:, :n_obs], val_table[:, -1]
    # Keras expects [samples, timesteps, features].
    train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
    test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))

    # Network: one LSTM layer followed by a single-output dense layer.
    model = Sequential()
    model.add(LSTM(n_neurons, input_shape=(train_X.shape[1], train_X.shape[2]),dropout=dropout))
    model.add(Dense(1))
    model.compile(loss='mape', optimizer='adam')

    model.fit(
        train_X,
        train_y,
        epochs=nb_epoch,
        batch_size=n_batch,
        validation_data=(test_X, test_y),
        verbose=0,
        shuffle=False,
    )

    # In-sample predictions, still on the scaled axis.
    fitted = model.predict(train_X)

    # Undo the scaling of the predictions.
    if n_features == 1:
        # NOTE(review): [0] keeps only the first sample's prediction -- confirm intended.
        train_Y = scaler.inverse_transform(fitted)[0]
    elif n_features > 1:
        # The scaler expects n_features columns, so pad the prediction with the
        # other columns of the most recent lag step and keep the last column.
        flat_X = train_X.reshape((train_X.shape[0], n_hours * n_features))
        padded = np.concatenate((flat_X[:, -n_features:-1], fitted), axis=1)
        padded = padded.reshape(padded.shape[0], n_features)
        restored = scaler.inverse_transform(padded)
        # NOTE(review): the earlier comment said "last twelve predictions" but
        # this slice keeps 11 rows -- confirm which is intended.
        train_Y = restored[-11:, -1:]

    return pd.DataFrame(train_Y), scaler, model

In [4]:
# Model prediction routine
def model_predict(predict_values,n_hours,scaler,model,n_features):
    """Predict the target for the first supervised sample of ``predict_values``.

    Args:
        predict_values: 2-D block of observations, one row per time step.
        n_hours: Number of lagged time steps per sample.
        scaler: Fitted scaler used to normalise inputs and de-normalise output.
        model: Trained network exposing ``predict``.
        n_features: Number of columns per time step.

    Returns:
        List holding the de-normalised prediction for the first sample.
    """
    # Replace NaN cells with a placeholder so scaling works. Author's note: the
    # result does not depend on the fill value -- presumably because NaNs only
    # occur in the row being predicted, which is not a model input; verify
    # against caller.
    filled = pd.DataFrame(predict_values).fillna(1800000).values
    scaled = scaler.transform(filled)

    # Supervised table: the first n_obs columns are the lagged inputs.
    table = series_to_supervised(scaled, n_hours).values
    n_obs = n_hours * n_features
    flat_X = table[:, :n_obs]
    # Keras expects [samples, timesteps, features].
    windows = flat_X.reshape((flat_X.shape[0], n_hours, n_features))

    yhat = model.predict(windows)

    # Undo the scaling of the prediction.
    if n_features == 1:
        inv_yhat = scaler.inverse_transform(yhat)[0]
    elif n_features > 1:
        # The scaler expects n_features columns: pad the prediction with the
        # other columns of the most recent lag step, invert, keep the last column.
        padded = np.concatenate((flat_X[0][-n_features:-1], yhat[0]), axis=0)
        padded = padded.reshape(1, n_features)
        inv_yhat = scaler.inverse_transform(padded)[:, -1]

    return list(inv_yhat)

In [5]:
# Driver for the prediction routine
def model_predict_inter(data_predict,n_inter,model,scaler,n_hours,n_features): 
    """Run ``model_predict`` ``n_inter`` times on the tail of ``data_predict``.

    Args:
        data_predict: DataFrame with one row per time step.
        n_inter: Number of prediction passes.
        model: Trained network handed to ``model_predict``.
        scaler: Fitted scaler handed to ``model_predict``.
        n_hours: Number of lagged time steps per sample.
        n_features: Number of columns per time step.

    Returns:
        Flat list with the predictions of all passes, in call order.
    """
    raw = data_predict.values.astype('float32')
    n_rows = data_predict.shape[0]
    # Trailing n_hours + 1 rows: n_hours lags plus the row being predicted.
    tail = raw[(n_rows - 1 - n_hours):n_rows]

    collected = []
    for _ in range(n_inter):
        collected.extend(model_predict(tail, n_hours, scaler, model, n_features))
    return collected

In [6]:
def lstm_scroll_predict(var_list,predict_data_index,list_date,n_inter):
    """Train one LSTM per start month and roll it forward three months.

    For each month in ``list_date`` a model is trained on the rows from
    2014-01 up to that month and used to forecast it. The forecast is then
    written into the target column so the same model can forecast the next
    month, three steps in all. The resulting table is printed and returned.

    Reads the module-level ``dataset`` frame.

    Args:
        var_list: Column names selected from ``dataset``; the last entry is
            the target column.
        predict_data_index: Ordered list of month-start timestamps; must
            contain every entry of ``list_date``.
        list_date: Months at which a rolling forecast starts.
        n_inter: Number of prediction passes averaged per step.

    Returns:
        DataFrame with one row per entry of ``list_date`` and one column per
        forecast step (0, 1, 2).
    """
    # Model hyper-parameters
    n_batch = 64
    nb_epoch = 100
    n_neurons = 100  # number of LSTM units
    dropout = 0.1
    n_hours = 12
    n_steps = 3  # forecast horizon in months
    first_month = '2014-01-01'

    dataset_fill_predict = dataset[var_list]
    var_target = var_list[-1]
    n_features = len(var_list)

    rows = []
    for start in list_date:
        pos = predict_data_index.index(start)
        predict_date = predict_data_index[pos:pos + n_steps]
        step_list = []
        scaler = model = None
        for step, month in enumerate(predict_date):
            # Copy so that writing forecasts below never touches the shared frame.
            data_predict = dataset_fill_predict.loc[pd.date_range(first_month, month, freq='MS'), :].copy()
            # Treat the forecasts made so far as if they were observed values.
            for done_month, done_value in zip(predict_date, step_list):
                data_predict.loc[done_month, var_target] = done_value
            if step == 0:
                # Train once per start month; later steps reuse this model.
                _, scaler, model = model_train(data_predict, n_hours, n_batch, nb_epoch, n_neurons, dropout, n_features)
            runs = model_predict_inter(data_predict, n_inter, model, scaler, n_hours, n_features)
            step_list.append(pd.Series(runs).mean())
        rows.append(step_list)

    # Build the table in one go; concatenating column by column and transposing
    # broke when list_date held a single month.
    step_list_total = pd.DataFrame(rows, index=list_date)
    print(step_list_total)
    return step_list_total
    

### 模型运行

In [7]:
# Driver cell: load the data and run the rolling forecast for every variable set.
predict_data_index=list(pd.date_range('2014-01-01','2020-12-01',freq='MS'))  # month-start index; the range is deliberately generous and normally needs no change
# Load the data
dataset = pd.read_csv('predict_data_final.csv', header=0, index_col=0)
dataset.index=pd.to_datetime(dataset.index)
#dataset.index=pd.date_range('2014-01-01','2019-02-01',freq='MS')  

# The four entries forecast, in order: total volume and the market segments
# (joint-venture, independent, luxury). The last name in each list is the target column.
scroll_predict_var=[['month','work_days','days','y','y']
                   ,['month','work_days','days','y_joint','y_joint']
                   ,['month','work_days','days','y_independent','y_independent']
                   ,['month','work_days','days','y_luxury','y_luxury']]

n_inter=2  # number of prediction passes averaged per step
# Months to forecast: from 2018-01 through the first day of the month before today
list_date=list(pd.date_range('2018-01-01',(datetime.date.today().replace(day=1) - datetime.timedelta(1)).replace(day=1).strftime("%Y-%m-%d"),freq='MS')) 

for i in scroll_predict_var:
    print('预测变量',i)
    lstm_scroll_predict(i,predict_data_index,list_date,n_inter)  

预测变量 ['month', 'work_days', 'days', 'y', 'y']


NameError: name 'model_predict' is not defined