In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import TimeseriesGenerator

import sklearn
from sklearn import preprocessing

In [2]:
store_sales = pd.read_csv('C:/Users/user/Desktop/xyz/train.csv',parse_dates=['date'],infer_datetime_format=True)
stores = pd.read_csv('C:/Users/user/Desktop/xyz/stores.csv')
oil = pd.read_csv('C:/Users/user/Desktop/xyz/oil.csv', parse_dates=['date'],infer_datetime_format=True)
test = pd.read_csv('C:/Users/user/Desktop/xyz/test.csv', parse_dates=['date'],infer_datetime_format=True)

In [3]:
# fill missing date
oil = oil.set_index("date").asfreq(freq = "D")

# fill the NaN value by interpolation
oil["dcoilwtico"] = oil["dcoilwtico"].interpolate(limit_direction="both")

oil.tail(16)

Unnamed: 0_level_0,dcoilwtico
date,Unnamed: 1_level_1
2017-08-16,46.8
2017-08-17,47.07
2017-08-18,48.59
2017-08-19,48.19
2017-08-20,47.79
2017-08-21,47.39
2017-08-22,47.65
2017-08-23,48.45
2017-08-24,47.24
2017-08-25,47.65


In [4]:
store_sales = store_sales.merge(oil, on = "date")
test = test.merge(oil, on = "date")

In [5]:
def gen_series(x_data_scaled, y_data_scaled, predict_days, past_days):
    x, y = [], []
    length = past_days if (y_data_scaled.shape[0] > past_days) else y_data_scaled.shape[0]
    for i in range(predict_days, length):
        x.append(x_data_scaled[i-predict_days:i])
        y.append(y_data_scaled[i][0])
    return np.array(x), np.array(y)

In [7]:
store_nbr_types = store_sales["store_nbr"].unique()
family_types = store_sales["family"].unique()

In [8]:
for store_nbr_type in store_nbr_types:
    for family_type in family_types:
        # 訓練資料
        train_data = store_sales[(store_sales["store_nbr"] == store_nbr_type) & (store_sales["family"] == family_type)]
        
        # 重置index並drop掉不需要的欄位
        train_data = train_data.reset_index()
        train_data = train_data.drop(columns = ["id", "index", "date", "store_nbr", "family"])
        
        # 測試資料(用於填寫結果)        
        test_data = test[(test["store_nbr"] == store_nbr_type) & (test["family"] == family_type)]
        test_data = test_data.drop(columns = ["date", "store_nbr", "family"])
        break
    break

In [9]:
split_ratio = 0.8
split_number = np.int64((np.floor(len(train_data) * split_ratio)))
df_train = train_data.iloc[0:split_number,:]
df_train

Unnamed: 0,sales,onpromotion,dcoilwtico
0,0.0,0,93.140000
1,2.0,0,93.140000
2,3.0,0,92.970000
3,3.0,0,93.120000
4,5.0,0,93.146667
...,...,...,...
1342,8.0,0,45.470000
1343,3.0,0,47.630000
1344,4.0,0,45.880000
1345,2.0,0,46.013333


In [10]:
df_val = train_data.iloc[split_number:,:]
df_val

Unnamed: 0,sales,onpromotion,dcoilwtico
1347,4.0,0,46.280000
1348,4.0,0,44.910000
1349,9.0,0,43.620000
1350,2.0,0,43.850000
1351,6.0,0,43.040000
...,...,...,...
1679,1.0,0,48.810000
1680,6.0,0,48.403333
1681,1.0,0,47.996667
1682,1.0,0,47.590000


In [11]:
scaler1 = preprocessing.MinMaxScaler(feature_range = (0,1))
df_train_scaled = scaler1.fit_transform(df_train)
print(df_train_scaled.shape)

(1347, 3)


In [12]:
scaler2 = preprocessing.MinMaxScaler(feature_range = (0,1))
df_train_y = scaler2.fit_transform(df_train[['sales']])
print(df_train_y.shape)

(1347, 1)


In [13]:
df_val_scaled = scaler1.transform(df_val)
print(df_val_scaled.shape)
df_val_y = scaler2.transform(df_val[['sales']])
print(df_val_y.shape)

(337, 3)
(337, 1)


In [14]:
past_days = 700
predict_days = 1
n_cols = 3

x_train, y_train = gen_series(df_train_scaled, df_train_y, predict_days, past_days)
x_val, y_val = gen_series(df_val_scaled, df_val_y, predict_days, past_days)

In [17]:
model = keras.models.Sequential([
        keras.layers.LSTM(units=100, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),        
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),
        keras.layers.LSTM(units=100, return_sequences=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),
        keras.layers.TimeDistributed(keras.layers.Dense(1))
])

optimizer = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mse"])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 100)            41600     
                                                                 
 batch_normalization (BatchN  (None, 1, 100)           400       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 1, 100)            80400     
                                                                 
 batch_normalization_1 (Batc  (None, 1, 100)           400       
 hNormalization)                                                 
                                                                 
 time_distributed (TimeDistr  (None, 1, 1)             101       
 ibuted)                                                         
                                                        

In [18]:
import time
start = time.time()

early_stopping =  keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
model_result = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_val, y_val), verbose=2, shuffle=False, callbacks=[early_stopping])

end = time.time()
print (np.int64(end-start))

Epoch 1/100
22/22 - 3s - loss: 0.0602 - mse: 0.0602 - val_loss: 0.0266 - val_mse: 0.0266 - 3s/epoch - 116ms/step
Epoch 2/100
22/22 - 0s - loss: 0.0222 - mse: 0.0222 - val_loss: 0.0333 - val_mse: 0.0333 - 77ms/epoch - 3ms/step
Epoch 3/100
22/22 - 0s - loss: 0.0171 - mse: 0.0171 - val_loss: 0.0306 - val_mse: 0.0306 - 70ms/epoch - 3ms/step
Epoch 4/100
22/22 - 0s - loss: 0.0173 - mse: 0.0173 - val_loss: 0.0311 - val_mse: 0.0311 - 74ms/epoch - 3ms/step
Epoch 5/100
22/22 - 0s - loss: 0.0171 - mse: 0.0171 - val_loss: 0.0309 - val_mse: 0.0309 - 75ms/epoch - 3ms/step
Epoch 6/100
22/22 - 0s - loss: 0.0171 - mse: 0.0171 - val_loss: 0.0309 - val_mse: 0.0309 - 76ms/epoch - 3ms/step
Epoch 7/100
22/22 - 0s - loss: 0.0170 - mse: 0.0170 - val_loss: 0.0309 - val_mse: 0.0309 - 77ms/epoch - 3ms/step
Epoch 8/100
22/22 - 0s - loss: 0.0170 - mse: 0.0170 - val_loss: 0.0309 - val_mse: 0.0309 - 77ms/epoch - 4ms/step
Epoch 9/100
22/22 - 0s - loss: 0.0171 - mse: 0.0171 - val_loss: 0.0309 - val_mse: 0.0309 - 76ms/

In [19]:
df_train_last_16 = df_train.iloc[-16:]
df_test = pd.concat((df_train_last_16, test_data.drop(["id"],axis = 1)),axis = 0)
print(df_test.shape)

(32, 3)


In [20]:
df_test_scaled = scaler1.transform(df_test)
df_test.shape

(32, 3)

In [21]:
test = []
for i in range(predict_days,df_test.shape[0]):
    test.append(df_test[i-predict_days:i])
test = np.array(test)
print(test.shape)

(31, 1, 3)


In [22]:
pred = model.predict(test)
prediction = np.squeeze(pred) / scaler2.scale_[0]
prediction = prediction[~np.isnan(prediction)]
test_data["sales"] = prediction

In [23]:
prediction

array([4.7084026, 4.700791 , 4.6584783, 4.6742363, 4.6494155, 4.6134243,
       4.656642 , 4.6229796, 4.653157 , 4.616648 , 4.657552 , 4.618548 ,
       4.7039337, 4.6660733, 4.6793604, 4.6889896], dtype=float32)

In [24]:
test_data

Unnamed: 0,id,onpromotion,dcoilwtico,sales
0,3000888,0,46.8,4.708403
1782,3002670,0,47.07,4.700791
3564,3004452,0,48.59,4.658478
5346,3006234,0,48.19,4.674236
7128,3008016,0,47.79,4.649415
8910,3009798,0,47.39,4.613424
10692,3011580,0,47.65,4.656642
12474,3013362,0,48.45,4.62298
14256,3015144,0,47.24,4.653157
16038,3016926,0,47.65,4.616648


In [25]:
submit_data = test_data
submit_data = submit_data.drop(columns = ["onpromotion", "dcoilwtico"])

submit_data = submit_data.sort_values(by=["id"])

submit_data = submit_data.reset_index(drop=True)

submit_data.loc[submit_data.sales < 0.001, "sales"] = 0

In [6]:
import time
start = time.time()

store_nbr_types = store_sales["store_nbr"].unique()
family_types = store_sales["family"].unique()

past_days = 16
predict_days = 1
n_cols = 3

counter = 0
# 走訪所有商店/類型
for store_nbr_type in store_nbr_types:
    for family_type in family_types:
        counter = counter + 1
        # 訓練資料
        train_data = store_sales[(store_sales["store_nbr"] == store_nbr_type) & (store_sales["family"] == family_type)]
        
        # 重置index並drop掉不需要的欄位
        train_data = train_data.reset_index()
        train_data = train_data.drop(columns = ["id", "index", "date", "store_nbr", "family"])
        
        # 測試資料(用於填寫結果)        
        test_data = test[(test["store_nbr"] == store_nbr_type) & (test["family"] == family_type)]
        test_data = test_data.drop(columns = ["date", "store_nbr", "family"])
        
        #
        split_ratio = 0.95
        split_number = np.int64((np.floor(len(train_data) * split_ratio)))
        df_train = train_data.iloc[0:split_number,:]
        df_val = train_data.iloc[split_number:,:]

        scaler1 = preprocessing.MinMaxScaler(feature_range = (0,1))
        df_train_scaled = scaler1.fit_transform(df_train)
        scaler2 = preprocessing.MinMaxScaler(feature_range = (0,1))
        df_train_y = scaler2.fit_transform(df_train[['sales']])
        
        df_val_scaled = scaler1.transform(df_val)
        df_val_y = scaler2.transform(df_val[['sales']])
        
        x_train, y_train = gen_series(df_train_scaled, df_train_y, predict_days, past_days)
        x_val, y_val = gen_series(df_val_scaled, df_val_y, predict_days, past_days)
        
        #
        model = keras.models.Sequential([
                keras.layers.LSTM(units=100, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),        
                keras.layers.BatchNormalization(),
                keras.layers.Dropout(0.2),
                keras.layers.LSTM(units=100, return_sequences=True),
                keras.layers.BatchNormalization(),
                keras.layers.Dropout(0.2),
                keras.layers.TimeDistributed(keras.layers.Dense(1))
        ])

        optimizer = keras.optimizers.Adam(learning_rate=0.001)
        model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mse"])
        
        early_stopping =  keras.callbacks.EarlyStopping(monitor='val_mse', min_delta=0.001, patience=100, restore_best_weights=True)
        model_result = model.fit(x_train, y_train, epochs=1000, batch_size=512, validation_data=(x_val, y_val),\
                                 verbose=0, shuffle=False, callbacks=[early_stopping])
        
        df_train_last_16 = df_train.iloc[-16:]
        df_test = pd.concat((df_train_last_16, test_data.drop(["id"],axis = 1)),axis = 0)
        df_test_scaled = scaler1.transform(df_test)

        final_test = []
        for i in range(predict_days,df_test.shape[0]):
            final_test.append(df_test[i-predict_days:i])
        final_test = np.array(final_test)
        
        pred = model.predict(final_test)
        prediction = np.squeeze(pred) / scaler2.scale_[0] #inverse_transform
        prediction = prediction[~np.isnan(prediction)]
        test_data["sales"] = prediction
                        
        
        if counter == 1:
            submit_data = test_data
        else:
            submit_data = pd.concat([submit_data, test_data])
        
        if counter % 50 == 0:
            print(counter)
             
print("done!")
end = time.time()
print (np.int64(end-start))

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
done!
10072


In [7]:
submit_data

Unnamed: 0,id,onpromotion,dcoilwtico,sales
0,3000888,0,46.800000,1.840689
1782,3002670,0,47.070000,1.796847
3564,3004452,0,48.590000,1.866382
5346,3006234,0,48.190000,1.850272
7128,3008016,0,47.790000,1.802142
...,...,...,...,...
21383,3022271,0,46.816667,22.450991
23165,3024053,0,46.400000,22.549946
24947,3025835,0,46.460000,23.642616
26729,3027617,0,45.960000,25.616495


In [8]:
submit_data = submit_data.drop(columns = ["onpromotion", "dcoilwtico"])

submit_data = submit_data.sort_values(by=["id"])

submit_data = submit_data.reset_index(drop=True)

submit_data.loc[submit_data.sales < 0.001, "sales"] = 0

submit_data.to_csv('C:/Users/user/Desktop/xyz/submission_0526_3.csv', index=False)