In [1]:
import pandas as pd
import os
import numpy as np
from math import cos, sin, atan2, sqrt, pi, radians, degrees, asin
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

Using TensorFlow backend.


In [3]:
def parse(x):
    """Turn a compact date string such as ``'20200131'`` into a ``datetime``.

    The text is cut at fixed offsets: the first four characters are the
    year, the next two the month, and whatever remains is the day.
    """
    year, month, day = x[:4], x[4:6], x[6:]
    return datetime.strptime(' '.join((year, month, day)), '%Y %m %d')
def read_data(file):
    """Read a CSV file and convert its ``date`` column to datetimes.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with ``date`` converted by ``parse``.

    Notes
    -----
    The ``date_parser`` keyword of ``read_csv`` is deprecated since
    pandas 2.0, and a scalar parser like ``parse`` only worked through
    pandas' internal retry logic. Applying ``parse`` element-wise after
    loading is explicit and works on old and new pandas alike.
    """
    # Keep the raw text so the fixed-offset slicing in parse() sees strings.
    frame = pd.read_csv(file, dtype={'date': str})
    frame['date'] = pd.to_datetime(frame['date'].map(parse))
    return frame

In [5]:
# Load the five per-city tables, one module-level name per file.
A, B, C, D, E = map(read_data, (
    'data/A.csv',
    'data/B.csv',
    'data/C.csv',
    'data/D.csv',
    'data/E.csv',
))
# Three columns (city, region, date) are removed before modelling,
# so the remaining columns are the model features.
n_features = A.shape[1] - 3

In [2]:
# Number of lagged time steps fed to the model per sample.
n_step = 1
# Width of the flattened input part of each supervised row.
n_ob = n_features * n_step
# Scaler instance; the calls that would use it are currently commented out.
scaler = MinMaxScaler(feature_range=(0, 1))

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table.

    Each output row holds the ``n_in`` previous observations followed by
    the current observation and ``n_out - 1`` future observations.

    Parameters
    ----------
    data : list, numpy.ndarray or pandas.DataFrame
        Observations ordered in time, one row per time step. A flat list
        or 1-D array is treated as a single variable.
    n_in : int, default 1
        Number of lagged steps (t-n_in ... t-1) used as inputs.
    n_out : int, default 1
        Number of forecast steps (t ... t+n_out-1) used as outputs.
    dropnan : bool, default True
        Drop the rows that contain NaN introduced by shifting.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame: this is correct for flat
    # lists, lists of rows, 1-D and 2-D arrays alike, whereas inspecting
    # the raw input mislabels lists of rows and fails on 1-D arrays.
    n_vars = df.shape[1]
    shifted, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        shifted.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for lead in range(n_out):
        shifted.append(df.shift(-lead))
        if lead == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, lead) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(shifted, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

def process(cdf):
    """Build the stacked supervised-learning matrix for one city.

    For every region id from 0 to ``cdf.region.max()`` the region's rows
    are indexed by date, stripped of the ``city`` and ``region`` columns,
    cast to float32 and reframed with ``series_to_supervised`` using
    ``n_step`` lags and one output step.

    Parameters
    ----------
    cdf : pandas.DataFrame
        One city's table with ``date``, ``city`` and ``region`` columns.

    Returns
    -------
    numpy.ndarray
        The per-region supervised rows concatenated along axis 0.

    Raises
    ------
    ValueError
        If there are no regions to concatenate.
    """
    region_rows = []
    for region_id in range(cdf.region.max() + 1):
        region_df = cdf[cdf.region == region_id].set_index("date")
        region_df = region_df.drop(["city", "region"], axis=1)
        values = region_df.values.astype('float32')
        reframed = series_to_supervised(values, n_step, 1)
        region_rows.append(reframed.values)
    # Concatenate once after the loop; merging inside the loop rebuilt the
    # whole array on every iteration and reused the loop variable.
    return np.concatenate(region_rows, axis=0)

In [6]:
# Reframe each city separately, then stack all cities into one matrix.
a, b, c, d, e = map(process, (A, B, C, D, E))
data = np.concatenate([a, b, c, d, e], axis=0)

In [7]:
# Columns [0, n_ob) are the lagged inputs; the remaining columns are targets.
X_train, X_test, y_train, y_test = train_test_split(
    data[:, :n_ob],
    data[:, n_ob:],
    test_size=0.2,
    random_state=2,
)

In [8]:
# The LSTM expects 3-D input: (samples, time steps, features).
X_train = X_train.reshape((X_train.shape[0], n_step, n_features))
X_test = X_test.reshape((X_test.shape[0], n_step, n_features))

In [9]:
# One LSTM layer followed by a dense head with one output per feature.
model = Sequential([
    LSTM(units=128, input_shape=X_test.shape[1:]),
    Dense(n_features),
])

model.compile(optimizer='adam', loss='msle')
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=16,
    verbose=2,
)







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 13798 samples, validate on 3450 samples
Epoch 1/4
 - 3s - loss: 0.0419 - val_loss: 0.0272
Epoch 2/4
 - 3s - loss: 0.0201 - val_loss: 0.0196
Epoch 3/4
 - 2s - loss: 0.0165 - val_loss: 0.0158
Epoch 4/4
 - 2s - loss: 0.0141 - val_loss: 0.0131


<keras.callbacks.History at 0x1a3b4f3b70>

In [10]:
# Evaluation: predict the held-out samples.
# Feature scaling is disabled in the preprocessing, so the predictions are
# compared to y_test directly without any inverse transform.
yhat = model.predict(X_test)

In [11]:
# Root-mean-squared error over all predicted features of the test split.
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=yhat))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 1.164


In [12]:
def process_last_day(cdf):
    """Collect the most recent observation of every region in one city.

    Each region is reframed exactly as in training; from the last
    supervised row only the columns from ``n_ob`` onward (the current-step
    part) are kept.

    Returns a list with one array of shape ``(1, k)`` per region, where
    ``k`` is the number of columns after the first ``n_ob``.
    """
    last_rows = []
    region_count = cdf.region.max() + 1
    for region_id in range(region_count):
        region_df = (
            cdf[cdf.region == region_id]
            .set_index("date")
            .drop(["city", "region"], axis=1)
        )
        values = region_df.values.astype('float32')
        reframed = series_to_supervised(values, n_step, 1)
        # Take the final row, drop the lagged inputs, restore a batch axis.
        newest = reframed.values[-1][n_ob:]
        last_rows.append(newest[np.newaxis, :])
    return last_rows
        

In [13]:
# Seed observations for the roll-out: one list of per-region arrays per city.
a_lastday, b_lastday, c_lastday, d_lastday, e_lastday = map(
    process_last_day, (A, B, C, D, E)
)

In [26]:
def predict(day_before):
    day_before = np.reshape(day_before, (day_before.shape[0], n_step, n_features))
    day_after = model.predict(day_before)
    return day_after

In [27]:
def generate_future_prediction(lastday, n_days=30, verbose=True):
    """Roll the model forward from each region's last known observation.

    Every prediction is fed back in as the input of the next step.

    Parameters
    ----------
    lastday : iterable of numpy.ndarray
        One starting observation per region, as produced by
        ``process_last_day``.
    n_days : int, default 30
        Number of consecutive steps to predict for every region.
    verbose : bool, default True
        Print each region's starting observation before predicting.

    Returns
    -------
    list of list
        One inner list per region holding ``n_days`` predictions in
        chronological order.
    """
    res_city_lst = []
    for region_begin in lastday:
        if verbose:
            print(region_begin)
        current = region_begin
        res_date_lst = []
        for _ in range(n_days):
            # Each output becomes the input of the following step.
            current = predict(current)
            res_date_lst.append(current)
        res_city_lst.append(res_date_lst)
    return res_city_lst

In [28]:
# Forecast every region of every city, in city order A through E.
a_prediction, b_prediction, c_prediction, d_prediction, e_prediction = map(
    generate_future_prediction,
    (a_lastday, b_lastday, c_lastday, d_lastday, e_lastday),
)

  0.6136364  0.57894737 0.6153846  0.40000004 0.40625    0.6875
  0.26388887 0.55       0.3205128  0.6666666  0.20270273 0.6500001
  0.23943666 0.6363636  0.28767124 0.6        0.2837838  0.7307693
  0.42307696 0.5555556  0.5555556  0.6956522  0.49999997 0.39130434
  0.34545457 0.42857143 0.38297868 0.375      0.39999992 0.47368422
  0.39999992 0.35413164 0.13644001 0.19092464]]
[[1.         0.3809524  0.85416675 0.25       0.8333333  0.33333334
  0.6363636  0.2777778  0.8292682  0.2631579  0.88       0.21739131
  0.89130425 0.86956525 0.89130425 0.87999994 0.86111104 0.30434784
  0.5405406  0.36842105 0.48936164 0.5        0.39215696 0.5882354
  0.0625     0.6        0.24999994 0.631579   0.40740743 0.54545456
  0.37878782 0.65       0.265625   0.5        0.55714285 0.28
  0.5294117  0.6818182  0.6571429  0.33333334 0.61016935 0.7
  0.70491797 0.5        0.6666666  0.61904764 0.7173914  0.3809524
  0.74509805 0.4489988  0.18221802 0.        ]]
[[0.8333334  0.3809524  0.85416675 0.25  

In [31]:
# Inspect city A: first region, first predicted step (all features).
a_prediction[0][0]

array([[ 1.5134339 ,  3.262701  ,  0.70592916,  0.9038207 ,  0.76391673,
         0.99305165,  0.7706631 , -4.541298  ,  0.75200313,  1.8693546 ,
         0.8222115 ,  1.0580941 ,  0.8169484 ,  4.7765985 ,  0.81093776,
         1.4679995 ,  0.8648067 ,  7.608754  ,  0.8363721 ,  8.55347   ,
         0.6103633 ,  9.390717  ,  0.541253  , 13.050402  ,  0.634488  ,
        12.729559  ,  0.5326317 , 13.288866  ,  0.3798835 , 15.15965   ,
         0.40002948, 14.240046  ,  0.36940652, 14.895222  ,  0.3931324 ,
         6.6268206 ,  0.36837894,  6.8567505 ,  0.42582583,  7.947989  ,
         0.5676015 ,  7.029104  ,  0.5507684 ,  6.464271  ,  0.7619359 ,
         6.417109  ,  0.761809  ,  4.232056  ,  0.75528395,  0.22261022,
         0.21896641,  2.1021855 ]], dtype=float32)

In [241]:
def generate_res_infection_lst(cities_prediction):
    """Flatten nested predictions into parallel infection/region lists.

    ``cities_prediction`` is nested as city -> region -> step, where each
    step is indexable as ``step[0][0]``. That first value is rounded to an
    integer. Region ids restart at 0 for every city.

    Returns a tuple ``(res_infection, res_region)`` of equal-length lists.
    """
    res_infection, res_region = [], []
    for city_prediction in cities_prediction:
        for region_id, region_days in enumerate(city_prediction):
            counts = [int(round(day[0][0])) for day in region_days]
            res_infection.extend(counts)
            res_region.extend([region_id] * len(counts))
    return res_infection, res_region
def check_minus(infection, threshold=6):
    """Replace negative values in place and return the same list.

    A negative entry becomes 0 when the preceding entry is below
    ``threshold`` and otherwise takes the preceding entry's value.

    Parameters
    ----------
    infection : list of int
        Predicted counts; modified in place.
    threshold : int, default 6
        Preceding values below this cause the negative entry to become 0.

    Returns
    -------
    list of int
        The same list object, after replacement.
    """
    for i in range(len(infection)):
        if infection[i] < 0:
            # The first entry has no predecessor: indexing i - 1 at i == 0
            # would wrap around to the last element of the list.
            if i == 0 or infection[i - 1] < threshold:
                infection[i] = 0
            else:
                infection[i] = int(infection[i - 1])
    return infection

In [245]:
# Flatten all city forecasts, repair negative counts, then fill the
# submission template and write it out without a header row.
all_predictions = [a_prediction, b_prediction, c_prediction, d_prediction, e_prediction]
infection, region = generate_res_infection_lst(all_predictions)
infection = check_minus(infection)
submission = pd.read_csv(
    'data/submission.csv',
    header=None,
    names=['city', 'region', 'date', 'infection'],
).assign(infection=infection, region=region)
submission.to_csv('data/test_submission_single_LSTM.csv', header=None, index=False)

result  
100 - 97.669  
80 - 98.666  
70 - 98.581    
50 - 99.553  
