In [289]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import math

In [290]:
def parse(x):
    """Convert a compact ``YYYYMMDD``-style string into a ``datetime``.

    The string is cut positionally: the first four characters are the year,
    the next two the month, and everything after that the day.
    """
    year, month, day = x[:4], x[4:6], x[6:]
    spaced = ' '.join((year, month, day))
    return datetime.strptime(spaced, '%Y %m %d')
def read_data(file):
    """Read a CSV into a DataFrame, converting its ``date`` column with ``parse``."""
    date_options = {'parse_dates': ['date'], 'date_parser': parse}
    return pd.read_csv(file, **date_options)

In [291]:
# Load the five input tables A-E from data1/ (presumably one per city -- confirm).
A = read_data("data1/A.csv")
B = read_data("data1/B.csv")
C = read_data("data1/C.csv")
D = read_data("data1/D.csv")
E = read_data("data1/E.csv")
# Column count of A minus three: process() later drops "city" and "region" and
# moves "date" into the index, so this is the number of value columns it keeps.
n_features = A.shape[1] - 3

In [292]:
# Shared min-max scaler (target range [0, 1]); process() and generate_test()
# call fit_transform on it again for every region they handle.
scaler = MinMaxScaler(feature_range=(0, 1))
# Number of lagged time steps used as model input.
n_step = 3
# Width of the flattened input window: lag steps x value columns per step.
n_ob = n_step * n_features

In [293]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a multivariate series as a supervised-learning table.

    Each output row holds ``n_in`` lagged copies of the series followed by
    ``n_out`` forward copies (starting with the current step).

    Parameters
    ----------
    data : list or 2-D array-like
        Observations, one row per time step. A plain ``list`` is treated as
        a single variable; otherwise ``data.shape[1]`` gives the variable count.
    n_in : int
        Number of lag steps (columns ``varK(t-n_in)`` ... ``varK(t-1)``).
    n_out : int
        Number of forward steps (columns ``varK(t)``, ``varK(t+1)``, ...).
    dropnan : bool
        When true, rows containing NaN (introduced by shifting) are removed.

    Returns
    -------
    pandas.DataFrame
        The shifted copies concatenated side by side, oldest lag first.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = pd.DataFrame(data)
    shifted, headers = [], []

    # lagged inputs, oldest first: t-n_in ... t-1
    for lag in range(n_in, 0, -1):
        shifted.append(frame.shift(lag))
        headers.extend('var%d(t-%d)' % (v + 1, lag) for v in range(n_vars))

    # forward steps: t, t+1, ... t+n_out-1
    for lead in range(n_out):
        shifted.append(frame.shift(-lead))
        if lead:
            headers.extend('var%d(t+%d)' % (v + 1, lead) for v in range(n_vars))
        else:
            headers.extend('var%d(t)' % (v + 1) for v in range(n_vars))

    table = pd.concat(shifted, axis=1)
    table.columns = headers
    # shifting leaves NaN at the edges; optionally discard those rows
    return table.dropna() if dropnan else table

def process(cdf):
    """Build the supervised-learning matrix for one input DataFrame.

    For every region id from 0 through ``cdf.region.max()`` the region's rows
    are indexed by ``date``, stripped of the ``city`` and ``region`` columns,
    cast to float32, scaled with the notebook-level ``scaler`` (re-fit for each
    region) and reframed by ``series_to_supervised`` into ``n_step`` lag steps
    plus 30 forward steps.

    Parameters
    ----------
    cdf : pandas.DataFrame
        Must provide ``region``, ``city`` and ``date`` columns; every other
        column is treated as a numeric value column.

    Returns
    -------
    numpy.ndarray
        The reframed rows of all regions stacked vertically in region order.
    """
    per_region = []
    for region_id in range(cdf.region.max() + 1):
        region_df = (
            cdf[cdf.region == region_id]
            .set_index("date")
            .drop(["city", "region"], axis=1)
        )
        values = region_df.values.astype('float32')
        # The scaler is re-fit here, so each region is scaled to its own range.
        scaled = scaler.fit_transform(values)
        reframed = series_to_supervised(scaled, n_step, 30)
        per_region.append(reframed.values)
    # Stack once, after the loop: rebuilding the stacked array inside the loop
    # repeats the same copies on every iteration without changing the result.
    return np.concatenate(per_region, axis=0)

In [294]:
# Reframe each input table separately, then stack all rows into one matrix.
a = process(A)
b = process(B)
c = process(C)
d = process(D)
e = process(E)
data = np.concatenate((a,b,c,d,e),axis=0)

In [295]:
from sklearn.model_selection import train_test_split
# Inputs: the first n_ob columns (the flattened lag window).
# Target: the single column at index -n_features, i.e. the first variable of
# the last forward step given series_to_supervised's column order.
# 20% of the rows are held out; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(data[:,:n_ob],data[:,-n_features],test_size=0.2,random_state=2)

In [296]:
# Unflatten each input row into (time steps, features), the 3-D layout passed
# to the LSTM as (samples, n_step, n_features).
x_train = x_train.reshape((x_train.shape[0], n_step, n_features))
x_test = x_test.reshape((x_test.shape[0], n_step, n_features))

In [297]:
# Sanity check: show the shapes of the train/test inputs and targets.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(4076, 3, 4) (4076,) (1020, 3, 4) (1020,)


In [298]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [299]:
# model: one LSTM layer over (time steps, features) windows followed by a
# single linear output unit that predicts the scalar target.
model = Sequential()
model.add(LSTM(50, input_shape=(x_train.shape[1], x_train.shape[2])))
# Exactly one Dense(1) head: stacking a second activation-free Dense(1) on top
# only rescales this output (two affine maps compose into one), so it adds
# parameters without adding capacity.
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
# train: the held-out split is passed as validation data so val_loss is
# reported after every epoch; shuffle=False keeps the sample order fixed.
model.fit(x_train, y_train, epochs=50, batch_size=16, validation_data=(x_test, y_test), verbose=2, shuffle=False)

Train on 4076 samples, validate on 1020 samples
Epoch 1/50
 - 4s - loss: 0.1007 - val_loss: 0.0836
Epoch 2/50
 - 2s - loss: 0.0846 - val_loss: 0.0808
Epoch 3/50
 - 2s - loss: 0.0833 - val_loss: 0.0797
Epoch 4/50
 - 2s - loss: 0.0823 - val_loss: 0.0787
Epoch 5/50
 - 2s - loss: 0.0813 - val_loss: 0.0775
Epoch 6/50
 - 2s - loss: 0.0802 - val_loss: 0.0764
Epoch 7/50
 - 2s - loss: 0.0792 - val_loss: 0.0752
Epoch 8/50
 - 2s - loss: 0.0782 - val_loss: 0.0742
Epoch 9/50
 - 2s - loss: 0.0773 - val_loss: 0.0731
Epoch 10/50
 - 2s - loss: 0.0764 - val_loss: 0.0720
Epoch 11/50
 - 2s - loss: 0.0754 - val_loss: 0.0708
Epoch 12/50
 - 2s - loss: 0.0745 - val_loss: 0.0697
Epoch 13/50
 - 2s - loss: 0.0736 - val_loss: 0.0688
Epoch 14/50
 - 3s - loss: 0.0728 - val_loss: 0.0681
Epoch 15/50
 - 2s - loss: 0.0721 - val_loss: 0.0675
Epoch 16/50
 - 2s - loss: 0.0715 - val_loss: 0.0669
Epoch 17/50
 - 2s - loss: 0.0710 - val_loss: 0.0665
Epoch 18/50
 - 2s - loss: 0.0705 - val_loss: 0.0661
Epoch 19/50
 - 6s - loss:

<keras.callbacks.callbacks.History at 0x163043ad0>

In [300]:
def gen_var(t):
    """Return the column labels of an ``n_step``-wide window starting at offset ``t``.

    Labels follow the naming used by ``series_to_supervised``: ``varK(t-i)``
    for past steps, ``varK(t)`` for the current step and ``varK(t+i)`` for
    future steps. Within each step ``K`` runs from 1 to ``n_features``.

    Parameters
    ----------
    t : int
        Offset of the first step of the window (may be negative or zero).

    Returns
    -------
    list of str
        ``n_step * n_features`` labels, ordered step by step.
    """
    def _step_suffix(offset):
        # Mirror series_to_supervised's three label forms instead of always
        # emitting "(t+N)", which yields "(t+-3)" / "(t+0)" for N <= 0.
        if offset < 0:
            return "(t-" + str(-offset) + ")"
        if offset == 0:
            return "(t)"
        return "(t+" + str(offset) + ")"

    # The window width is n_step (not a literal 3) so it always matches the
    # (rows, n_step, n_features) reshape applied to these columns.
    return [
        "var" + str(var) + _step_suffix(step)
        for step in range(t, t + n_step)
        for var in range(1, n_features + 1)
    ]
def generate_test(cdf):
    """Predict and un-scale the target for every region of one input DataFrame.

    For each region id from 0 through ``cdf.region.max()`` the region's value
    columns are scaled with the notebook-level ``scaler`` (re-fit per region)
    and reframed exactly as in ``process``. Three column windows of the
    reframed table are then used as model inputs: the last four rows of the
    window starting at ``t+1``, and all rows of the windows starting at
    ``t+14`` and ``t+27``. The model output is mapped back to the original
    scale with the same per-region scaler.

    Parameters
    ----------
    cdf : pandas.DataFrame
        Must provide ``region``, ``city`` and ``date`` columns; every other
        column is treated as a numeric value column.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array of un-scaled predictions per region, in region order.
    """
    v = []
    for i in range(cdf.region.max() + 1):
        df = cdf[cdf.region == i].copy()
        df = df.set_index("date")
        df.drop(["city", "region"], axis=1, inplace=True)
        values = df.values.astype('float32')
        scaled = scaler.fit_transform(values)
        reframed = series_to_supervised(scaled, n_step, 30)

        # Three input windows; the offsets 1/14/27 and the 4-row tail are the
        # notebook's chosen prediction layout (NOTE(review): presumably sized
        # to the number of submission rows per region -- confirm).
        first = reframed[gen_var(1)][-4:].copy()
        middle = reframed[gen_var(14)].copy()
        last = reframed[gen_var(27)].copy()
        # Give all windows the same labels so they stack column-for-column.
        input_labels = gen_var(-3)
        for window in (first, middle, last):
            window.columns = input_labels
        # pd.concat replaces the chained DataFrame.append calls (append was
        # removed in pandas 2.0); ignore_index gives the same 0..n-1 index
        # that reset_index(drop=True) produced.
        windows = pd.concat([first, middle, last], ignore_index=True)

        val = windows.values
        val = val.reshape((windows.shape[0], n_step, n_features))
        yh = model.predict(val)
        val0 = val.reshape((val.shape[0], n_step * n_features))
        # The scaler expects n_features columns: put the prediction in column 0
        # and pad with the last n_features-1 input columns, then keep column 0.
        inv_yh = np.concatenate((yh, val0[:, -(n_features - 1):]), axis=1)
        inv_yh = scaler.inverse_transform(inv_yh)
        inv_yh = inv_yh[:, 0]
        v.append(inv_yh)
    return v

In [301]:
# Per-region predictions for each of the five input tables.
aa = generate_test(A)
bb = generate_test(B)
cc = generate_test(C)
dd = generate_test(D)
ee = generate_test(E)

In [302]:
def generate_res_infection_lst(data):
    """Flatten nested city -> region -> values predictions into two parallel lists.

    Parameters
    ----------
    data : iterable
        One entry per city; each city is an iterable of regions; each region
        is an iterable of numeric predictions.

    Returns
    -------
    (list of int, list of int)
        The predictions rounded to integers, and for each of them the
        zero-based position of its region within its city.
    """
    res_infection, res_region = [], []
    for city in data:
        for region_id, region in enumerate(city):
            rounded = [int(round(value)) for value in region]
            res_infection.extend(rounded)
            res_region.extend([region_id] * len(rounded))
    return res_infection, res_region
def check_minus(infection):
    """Replace negative entries of ``infection`` in place and return the list.

    A negative entry becomes 0 when the entry before it is below 6, otherwise
    it takes the (integer) value of the entry before it. Entries are handled
    left to right, so the predecessor has already been corrected. The first
    entry has no predecessor and is therefore replaced by 0 when negative.

    Parameters
    ----------
    infection : list of numbers
        Modified in place.

    Returns
    -------
    list
        The same list object, with no negative entries left.
    """
    for i, value in enumerate(infection):
        if value >= 0:
            continue
        # Guard i == 0: infection[-1] would wrap around to the LAST element
        # and could copy an unrelated value into the first slot.
        previous = infection[i - 1] if i > 0 else 0
        infection[i] = 0 if previous < 6 else int(previous)
    return infection

In [303]:
# Flatten all predictions into parallel infection / region-id lists.
infection, region = generate_res_infection_lst([aa,bb,cc,dd,ee])
# Remove negative predictions.
infection = check_minus(infection)
# submission.csv is read without a header row and given four column names.
submission = pd.read_csv('submission.csv', header=None, names=['city','region','date','infection'])
# Overwrite the infection and region columns with the generated lists
# (their lengths must match the number of rows in the file).
submission['infection'] = infection
submission['region'] = region
# Write the result without index or header row.
submission.to_csv('submission_3.csv',index=False,header=None)