In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
pwd

'/Users/pasin/Competition/DengAI'

In [3]:
target = pd.read_csv('dengue_labels_train.csv')
data = pd.read_csv('dengue_features_train.csv')
validate = pd.read_csv('dengue_features_test.csv')

In [4]:
def date_convert2(data):
    data['date'] = 0
    predate = str(data.loc[0, 'year']) + "-W" + str(data.loc[0, 'weekofyear'])
    data.loc[0, 'date'] = datetime.datetime.strptime(predate + '-1', "%Y-W%W-%w")
    for i in range(data.shape[0] - 1):
        data.loc[i+1, 'date'] = data.loc[i, 'date'] + datetime.timedelta(days =7)

In [5]:
sanjuan = target[target.city == 'sj']
iquitos = target[target.city == 'iq']

date_convert2(sanjuan)
sj_series = sanjuan[['date', 'total_cases']].set_index('date')

iquitos = iquitos.reset_index(drop = True)
date_convert2(iquitos)
iq_series = iquitos[['date', 'total_cases']].set_index('date')

In [6]:
#Split the validate
sj_val = validate[validate['city'] == 'sj']
iq_val = validate[validate['city'] == 'iq']

print('Length of SJ = ', sj_val.shape[0])
print('Length of IQ = ', iq_val.shape[0])

Length of SJ =  260
Length of IQ =  156


In [7]:
# Train-Test Split
sj_train = sj_series[sj_series.index.year < 2004]
sj_test = sj_series[sj_series.index.year > 2003]

iq_series_cut = iq_series[iq_series.index.year >= 2002]
iq_train = iq_series_cut[iq_series_cut.index.year < 2009]
iq_test = iq_series_cut[iq_series_cut.index.year > 2008]

In [23]:
#use sj_train, sj_test, iq_train, iq_test

#Generate list of hyperparameters
hyperp_list = [(1,0,0), (0,0,1)]
max_p = 12
max_d = 2
max_q = 12
for i in range(1,max_p+1):
    for j in range(max_d+1):
        for k in range(1,max_q+1):
            hyperp_list.append((i,j,k))

In [9]:
from sklearn.metrics import mean_absolute_error

def forecasting(data_train, data_test, orders = hyperp_list):
    #train_result = dict()
    test_result = dict()
    for order in orders:
        model = ARIMA(data_train, order = order)
        try:
            model_fit = model.fit()
            test_forecast = model_fit.forecast(steps = data_test.shape[0])
            test_result[order] = [int(i) for i in test_forecast[0]]
            print('The MAE for order {0} = {1}'.format(order, mean_absolute_error(sj_test, test_result[order])))
            
        except:
            pass
            #print('The order of {0} is not stationary'.format(order))
            
    #return test_result

In [15]:
sj_result = forecasting(data_train=sj_train, data_test = sj_test

The MAE for order (1, 0, 0) = 24.445945945945947
The MAE for order (0, 0, 1) = 28.12162162162162
The MAE for order (1, 0, 1) = 25.24774774774775
The MAE for order (1, 0, 2) = 25.73873873873874
The MAE for order (1, 0, 3) = 25.66216216216216
The MAE for order (1, 0, 4) = 25.87837837837838
The MAE for order (1, 0, 5) = 26.05855855855856
The MAE for order (1, 0, 6) = 26.13963963963964
The MAE for order (1, 0, 7) = 26.175675675675677
The MAE for order (1, 0, 8) = 26.43243243243243
The MAE for order (1, 0, 9) = 26.45045045045045
The MAE for order (1, 0, 10) = 27.07207207207207
The MAE for order (1, 0, 11) = 27.076576576576578
The MAE for order (1, 0, 12) = 27.09009009009009
The MAE for order (1, 1, 1) = 15.752252252252251
The MAE for order (1, 1, 2) = 15.882882882882884
The MAE for order (1, 2, 1) = 40.792792792792795
The MAE for order (1, 2, 2) = 39.734234234234236
The MAE for order (1, 2, 3) = 39.92342342342342
The MAE for order (1, 2, 4) = 40.44144144144144
The MAE for order (1, 2, 5) = 

The MAE for order (11, 0, 10) = 27.513513513513512
The MAE for order (11, 1, 1) = 16.58108108108108
The MAE for order (11, 1, 2) = 16.56756756756757
The MAE for order (11, 1, 3) = 16.59009009009009
The MAE for order (11, 1, 4) = 16.5990990990991
The MAE for order (11, 1, 5) = 16.756756756756758
The MAE for order (11, 1, 6) = 16.33783783783784
The MAE for order (11, 1, 7) = 16.75225225225225
The MAE for order (11, 1, 8) = 16.72072072072072
The MAE for order (11, 1, 9) = 16.743243243243242
The MAE for order (11, 1, 10) = 16.896396396396398
The MAE for order (11, 1, 11) = 16.815315315315317
The MAE for order (11, 2, 1) = 36.833333333333336
The MAE for order (11, 2, 2) = 39.828828828828826
The MAE for order (11, 2, 3) = 42.072072072072075
The MAE for order (11, 2, 4) = 25.153153153153152
The MAE for order (11, 2, 5) = 46.16216216216216
The MAE for order (11, 2, 6) = 41.927927927927925
The MAE for order (12, 0, 1) = 27.13063063063063
The MAE for order (12, 0, 2) = 27.22972972972973
The MAE 

In [24]:
iq_result = forecasting(data_train=iq_train, data_test = iq_test)

In [34]:
sj_val = validate[validate['city'] == 'sj']
iq_val = validate[validate['city'] == 'iq']

In [59]:
# Best parameters for San Juan (all data).
best_sj = ARIMA(sj_series, order = (1,1,2))
best_predict = best_sj.fit().forecast(steps = sj_val.shape[0])
best_predict = [int(round(x)) for x in best_predict[0]]

In [60]:
best_predict[:5]

[5, 5, 6, 6, 6]

In [37]:
sub_format = pd.read_csv('submission_format.csv')

In [55]:
# IQ
iq_model = ARIMA(iq_series_cut, order = (1,0,1))
iq_result = iq_model.fit().forecast(steps = iq_val.shape[0])
iq_int = [int(round(x)) for x in iq_result[0]]

In [61]:
all_submission = np.hstack((best_predict, iq_int))

In [62]:
sub_format.total_cases = all_submission

In [63]:
sub_format.to_csv('model_1.csv', index = False)