In [1]:
import numpy as np
import pandas as pd
import pickle
from src import constants
from src.data.data_processing import preprocess_data

In [2]:
# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

### REFLECTING ON OUR PERFORMANCE

REFLECTING ON OUR PERFORMANCE
These graphs can actually tell us a lot about where our model is going wrong and give us some good hints about where investments will improve the model performance. For example, we see that our model in blue does track the seasonality of dengue cases. However, the timing of the seasonality of our predictions has a mismatch with the actual results. One potential reason for this is that our features don't look far enough into the past--that is to say, we are asking to predict cases at the same time as we are measuring precipitation. Because dengue is mosquito-borne, and the mosquito lifecycle depends on water, we need to take both the life of a mosquito and the time between infection and symptoms into account when modeling dengue. This is a critical avenue to explore when improving this model.

The other important error is that our predictions are relatively consistent--we miss the spikes that are large outbreaks. One reason is that we don't take into account the contagiousness of dengue. A possible way to account for this is to build a model that progressively predicts a new value while taking into account the previous prediction. By training on the dengue outbreaks and then using the predicted number of patients in the week before, we can start to model this time dependence that the current model misses.

So, we know we're not going to win this thing, but let's submit the model anyway!

In [3]:
# Unpickle the two saved "best" models, one per path constant.
# pickle.load can execute arbitrary code, so these files must be trusted.
with open(constants.SJ_BEST_MODEL_PATH, mode="rb") as sj_model_file:
    sj_best_model = pickle.load(sj_model_file)

with open(constants.IQ_BEST_MODEL_PATH, mode="rb") as iq_model_file:
    iq_best_model = pickle.load(iq_model_file)

In [4]:
# Build the submission file: preprocess the raw test features, predict with
# each model, and write the predictions into the submission template.
sj_test, iq_test = preprocess_data(constants.TEST_FEATURE_PATH_RAW)

# NOTE: if predict() returns floats, astype(int) truncates toward zero
# rather than rounding to the nearest integer.
sj_predictions = sj_best_model.predict(sj_test).astype(int)
iq_predictions = iq_best_model.predict(iq_test).astype(int)

# The first three CSV columns become the row index of the template.
submission = pd.read_csv(constants.TEST_RESULTS_PATH_RAW,
                         index_col=[0, 1, 2])

# Use bracket assignment: attribute-style assignment only updates a column
# that already exists; otherwise it sets a plain Python attribute and the
# written CSV would carry no predictions (the pandas warning about this is
# hidden by the global warnings filter).
# Assumes the template lists all sj rows before the iq rows -- TODO confirm.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv(constants.TEST_RESULTS_PATH_PROCESSED)

### REFLECTING ON OUR PERFORMANCE

REFLECTING ON OUR PERFORMANCE
These graphs can actually tell us a lot about where our model is going wrong and give us some good hints about where investments will improve the model performance. For example, we see that our model in blue does track the seasonality of dengue cases. However, the timing of the seasonality of our predictions has a mismatch with the actual results. One potential reason for this is that our features don't look far enough into the past--that is to say, we are asking to predict cases at the same time as we are measuring precipitation. Because dengue is mosquito-borne, and the mosquito lifecycle depends on water, we need to take both the life of a mosquito and the time between infection and symptoms into account when modeling dengue. This is a critical avenue to explore when improving this model.

The other important error is that our predictions are relatively consistent--we miss the spikes that are large outbreaks. One reason is that we don't take into account the contagiousness of dengue. A possible way to account for this is to build a model that progressively predicts a new value while taking into account the previous prediction. By training on the dengue outbreaks and then using the predicted number of patients in the week before, we can start to model this time dependence that the current model misses.

So, we know we're not going to win this thing, but let's submit the model anyway!

In [5]:
# NOTE(review): this cell repeats the earlier prediction/submission cell and
# overwrites the same output file; it is kept so the notebook runs as written.
# Build the submission file: preprocess the raw test features, predict with
# each model, and write the predictions into the submission template.
sj_test, iq_test = preprocess_data(constants.TEST_FEATURE_PATH_RAW)

# NOTE: if predict() returns floats, astype(int) truncates toward zero
# rather than rounding to the nearest integer.
sj_predictions = sj_best_model.predict(sj_test).astype(int)
iq_predictions = iq_best_model.predict(iq_test).astype(int)

# The first three CSV columns become the row index of the template.
submission = pd.read_csv(constants.TEST_RESULTS_PATH_RAW,
                         index_col=[0, 1, 2])

# Use bracket assignment: attribute-style assignment only updates a column
# that already exists; otherwise it sets a plain Python attribute and the
# written CSV would carry no predictions (the pandas warning about this is
# hidden by the global warnings filter).
# Assumes the template lists all sj rows before the iq rows -- TODO confirm.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv(constants.TEST_RESULTS_PATH_PROCESSED)