In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, RobustScaler
from sklearn.impute import SimpleImputer


from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.metrics import mean_absolute_error

import sys
sys.path.insert(0, '../scr')
from transformers import SelectFeatures, Impute, select_city, create_lagged_column

### Load the provided data

In [2]:
# Read the provided CSV files; in each file the first three columns form the row index.
index_columns = [0, 1, 2]
train_features = pd.read_csv('../data/dengue_features_train.csv', index_col=index_columns)
train_labels = pd.read_csv('../data/dengue_labels_train.csv', index_col=index_columns)
test_features = pd.read_csv('../data/dengue_features_test.csv', index_col=index_columns)

# Add lagged copies of two station measurements to both feature tables.
# The training and test tables are lagged independently of each other.
# NOTE(review): the lag is applied before the data is split by city -- confirm that
# create_lagged_column does not shift values across the boundary between cities.
for column, lag in (('station_precip_mm', 4), ('station_avg_temp_c', 18)):
    train_features = create_lagged_column(train_features, column, lag=lag)
    test_features = create_lagged_column(test_features, column, lag=lag)
                                    


### Splitting the provided training data by city and into train and validation data for each city

In [3]:
# City codes used as dictionary keys throughout the notebook, and their display names.
cities = ['sj', 'iq']
city_names = {'sj':'San Juan, Puerto Rico', 'iq':'Iquitos, Peru' }

# Per-city containers: the full training data, its train/validation parts,
# and the test features.
full_train_X, full_train_y = {}, {}
train_X, train_y = {}, {}
validate_X, validate_y = {}, {}
test_X = {}

for city in cities:
    features_city = select_city(train_features, city=city)
    labels_city = select_city(train_labels, city=city)

    full_train_X[city] = features_city
    full_train_y[city] = labels_city
    test_X[city] = select_city(test_features, city=city)

    # features and labels must describe the same number of rows
    assert len(features_city) == len(labels_city)

    # The first 80 % of the rows (in file order, no shuffling) are used for
    # training, the remaining rows for validation.
    split_idx = int(len(features_city) * 0.8)

    train_X[city] = features_city.iloc[:split_idx]
    train_y[city] = labels_city.iloc[:split_idx]

    validate_X[city] = features_city.iloc[split_idx:]
    validate_y[city] = labels_city.iloc[split_idx:]
    

Check that the split worked

In [4]:
# Print the shapes of the training, validation and full data sets for each city.
for city in cities:
    print(f'------------- {city_names[city]} -----------------')
    print('training')
    print(f'X {train_X[city].shape} y {train_y[city].shape}')
    print('validation')
    print(f'X {validate_X[city].shape} y {validate_y[city].shape}')
    print('all')
    # trailing " \n" keeps the blank line between the two city reports
    print(f'X {full_train_X[city].shape} y {full_train_y[city].shape} \n')

------------- San Juan, Puerto Rico -----------------
training
X (748, 23) y (748, 1)
validation
X (188, 23) y (188, 1)
all
X (936, 23) y (936, 1) 

------------- Iquitos, Peru -----------------
training
X (416, 23) y (416, 1)
validation
X (104, 23) y (104, 1)
all
X (520, 23) y (520, 1) 



### Build a pipeline

In [5]:
# Columns passed to the model: four measurements plus two lagged station columns.
# (Alternative tried earlier: every column except 'week_start_date'.)
myfeatures = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'station_min_temp_c',
    'station_precip_mm_lagged_4.00',
    'station_avg_temp_c_lagged_18.00',
]

# One pipeline per city, all with the same steps:
#   feature_selection -- SelectFeatures, configured with `myfeatures`
#   ffill_nans        -- Impute (forward fill)
#   median_nans       -- median imputation; added because forward fill fails when
#                        there is a NaN in the first row
#   scaler            -- RobustScaler
#   xgb               -- the model, appended as the final step
# Alternatives tried earlier: Ridge(alpha=.1) or ElasticNet(alpha=.1) as the final
# step, and PolynomialFeatures(degree=2) as an additional preprocessing step.
# NOTE(review): SelectFeatures and Impute are the same objects in both pipelines --
# confirm that they keep no fitted state.
pipe = {}
this_pipe = {}
for city in cities:
    preprocessing_steps = [
        ('feature_selection', SelectFeatures),
        ('ffill_nans', Impute),
        ('median_nans', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
    pipe[city] = Pipeline(steps=preprocessing_steps)

    # this_pipe[city] is assigned whatever set_params returns and then extended
    # with the model as the last step.
    this_pipe[city] = pipe[city].set_params(
        feature_selection__kw_args={'features': myfeatures}
    )
    this_pipe[city].steps.append(('xgb', xgb.XGBRegressor(max_depth=4)))

### Training one model per city

In [None]:
# Fit each city's pipeline on that city's training split; the fitted
# pipelines are kept in a dictionary keyed by city code.
model = {
    city: this_pipe[city].fit(train_X[city], train_y[city])
    for city in cities
}

### Predicting on training and validation data set for each city

In [None]:
# Predictions of each city's model on its own training and validation splits.
pred_train = {city: model[city].predict(train_X[city]) for city in cities}
pred_valid = {city: model[city].predict(validate_X[city]) for city in cities}

    

### Quality of the model

In [None]:
# Report the mean absolute error on the training and validation splits of each city.
print('MEAN ABSOLUTE ERROR\n')
for city in cities:
    print(f'------ {city_names[city]} ---------')
    mae_train = mean_absolute_error(train_y[city], pred_train[city])
    print(f'Training:   {mae_train:1.4f}')
    mae_valid = mean_absolute_error(validate_y[city], pred_valid[city])
    print(f'Validation: {mae_valid:1.4f}\n')

Results recorded from an earlier run, with lag 18 and poly degree 2:

MEAN ABSOLUTE ERROR

------ San Juan, Puerto Rico ---------
Training:   11.9508
Validation: 24.9103

------ Iquitos, Peru ---------
Training:   1.9890
Validation: 8.1461


with lag 20 in temp

MEAN ABSOLUTE ERROR

------ San Juan, Puerto Rico ---------
Training:   15.6495
Validation: 25.0402

------ Iquitos, Peru ---------
Training:   2.6335
Validation: 8.3078


without lags

MEAN ABSOLUTE ERROR

------ San Juan, Puerto Rico ---------
Training:   29.1376
Validation: 24.7177

------ Iquitos, Peru ---------
Training:   6.1009
Validation: 6.9307



In [None]:
# TODO: save the pipeline together with the errors and hyperparameters here (and later also a plot)

### Plot of predictions

In [None]:
# Plot true and predicted case counts: one row of panels per city, with the training
# data in the wide left panel and the validation data in the narrow right panel.

# Combine the true values and the predictions in one dataframe per city and data set.
# Attention: this relies on the predictions being in the same row order as the labels.
# `assign` returns a new frame, so train_y / validate_y keep only their label column
# and the training and scoring cells can still be re-run after this cell.
training_results = {}
validation_results = {}
for city in cities:
    training_results[city] = train_y[city].assign(pred=pred_train[city])
    validation_results[city] = validate_y[city].assign(pred=pred_valid[city])

# setting up figure with subplots
fig, axes = plt.subplots(2, 2, gridspec_kw = {'width_ratios':[2.5, 1]})
fig.set_size_inches(14,8)
axes = axes.flatten()
for i in range(4):
    axes[i].spines['top'].set_visible(False)
    if i%2:
        # right-hand (validation) panels: y axis ticks and label on the right side
        axes[i].yaxis.tick_right()
        axes[i].spines['left'].set_visible(False)
        axes[i].yaxis.set_label_position("right")
    else:
        axes[i].spines['right'].set_visible(False)

plt.subplots_adjust(hspace = 0.75, wspace = 0.08)

# plot the predictions and true values for each city and each data set (training/validation)
for i, city in enumerate(cities):
    training_results[city].plot(y = ['total_cases', 'pred'], ax = axes[2*i],
                                label = ['True', 'Prediction'], color = ['.75','orangered'])
    validation_results[city].plot(y = ['total_cases','pred'], ax = axes[2*i+1],
                                  label = ['True','Prediction'], color = ['.75','b'])
    axes[2*i].set_title(city_names[city])
    # both panels of a city share the same y range so they can be compared directly
    axes[2*i].set_ylim(0,full_train_y[city].total_cases.max())
    axes[2*i+1].set_ylim(0,full_train_y[city].total_cases.max())

# format axes
legend_title=['Training data','Validation data']*2
for i in range(4):
    axes[i].set_ylabel('cases')
    axes[i].set_xlabel('time (year, week)')
    # legend location code 1 (upper right) for training panels, 2 (upper left) for validation panels
    axes[i].legend(frameon=False, title = legend_title[i], loc = i%2+1)
    axes[i].tick_params(axis='x', labelrotation=45)
    axes[i].set_facecolor('.98')

### Predict the test data set

In [None]:
# Predictions of each city's model on that city's test features.
pred_test = {city: model[city].predict(test_X[city]) for city in cities}

### Get the prediction into the right format for submission

In [None]:
# Build the submission table: one integer `total_cases` value per test row, indexed by
# the city code followed by the index levels of the per-city test features.
results = []
for city in cities:
    # Case counts are non-negative integers: round to the nearest integer (a plain
    # integer cast would truncate toward zero) and clip negative predictions to zero.
    cases = np.clip(np.rint(pred_test[city]), 0, None).astype(int)
    results.append(pd.DataFrame({'total_cases': cases}, index=test_X[city].index))

# `keys` puts the city code in front of each part's index as a level named 'city'
final = pd.concat(results, keys=cities, names=['city'])
final.to_csv('../predictions/prediction.csv')