# Which lag is the best?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
sj = pd.read_excel('SanJuan_all.xlsx', index_col = 'Unnamed: 0')
iq = pd.read_excel('Iquitos_all.xlsx', index_col = 'Unnamed: 0')

In [3]:
# Replace Null with Month Mean
def mean_replace(data):
    monthly = dict()
    for month in range(1,13):
        monthly[month] = data[data.index.month == month]
        for col in monthly[month]:
            monthly[month][col] = monthly[month][col].fillna(np.mean(monthly[month][col]))
    new = pd.DataFrame(columns=data.columns)
    #for num in monthly.keys():
    #    new = np.vstack([new, monthly[num]])
    templist = list()
    for a in monthly.keys():
        templist.append(monthly[a])
    result = pd.concat(templist).sort_index()
    return result

From what I got from the Correlations notebook.
- For San Juan, use the log-transformed data with top three correlated features and the lag of 26?
- For Iquitos, use the ratio data by population with top three correlated features with the lag of 0

# GridSearch - San Juan

In [4]:
sj_x = sj.copy()
sj_y = sj_x.pop('total_cases')

In [5]:
sj_x = sj_x.iloc[:-26]
sj_y = sj_y.iloc[26:]

In [6]:
sj_x['total_cases'] = sj_y
sj_26corr = sj_x.corr()['total_cases'].sort_values(ascending = False)

In [7]:
sj_26corr

total_cases                              1.000000
reanalysis_specific_humidity_g_per_kg    0.214854
reanalysis_dew_point_temp_k              0.210690
station_avg_temp_c                       0.204662
station_max_temp_c                       0.200575
reanalysis_max_air_temp_k                0.199302
reanalysis_min_air_temp_k                0.192839
reanalysis_air_temp_k                    0.185743
station_min_temp_c                       0.181731
reanalysis_avg_temp_k                    0.178910
reanalysis_relative_humidity_percent     0.152666
reanalysis_precip_amt_kg_per_m2          0.111116
ndvi_nw                                  0.091284
reanalysis_sat_precip_amt_mm             0.060250
precipitation_amt_mm                     0.060250
station_precip_mm                        0.050884
ndvi_ne                                  0.046965
station_diur_temp_rng_c                  0.039125
ndvi_se                                  0.007234
ndvi_sw                                  0.005894


In [8]:
sj_features = sj_26corr[1:11]
sj_features.index

Index(['reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
       'station_avg_temp_c', 'station_max_temp_c', 'reanalysis_max_air_temp_k',
       'reanalysis_min_air_temp_k', 'reanalysis_air_temp_k',
       'station_min_temp_c', 'reanalysis_avg_temp_k',
       'reanalysis_relative_humidity_percent'],
      dtype='object')

In [9]:
sj_used = sj_x[sj_features.index]
sj_used = mean_replace(sj_used)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [10]:
sj_pretrain = sj_used.iloc[:sj_used.shape[0] - sj_y.isnull().sum()]
sj_try = sj_used.iloc[-sj_y.isnull().sum():]

In [11]:
sj_y = sj_y.dropna()
sj_y.shape[0] == sj_pretrain.shape[0] # check if the numbers of rows are equal

True

In [12]:
train_size = int(0.7 * sj_y.shape[0])
sj_x_train, sj_x_test = sj_pretrain.iloc[:train_size], sj_pretrain.iloc[train_size:]
sj_y_train, sj_y_test = sj_y.iloc[:train_size], sj_y.iloc[train_size:]

sj_y_train = np.log1p(sj_y_train)

## Support Vector Regression

In [40]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Algorithms used for modeling
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, RANSACRegressor, HuberRegressor, PassiveAggressiveRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
print('Algorithm packages imported!')

# Model selection packages used for sampling dataset and optimising parameters
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
print('Model selection packages imported!')

import warnings
warnings.filterwarnings('ignore')

Algorithm packages imported!
Model selection packages imported!


In [14]:
svr_sj = SVR(kernel='linear')
svr_sj_fit = svr_sj.fit(sj_x_train, sj_y_train)
svr_sj_pred = svr_sj_fit.predict(sj_x_test)
svr_sj_pred = [int(x) for x in np.expm1(svr_sj_pred)]

In [15]:
mean_absolute_error(svr_sj_pred, sj_y_test)

13.747252747252746

Try GridSearch

In [43]:
models = [SVR(), KernelRidge(), ElasticNet(), Lasso(), GradientBoostingRegressor(), BayesianRidge(), 
          LassoLarsIC(), RandomForestRegressor(), xgb.XGBRegressor(), RANSACRegressor(), HuberRegressor(),
          PassiveAggressiveRegressor()]

SVR_param_grid = {'kernel':['linear', 'rbf']}
KR_param_grid = {'alpha': [0.1, 0.2, 0.3], 'coef0': [100], 'degree': [1, 2], 'gamma': [None], 
                 'kernel': ['linear']}
EN_param_grid = {'alpha': [0.001, 0.005, 0.1], 'copy_X': [True], 'l1_ratio': [0.3, 0.6, 0.8], 
                 'fit_intercept': [True], 'normalize': [False], 
                 'precompute': [False], 'max_iter': [300, 900], 
                 'tol': [0.0005, 0.001, 0.002], 'selection': ['random'], 
                 'random_state': [None]}
LASS_param_grid = {'alpha': [0.0005, 0.001, 0.005], 'copy_X': [True], 
                   'fit_intercept': [True], 'normalize': [False], 'precompute': [False], 
                   'max_iter': [300], 'tol': [0.01, 0.05, 0.1], 
                   'selection': ['random'], 'random_state': [None]}
GB_param_grid = {'loss': ['huber'], 'learning_rate': [0.01, 0.1, 0.3], 
                 'n_estimators': [300, 1000], 'max_depth': [3, 5], 
                 'min_samples_split': [0.0025, 0.005], 'min_samples_leaf': [3, 5, 7]}
BR_param_grid = {'n_iter': [200, 600], 'tol': [0.00001, 0.0001], 
                 'alpha_1': [0.00000001, 0.0000001, 0.000005], 
                 'alpha_2': [0.000005, 0.00001], 'lambda_1': [0.000005, 0.00001, 0.00005], 
                 'lambda_2': [0.00000001, 0.0000001], 'copy_X': [True]}
LL_param_grid = {'criterion': ['aic'], 'normalize': [True], 
                 'max_iter': [100, 500], 'copy_X': [True], 'precompute': ['auto'], 
                 'eps': [0.000001, 0.00001]}
RFR_param_grid = {'n_estimators': [50, 100, 200], 'max_features': ['auto'], 
                  'max_depth': [None, 2], 'min_samples_split': [5, 10], 
                  'min_samples_leaf': [2]}
XGB_param_grid = {'max_depth': [3], 'learning_rate': [0.1], 'n_estimators': [300], 
                  'booster': ['gbtree'], 'gamma': [0], 'reg_alpha': [0.1],
                  'reg_lambda': [0.7], 'max_delta_step': [0], 'min_child_weight': [1], 
                  'colsample_bytree': [0.5], 'colsample_bylevel': [0.2],
                  'scale_pos_weight': [1]}
RANSAC_param_grid = {'min_samples': [0.2, 0.5, 0.8], 'stop_probability': [0.1, 0.4, 0.8], 'max_skips': [10,30,50]}
HB_param_grid = {'epsilon':[1.1, 1.35, 1.7, 2], 'max_iter':[100,300,500], 'alpha':[0.00001, 0.0001, 0.005, 0.01],
                 'tol':[1e-5, 5e-5, 1e-4, 5e-4]}
PAR_param_grid = {'C':[1.0, 1.5, 2], 'max_iter':[500,1000,5000], 'tol':[5e-4, 1e-3, 5e-3, 1e-2]}


params_grid = [SVR_param_grid, KR_param_grid, EN_param_grid, LASS_param_grid, GB_param_grid, BR_param_grid, 
               LL_param_grid, RFR_param_grid, XGB_param_grid, RANSAC_param_grid, HB_param_grid, PAR_param_grid]

In [44]:
for algo in models:
    gridsearch = GridSearchCV(algo, param_grid=params_grid[0], scoring='neg_mean_absolute_error')
    params_grid.pop(0)
    
    gridsearch.fit(sj_x_train, sj_y_train)
    grid_best = gridsearch.best_estimator_
    algo_score = gridsearch.best_score_
    prediction = [int(x) for x in np.expm1(gridsearch.predict(sj_x_test))]
    test_score = mean_absolute_error(prediction, sj_y_test)
    print('Algorithm:', algo)
    print('   Training Error =', algo_score)
    print('   Testing Error =', test_score)
    print('--------------\n')

Algorithm: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)
   Training Error = -0.8085489151930978
   Testing Error = 13.747252747252746
--------------

Algorithm: KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)
   Training Error = -0.8229176693121404
   Testing Error = 13.71062271062271
--------------

Algorithm: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
   Training Error = -0.8127106277951018
   Testing Error = 13.454212454212454
--------------

Algorithm: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_s

- For the best three features, RFR is the best model with testing error = 14.86
- For the best eight features, RFR, 13.59
- For the best ten features, Lasso, 13.40

In [60]:
sj_model = RANSACRegressor(base_estimator=None, is_data_valid=None, is_model_valid=None,
                loss='absolute_loss', max_trials=100,
                min_samples=None, random_state=None, residual_threshold=None,
                stop_probability=0.99)

In [61]:
sj_model.fit(np.vstack((sj_x_train, sj_x_test)), np.hstack((np.expm1(sj_y_train), sj_y_test)))

RANSACRegressor(base_estimator=None, is_data_valid=None, is_model_valid=None,
                loss='absolute_loss', max_skips=inf, max_trials=100,
                min_samples=None, random_state=None, residual_threshold=None,
                stop_n_inliers=inf, stop_probability=0.99, stop_score=inf)

In [62]:
sj_result = [int(x) for x in sj_model.predict(sj_try)]

# GridSearch - Iquitos

In [45]:
iq_corr = iq.corr()['total_cases'].sort_values(ascending = False)
iq_features = iq_corr[1:5].index
iq_features

Index(['reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
       'reanalysis_min_air_temp_k', 'station_min_temp_c'],
      dtype='object')

In [46]:
iq_corr

total_cases                              1.000000
reanalysis_specific_humidity_g_per_kg    0.236476
reanalysis_dew_point_temp_k              0.230401
reanalysis_min_air_temp_k                0.214514
station_min_temp_c                       0.211702
reanalysis_relative_humidity_percent     0.130083
station_avg_temp_c                       0.113070
reanalysis_precip_amt_kg_per_m2          0.101171
reanalysis_air_temp_k                    0.097098
reanalysis_sat_precip_amt_mm             0.090171
precipitation_amt_mm                     0.090171
reanalysis_avg_temp_k                    0.079872
station_max_temp_c                       0.075279
station_precip_mm                        0.042976
ndvi_sw                                  0.032999
ndvi_ne                                  0.020215
ndvi_nw                                 -0.009586
ndvi_se                                 -0.041067
reanalysis_max_air_temp_k               -0.056474
station_diur_temp_rng_c                 -0.058230


In [47]:
iq_temp_used = iq[iq_features]

In [48]:
train_size = iq_temp_used.shape[0] - iq.total_cases.isnull().sum()

iq_pretrain, iq_try = iq.iloc[:train_size], iq.iloc[train_size:].drop('total_cases', axis = 1)
iq_pretrain = mean_replace(iq_pretrain)

In [49]:
iq_pop = pd.read_csv('iq_pop.csv')

In [50]:
split_number = int(0.7 * iq_pretrain.shape[0])
iq_x_train, iq_x_test = iq_pretrain.iloc[:split_number], iq_pretrain.iloc[split_number:]
iq_y_train = iq_x_train.pop('total_cases')
iq_y_test = iq_x_test.pop('total_cases')

In [51]:
for i in range(iq_y_train.shape[0]):
    iq_y_train.iloc[i] = 100000 * iq_y_train.iloc[i] / iq_pop['Estimated_population'].iloc[np.argwhere(iq_pop['Year'] == iq_y_train.index[i].year)[0][0]]

In [52]:
for i in range(iq_y_test.shape[0]):
    iq_y_test.iloc[i] = 100000 * iq_y_test.iloc[i] / iq_pop['Estimated_population'].iloc[np.argwhere(iq_pop['Year'] == iq_y_test.index[i].year)[0][0]]

In [53]:
models = [SVR(), KernelRidge(), ElasticNet(), Lasso(), GradientBoostingRegressor(), BayesianRidge(), 
          LassoLarsIC(), RandomForestRegressor(), xgb.XGBRegressor(), RANSACRegressor(), HuberRegressor(),
          PassiveAggressiveRegressor()]

SVR_param_grid = {'kernel':['linear', 'rbf']}
KR_param_grid = {'alpha': [0.1, 0.2, 0.3], 'coef0': [100], 'degree': [1, 2], 'gamma': [None], 
                 'kernel': ['linear']}
EN_param_grid = {'alpha': [0.001, 0.005, 0.1], 'copy_X': [True], 'l1_ratio': [0.3, 0.6, 0.8], 
                 'fit_intercept': [True], 'normalize': [False], 
                 'precompute': [False], 'max_iter': [300, 900], 
                 'tol': [0.0005, 0.001, 0.002], 'selection': ['random'], 
                 'random_state': [None]}
LASS_param_grid = {'alpha': [0.0005, 0.001, 0.005], 'copy_X': [True], 
                   'fit_intercept': [True], 'normalize': [False], 'precompute': [False], 
                   'max_iter': [300], 'tol': [0.01, 0.05, 0.1], 
                   'selection': ['random'], 'random_state': [None]}
GB_param_grid = {'loss': ['huber'], 'learning_rate': [0.01, 0.1, 0.3], 
                 'n_estimators': [300, 1000], 'max_depth': [3, 5], 
                 'min_samples_split': [0.0025, 0.005], 'min_samples_leaf': [3, 5, 7]}
BR_param_grid = {'n_iter': [200, 600], 'tol': [0.00001, 0.0001], 
                 'alpha_1': [0.00000001, 0.0000001, 0.000005], 
                 'alpha_2': [0.000005, 0.00001], 'lambda_1': [0.000005, 0.00001, 0.00005], 
                 'lambda_2': [0.00000001, 0.0000001], 'copy_X': [True]}
LL_param_grid = {'criterion': ['aic'], 'normalize': [True], 
                 'max_iter': [100, 500], 'copy_X': [True], 'precompute': ['auto'], 
                 'eps': [0.000001, 0.00001]}
RFR_param_grid = {'n_estimators': [50, 100, 200], 'max_features': ['auto'], 
                  'max_depth': [None, 2], 'min_samples_split': [5, 10], 
                  'min_samples_leaf': [2]}
XGB_param_grid = {'max_depth': [3], 'learning_rate': [0.1], 'n_estimators': [300], 
                  'booster': ['gbtree'], 'gamma': [0], 'reg_alpha': [0.1],
                  'reg_lambda': [0.7], 'max_delta_step': [0], 'min_child_weight': [1], 
                  'colsample_bytree': [0.5], 'colsample_bylevel': [0.2],
                  'scale_pos_weight': [1]}
RANSAC_param_grid = {'min_samples': [0.2, 0.5, 0.8], 'stop_probability': [0.1, 0.4, 0.8], 'max_skips': [10,30,50]}
HB_param_grid = {'epsilon':[1.1, 1.35, 1.7, 2], 'max_iter':[100,300,500], 'alpha':[0.00001, 0.0001, 0.005, 0.01],
                 'tol':[1e-5, 5e-5, 1e-4, 5e-4]}
PAR_param_grid = {'C':[1.0, 1.5, 2], 'max_iter':[500,1000,5000], 'tol':[5e-4, 1e-3, 5e-3, 1e-2]}


params_grid = [SVR_param_grid, KR_param_grid, EN_param_grid, LASS_param_grid, GB_param_grid, BR_param_grid, 
               LL_param_grid, RFR_param_grid, XGB_param_grid, RANSAC_param_grid, HB_param_grid, PAR_param_grid]

In [54]:
for algo in models:
    gridsearch = GridSearchCV(algo, param_grid=params_grid[0], scoring='neg_mean_absolute_error')
    params_grid.pop(0)
    
    gridsearch.fit(iq_x_train, iq_y_train)
    grid_best = gridsearch.best_estimator_
    algo_score = gridsearch.best_score_
    prediction = [int(x) for x in gridsearch.predict(iq_x_test)]
    test_score = mean_absolute_error(prediction, iq_y_test)
    print('Algorithm:', algo)
    print('   Training Error =', algo_score)
    print('   Testing Error =', test_score)
    print('--------------\n')

Algorithm: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)
   Training Error = -1.334788045361365
   Testing Error = 1.907431380003696
--------------

Algorithm: KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)
   Training Error = -1.5230954554851093
   Testing Error = 1.738513156920523
--------------

Algorithm: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
   Training Error = -1.4447027625361486
   Testing Error = 1.7088325770928476
--------------

Algorithm: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_sta

For the best three features, LassoLarsIC is the best model with testing error = 1.67

In [63]:
iq_model = LassoLarsIC(copy_X=True, criterion='aic', eps=2.220446049250313e-16,
            fit_intercept=True, max_iter=500, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [64]:
iq_model.fit(np.vstack((iq_x_train, iq_x_test)), np.hstack((iq_y_train, iq_y_test)))

LassoLarsIC(copy_X=True, criterion='aic', eps=2.220446049250313e-16,
            fit_intercept=True, max_iter=500, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [65]:
iq_result = iq_model.predict(iq_x_test)

In [66]:
for i in range(len(iq_result)):
    iq_result[i] = iq_result[i] * iq_pop['Estimated_population'].iloc[np.argwhere(iq_pop['Year'] == iq_x_test.index[i].year)[0][0]] / 100000
    iq_result[i] = int(iq_result[i])

In [67]:
iq_result[:10]

array([3., 4., 6., 5., 2., 2., 6., 6., 5., 7.])

In [68]:
all_result = [int(i) for i in np.hstack((sj_result, iq_result))]

In [69]:
sub = pd.read_csv('submission_format.csv')

In [70]:
sub.total_cases = all_result

In [71]:
sub.to_csv('Results/10log_0x10e5_ransac_llic.csv', index = False)