#### importing the relevant packages and loading the data

In [1]:
import statsmodels.api as sm

from warnings import filterwarnings

filterwarnings('ignore')

from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

  from pandas.core import datetools


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
# from sklearn.feature_selection import RFE

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from scipy import stats

from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

In [27]:
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

#### Pre Processing

In [26]:
def impute(X):
    
    # remove the colum that has ~20% null values, also ranks low on feature importance
    X.drop(['ndvi_ne'], axis=1, inplace=True)
    
    # Filling the rest using linear interpolation
    X.interpolate(inplace=True)

def remove_outliers(df):
    return df[(np.abs(stats.zscore(df)) < 5).all(axis=1)]

def mape(Y_test, Y_pred, epsilon = 1):
    return np.mean(np.abs((Y_test - Y_pred + epsilon) / (Y_test + epsilon))) * 100

def extract_month(s):
    return int(s[5:7])

def city_indices(X):
    # city boolean encoding
    return X.city == 'sj'

def pre_process(X, trees = False):
    """
    Extracts the month out of date and converts it to a one hot
    Standardizes the numerical features
    """
    
    #Extracting month from the date
    months = X.week_start_date.apply(extract_month)

    # Removing the columns not required for classification
    X.drop(['city', 'year', 'weekofyear', 'week_start_date'], axis=1, inplace=True)

    # Standardizing the data
    if not trees:
        scaler = StandardScaler()
        X[X.columns] = scaler.fit_transform(X)

    # Month one hot features
    month_features = pd.get_dummies(months, prefix='m_')
    X = X.join(month_features)

    # Alternatively use months as a discrete feature
    #X = X.join(months)
    
    return X

def seperate_cities_data(X, is_sj):

    # Seperating the cities data
    X_sj = X.loc[is_sj]
    X_iq = X.loc[~is_sj]
    
    return X_sj, X_iq

def get_y_labels(X_sj, X_iq, y):    
    
    y = y.total_cases    
    y_sj = y.loc[X_sj.index]
    y_iq = y.loc[X_iq.index]
    
    return y_sj, y_iq

def split(X_sj, X_iq, y_sj, y_iq):

    # train and test split
    sj_split_data = train_test_split(X_sj, y_sj, shuffle = False)
    iq_split_data = train_test_split(X_iq, y_iq, shuffle = False)

    return sj_split_data, iq_split_data

def process(X, y = pd.Series(), train = True, trees = False):
    
    is_sj = city_indices(X)
    if not trees:
        impute(X)
    X = pre_process(X, trees)
    
    #X = remove_outliers(X)
    X_sj, X_iq = seperate_cities_data(X, is_sj)
    if y.empty:
        return X_sj, X_iq
    
    y_sj, y_iq = get_y_labels(X_sj, X_iq, y)
    if not train:
        return X_sj, X_iq, y_sj, y_iq
    
    return split(X_sj, X_iq, y_sj, y_iq)

In [28]:
data = process(X,y)
(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

#### Negative Binomial

In [39]:
model_formula = "y ~ " \
                "reanalysis_specific_humidity_g_per_kg + " \
                "reanalysis_dew_point_temp_k + " \
                "reanalysis_max_air_temp_k + " \
                "station_min_temp_c"

In [29]:
train_sj = X_sj_train.copy()
train_sj['y'] = Y_sj_train

test_sj = X_sj_test.copy()

In [30]:
formula = ' + '.join([str(i) for i in list(X_sj_train.columns)])
formula = 'y ~ ' + formula

In [31]:
model = smf.glm(formula=formula,
                data=train_sj,
                family=sm.families.NegativeBinomial())

model = model.fit()

In [32]:
predictions = model.predict(test_sj).astype(int)

In [33]:
score = eval_measures.meanabs(predictions, Y_sj_test)

In [34]:
score

22.602564102564102

#### time based cross validations

In [40]:
from sklearn.model_selection import TimeSeriesSplit

splits = TimeSeriesSplit(n_splits=10)
for train_index, test_index in splits.split(X_sj):
    train = X_sj.iloc[train_index]
    test = X_sj.iloc[test_index]

    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    
    model = smf.glm(formula=formula,
                    data=train,
                    family=sm.families.NegativeBinomial())
    model = model.fit()
    y_pred = model.predict(test)
    y_test = test['y']
    
    print('The error is :', mean_absolute_error(y_pred, y_test))
    
    model = smf.glm(formula=model_formula,
                data=train,
                family=sm.families.NegativeBinomial())
    model = model.fit()
    y_pred = model.predict(test)
    
    print('The improved error is :', mean_absolute_error(y_pred, y_test))
    print('\n')

Observations: 171
Training Observations: 86
Testing Observations: 85
The error is : 25.050005033552154
The improved error is : 18.769353415979243


Observations: 256
Training Observations: 171
Testing Observations: 85
The error is : 64.07987397359854
The improved error is : 66.34764931813167


Observations: 341
Training Observations: 256
Testing Observations: 85
The error is : 30.09983436684409
The improved error is : 40.897010830527115


Observations: 426
Training Observations: 341
Testing Observations: 85
The error is : 21.327720310751932
The improved error is : 21.92113281643686


Observations: 511
Training Observations: 426
Testing Observations: 85
The error is : 35.74883733509669
The improved error is : 37.342354312131576


Observations: 596
Training Observations: 511
Testing Observations: 85
The error is : 22.09534515183008
The improved error is : 35.02266300010787


Observations: 681
Training Observations: 596
Testing Observations: 85
The error is : 31.317828359706446
The improv

In [41]:
from sklearn.model_selection import TimeSeriesSplit

splits = TimeSeriesSplit(n_splits=10)
for train_index, test_index in splits.split(X_iq):
    train = X_sj.iloc[train_index]
    test = X_sj.iloc[test_index]

    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    
    model = smf.glm(formula=formula,
                    data=train,
                    family=sm.families.NegativeBinomial())
    model = model.fit()
    y_pred = model.predict(test)
    y_test = test['y']
    
    print('The error is :', mean_absolute_error(y_pred, y_test))
    
    model = smf.glm(formula=model_formula,
                data=train,
                family=sm.families.NegativeBinomial())
    model = model.fit()
    y_pred = model.predict(test)
    
    print('The improved error is :', mean_absolute_error(y_pred, y_test))
    print('\n')

Observations: 97
Training Observations: 50
Testing Observations: 47
The error is : 44.18153077923765
The improved error is : 48.9891672180699


Observations: 144
Training Observations: 97
Testing Observations: 47
The error is : 34.4126045107896
The improved error is : 13.971000057019054


Observations: 191
Training Observations: 144
Testing Observations: 47
The error is : 22.30170448669125
The improved error is : 22.097412942598062


Observations: 238
Training Observations: 191
Testing Observations: 47
The error is : 81.17416032340172
The improved error is : 87.31470133986242


Observations: 285
Training Observations: 238
Testing Observations: 47
The error is : 32.96837664430572
The improved error is : 57.29418404697074


Observations: 332
Training Observations: 285
Testing Observations: 47
The error is : 32.42788905613295
The improved error is : 32.887411454495606


Observations: 379
Training Observations: 332
Testing Observations: 47
The error is : 16.475065463826468
The improved err

#### Submission

In [35]:
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

In [36]:
X_sj, X_iq, y_sj, y_iq = process(X, y, train = False)
X_sj_test, X_iq_test = process(test)

In [37]:
X_sj['y'] = y_sj

In [38]:
X_iq['y'] = y_iq

In [105]:
model = smf.glm(formula=formula,
                data=X_sj,
                family=sm.families.NegativeBinomial())
model = model.fit()
Y_pred_sj = model.predict(X_sj_test)

model = smf.glm(formula=formula,
                data=X_iq,
                family=sm.families.NegativeBinomial())
model = model.fit()
Y_pred_iq = model.predict(X_iq_test)

Y_pred = np.concatenate([Y_pred_sj, Y_pred_iq])

In [110]:
test = pd.read_csv("dengue_features_test.csv")
submission = test.loc[:,['city', 'year', 'weekofyear']].copy()
submission['total_cases'] = Y_pred.astype(int)
submission.to_csv('submission_5.csv', index=False)