#### Importing the relevant packages and loading the data

In [1]:
import statsmodels.api as sm

from warnings import filterwarnings

filterwarnings('ignore')

from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

  from pandas.core import datetools


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
# from sklearn.feature_selection import RFE

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from scipy import stats

from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [3]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#### Pre Processing

In [4]:
def impute(X):
    """
    Fill missing feature values in place.

    The 'ndvi_ne' column is removed first (per the original author it is
    about 20% null and ranks low on feature importance). Every remaining gap
    is then filled by linear interpolation down each column.

    X is modified in place; nothing is returned.
    """
    X.drop(labels=['ndvi_ne'], axis='columns', inplace=True)
    X.interpolate(method='linear', inplace=True)

def remove_outliers(df):
    """
    Keep only the rows in which every column lies within 5 standard
    deviations of that column's mean (absolute z-score strictly below 5).
    """
    abs_z = np.abs(stats.zscore(df))
    within_limits = (abs_z < 5).all(axis=1)
    return df[within_limits]

def mape(Y_test, Y_pred, epsilon = 1):
    """
    Mean absolute percentage error, expressed in percent.

    Y_test  : true values (array-like supporting element-wise arithmetic)
    Y_pred  : predicted values, same shape as Y_test
    epsilon : added to the denominator so that true values of zero do not
              cause a division by zero

    epsilon belongs in the denominator only. Adding it to the numerator as
    well would report a non-zero error for a perfect prediction and shift
    every other error by a constant.
    """
    return np.mean(np.abs((Y_test - Y_pred) / (Y_test + epsilon))) * 100

def extract_month(s):
    """Return characters 5-6 of s (the MM field of a 'YYYY-MM-DD' string) as an int."""
    month_field = s[slice(5, 7)]
    return int(month_field)

def city_indices(X):
    """Return a boolean Series that is True on the rows whose city is 'sj'."""
    return X['city'].eq('sj')

def pre_process(X, y, trees = False):
    """
    Turn the raw feature frame into model-ready features.

    The month is parsed out of week_start_date and appended as one-hot
    columns (prefix 'm_'). The city, year, weekofyear and week_start_date
    columns are dropped. Unless trees is true, the remaining columns are
    standardized with a StandardScaler fitted on X itself.

    X is modified in place (column drop and scaling); the returned frame is a
    new object that additionally carries the month columns. y is accepted but
    not used here. (A response-coding alternative for the month was tried by
    the original author and left disabled.)
    """
    # Capture the month before the date column is dropped below.
    month_of_row = X.week_start_date.apply(extract_month)

    # These identifier / calendar columns are not fed to the model.
    X.drop(labels=['city', 'year', 'weekofyear', 'week_start_date'], axis='columns', inplace=True)

    # Scaling is skipped when trees is true -- presumably because tree-based
    # models are insensitive to feature scale.
    if not trees:
        X[X.columns] = StandardScaler().fit_transform(X)

    # The month indicators are joined after scaling, so they stay unscaled.
    month_dummies = pd.get_dummies(month_of_row, prefix='m_')
    return X.join(month_dummies)

def seperate_cities_data(X, is_sj):
    """
    Split X by the boolean mask is_sj.

    Returns (X_sj, X_iq): the rows where the mask is True, then the rows
    where it is False.
    """
    is_iq = ~is_sj
    return X.loc[is_sj], X.loc[is_iq]

def get_y_labels(X_sj, X_iq, y):
    """
    Pick the total_cases labels that belong to each city's feature rows.

    Labels are looked up by index label, so y must share its index with the
    frame that X_sj and X_iq were sliced from. Returns (y_sj, y_iq).
    """
    cases = y['total_cases']
    return cases.loc[X_sj.index], cases.loc[X_iq.index]

def split(X_sj, X_iq, y_sj, y_iq):
    """
    Split each city's data into train and test parts without shuffling, so
    the original row order is kept and the test part is the tail.

    Returns (sj_split, iq_split); each element is the
    [X_train, X_test, y_train, y_test] list produced by train_test_split.
    """
    return tuple(
        train_test_split(features, labels, shuffle = False)
        for features, labels in ((X_sj, y_sj), (X_iq, y_iq))
    )

def process(X, y = None, train = True, trees = False, feature_selection = 0, time_shift = 0):
    """
    Run the full preparation pipeline and return per-city data.

    Parameters
    ----------
    X : raw feature frame. It is modified in place by impute() and
        pre_process(), so pass a fresh frame (or a copy) on every call.
    y : label frame with a total_cases column, or None / empty when only
        features are available (e.g. the competition test file).
    train : when true, each city's data is further split into train and
        test parts by split().
    trees : when true, imputation and standardization are skipped.
    feature_selection : if non-zero, keep only the k best columns according
        to SelectKBest(f_regression). Requires y.
    time_shift : if non-zero, pair the features of row t with the label of
        row t + time_shift (a positive value predicts that many rows ahead).

    Returns
    -------
    (X_sj, X_iq)                      if y is empty
    (X_sj, X_iq, y_sj, y_iq)          if train is false
    (sj_split_data, iq_split_data)    otherwise, as returned by split()
    """
    # An empty Series is the "no labels" sentinel. It is built per call rather
    # than as a default argument evaluated once at definition time.
    if y is None:
        y = pd.Series(dtype=float)

    # The city mask must be taken before pre_process() drops the city column.
    is_sj = city_indices(X)
    if not trees:
        impute(X)
    X = pre_process(X, y, trees)

    if feature_selection:
        selector = SelectKBest(f_regression, k=feature_selection).fit(X, y.total_cases)
        X = X.loc[:, selector.get_support()]

    # Outlier removal is intentionally disabled: X = remove_outliers(X)
    X_sj, X_iq = seperate_cities_data(X, is_sj)
    if y.empty:
        return X_sj, X_iq

    y_sj, y_iq = get_y_labels(X_sj, X_iq, y)
    if time_shift:
        # Pull future labels back onto the current row so that X and y keep
        # identical index labels. The previous shift(+k) with X[:-k] left the
        # two on different labels while pairing row i with label i
        # positionally, i.e. no lag at all.
        y_sj = y_sj.shift(-time_shift).dropna()
        y_iq = y_iq.shift(-time_shift).dropna()
        # Keep exactly the feature rows that still have a label.
        X_sj = X_sj.loc[y_sj.index]
        X_iq = X_iq.loc[y_iq.index]

    if not train:
        return X_sj, X_iq, y_sj, y_iq

    return split(X_sj, X_iq, y_sj, y_iq)

In [5]:
# Load the training features, the training labels and the test features.
X, y, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

In [6]:
# Prepare the features and split each city chronologically. Note that
# process() modifies X in place.
data = process(X, y)

# data is (sj_split, iq_split); each split is [X_train, X_test, y_train, y_test].
sj_parts, iq_parts = data
X_sj_train, X_sj_test, Y_sj_train, Y_sj_test = sj_parts
X_iq_train, X_iq_test, Y_iq_train, Y_iq_test = iq_parts

#### Rough work

In [10]:
# 12-row moving average of the held-out sj case counts; the first 11 rows are
# NaN because the window is not yet full. pd.rolling_mean() has been removed
# from pandas, and the .rolling() accessor is its replacement.
Y_sj_test.rolling(window = 12).mean()

702          NaN
703          NaN
704          NaN
705          NaN
706          NaN
707          NaN
708          NaN
709          NaN
710          NaN
711          NaN
712          NaN
713    26.666667
714    24.250000
715    22.750000
716    21.083333
717    19.750000
718    17.416667
719    15.916667
720    14.166667
721    13.416667
722    12.166667
723    11.833333
724    10.500000
725     9.583333
726     8.916667
727     7.666667
728     7.333333
729     6.750000
730     6.416667
731     5.750000
         ...    
906    82.750000
907    90.666667
908    96.000000
909    98.666667
910    96.666667
911    94.416667
912    90.250000
913    85.333333
914    81.833333
915    73.916667
916    66.083333
917    58.750000
918    46.000000
919    36.000000
920    28.333333
921    23.916667
922    20.750000
923    17.500000
924    15.500000
925    13.250000
926    11.500000
927    10.500000
928     9.583333
929     8.666667
930     7.500000
931     6.583333
932     5.666667
933     4.5000

#### Negative Binomial

In [8]:
# Build the model formula 'y ~ col1 + col2 + ...' from every feature column.
predictors = ' + '.join(map(str, X_sj_train.columns))
formula = 'y ~ ' + predictors

In [9]:
# sj: fit a negative binomial GLM on the training part and score it with the
# mean absolute error on both the held-out part and the training part.
train_sj = X_sj_train.assign(y=Y_sj_train)
test_sj = X_sj_test.copy()

model = smf.glm(
    formula=formula,
    data=train_sj,
    family=sm.families.NegativeBinomial(),
)
model = model.fit()

# astype(int) truncates the predicted means to whole case counts.
predictions_sj = model.predict(test_sj).astype(int)
pred_train_sj = model.predict(train_sj).astype(int)

print ("cv error:", mean_absolute_error(predictions_sj, Y_sj_test))
print ("train error:", mean_absolute_error(pred_train_sj, Y_sj_train))

cv error: 22.602564102564102
train error: 25.478632478632477


In [10]:
# iq: same procedure as for sj -- negative binomial GLM on the training part,
# mean absolute error on the held-out part and on the training part.
train_iq = X_iq_train.assign(y=Y_iq_train)
test_iq = X_iq_test.copy()

model = smf.glm(
    formula=formula,
    data=train_iq,
    family=sm.families.NegativeBinomial(),
)
results = model.fit()

# astype(int) truncates the predicted means to whole case counts.
predictions_iq = results.predict(test_iq).astype(int)
pred_train_iq = results.predict(train_iq).astype(int)

print ("cv error:", mean_absolute_error(predictions_iq, Y_iq_test))
print ("train error:", mean_absolute_error(pred_train_iq, Y_iq_train))

cv error: 8.069230769230769
train error: 5.441025641025641


#### Single evaluation metric

In [11]:
# Pool both cities so one MAE summarises the whole model. Series.append() was
# removed in pandas 2.0; pd.concat() gives the same result (iq rows first,
# then sj) and works on every pandas version.
pred = pd.concat([predictions_iq, predictions_sj])
true = pd.concat([Y_iq_test, Y_sj_test])
print ("cv error:", mean_absolute_error(pred, true))

train_pred = pd.concat([pred_train_iq, pred_train_sj])
train_true = pd.concat([Y_iq_train, Y_sj_train])
print ("train error:", mean_absolute_error(train_pred, train_true))

cv error: 17.412087912087912
train error: 18.32234432234432


#### Tuning playground

In [300]:
# Per-k mean absolute errors, filled in by the tuning loop.
test_error, train_error = [], []

In [301]:
# Sweep the number of features kept by SelectKBest and record the pooled
# (iq + sj) mean absolute error on the held-out and training parts for each k.
#
# process() mutates the frame it is given, so every iteration needs a pristine
# input. The CSVs are read once here and copied per iteration instead of being
# re-read from disk on every pass.
features_raw = pd.read_csv("dengue_features_train.csv")
labels_raw = pd.read_csv("dengue_labels_train.csv")

for k in range(2,32):

    X = features_raw.copy()
    y = labels_raw

    data = process(X,y,feature_selection=k)

    (X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

    # The selected columns change with k, so the formula is rebuilt each time.
    formula = ' + '.join([str(i) for i in list(X_sj_train.columns)])
    formula = 'y ~ ' + formula
    # sj
    train_sj = X_sj_train.copy()
    train_sj['y'] = Y_sj_train
    test_sj = X_sj_test.copy()

    model = smf.glm(formula=formula,
                    data=train_sj,
                    family=sm.families.NegativeBinomial())
    model = model.fit()
    predictions_sj = model.predict(test_sj).astype(int)
    pred_train_sj = model.predict(train_sj).astype(int)

    # iq
    train_iq = X_iq_train.copy()
    train_iq['y'] = Y_iq_train
    test_iq = X_iq_test.copy()

    model = smf.glm(formula=formula,
                    data=train_iq,
                    family=sm.families.NegativeBinomial())
    results = model.fit()

    predictions_iq = results.predict(test_iq).astype(int)
    pred_train_iq = results.predict(train_iq).astype(int)

    # combined -- pd.concat replaces Series.append, which pandas 2.0 removed
    pred = pd.concat([predictions_iq, predictions_sj])
    true = pd.concat([Y_iq_test, Y_sj_test])
    test_error.append(mean_absolute_error(pred, true))

    train_pred = pd.concat([pred_train_iq, pred_train_sj])
    train_true = pd.concat([Y_iq_train, Y_sj_train])
    train_error.append(mean_absolute_error(train_pred, train_true))

In [302]:
# Collect the per-k training MAE in a frame. The column is named explicitly so
# it does not show up as the default label 0 in the results table.
t = pd.DataFrame({'train': train_error})

In [303]:
# Add the held-out MAE recorded for each k as a 'test' column.
t['test'] = test_error

In [306]:
# Label each row with its k; this must match the range swept by the tuning loop.
t.index = range(2,32)

In [307]:
# Display the error table (one row per k).
t

Unnamed: 0,0,test
2,21.233516,19.035714
3,21.271062,18.945055
4,21.258242,18.851648
5,21.284799,17.945055
6,21.184066,17.39011
7,21.118132,17.376374
8,21.122711,18.016484
9,21.093407,18.277473
10,21.088828,18.145604
11,20.787546,18.708791


#### Submission

In [314]:
# Re-read all three files so the final fit starts from unmodified frames
# (earlier cells passed X to process(), which changes it in place).
X, y, test = map(
    pd.read_csv,
    (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    ),
)

In [315]:
# Full training data per city, with no train/test split (train = False).
X_sj, X_iq, y_sj, y_iq = process(X, y, train = False)
# Test features per city; called without labels, process() returns only the
# two feature frames. NOTE: this rebinds X_sj_test / X_iq_test, which earlier
# cells used for the held-out part of the training data, and it modifies
# `test` in place.
X_sj_test, X_iq_test = process(test)

In [316]:
# Put the response next to the features so the formula's 'y' term resolves.
X_sj = X_sj.assign(y=y_sj)
X_iq = X_iq.assign(y=y_iq)

In [317]:
# Build the formula from the frame that is actually being fitted, rather than
# relying on whatever value `formula` was left with by an earlier cell (the
# tuning loop overwrites it on every iteration).
formula = 'y ~ ' + ' + '.join(str(col) for col in X_sj.columns if col != 'y')

# sj: refit on all labelled rows and predict the test rows.
model = smf.glm(formula=formula,
                data=X_sj,
                family=sm.families.NegativeBinomial())
model = model.fit()
Y_pred_sj = model.predict(X_sj_test)

# iq: same model, with the dispersion parameter set to alpha=0.01.
# NOTE(review): the validation cells used the default alpha for both cities --
# confirm that this value was tuned deliberately.
model = smf.glm(formula=formula,
                data=X_iq,
                family=sm.families.NegativeBinomial(alpha=0.01))
model = model.fit()
Y_pred_iq = model.predict(X_iq_test)

# sj predictions first, then iq. This assumes the test file lists every sj
# row before any iq row -- TODO confirm against the file.
Y_pred = np.concatenate([Y_pred_sj, Y_pred_iq])

In [318]:
# Re-read the test file (process() changed the earlier copy in place), keep
# the identifier columns and attach the predictions truncated to integers.
test = pd.read_csv("dengue_features_test.csv")
submission = test[['city', 'year', 'weekofyear']].assign(total_cases=Y_pred.astype(int))
submission.to_csv('submission_5.csv', index=False)