#### Importing the relevant packages and loading the data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
# from sklearn.feature_selection import RFE

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler

from scipy import stats

from xgboost import XGBRegressor

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
# Training features, training labels and test features, read from the working directory.
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

#### Pre Processing

In [4]:
def impute(X):
    """
    Fill missing feature values in place.

    Drops the sparsely populated ``ndvi_ne`` column and linearly interpolates
    every remaining numeric column. ``X`` itself is modified and nothing is
    returned. Calling the function again on the same frame is harmless.
    """
    # remove the column that has ~20% null values, also ranks low on feature importance;
    # errors='ignore' keeps the call idempotent when the cell is re-run on the same frame
    X.drop(columns=['ndvi_ne'], errors='ignore', inplace=True)

    # Fill the rest using linear interpolation. Only numeric columns are touched:
    # text columns such as the city code or the date string cannot be interpolated.
    numeric_cols = X.select_dtypes(include='number').columns
    if len(numeric_cols):
        X[numeric_cols] = X[numeric_cols].interpolate()

def remove_outliers(df, threshold=5):
    """
    Drop the rows of ``df`` that hold an extreme value in any column.

    A value is extreme when its column-wise z-score has absolute value of at
    least ``threshold`` (default 5, the cutoff previously hard-coded).

    Missing values are ignored when computing each column's mean and standard
    deviation, and a z-score that cannot be computed (missing entry, constant
    column) never marks a row as an outlier.

    Returns the filtered rows of ``df`` with their original index labels.
    """
    values = np.asarray(df, dtype=float)
    with np.errstate(invalid='ignore', divide='ignore'):
        z_scores = stats.zscore(values, nan_policy='omit')

    # NaN compares False against the threshold, so undefined z-scores keep the row
    is_outlier = (np.abs(z_scores) >= threshold).any(axis=1)
    return df[~is_outlier]

def mape(Y_test, Y_pred, epsilon = 1):
    """
    Mean absolute percentage error with a smoothed denominator.

    Computes ``mean(|Y_test - Y_pred| / (Y_test + epsilon)) * 100``.
    ``epsilon`` only guards the division against zero-valued targets; it is
    deliberately absent from the numerator so a perfect prediction scores 0.

    Inputs are compared element by element (position-wise), whatever index
    they carry.
    """
    actual = np.asarray(Y_test, dtype=float)
    predicted = np.asarray(Y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / (actual + epsilon))) * 100

def extract_month(s):
    """Return characters 5-6 of ``s`` as an int, i.e. the month of a ``YYYY-MM-DD`` string."""
    month_digits = s[5:7]
    return int(month_digits)

def city_indices(X):
    """Boolean mask over the rows of ``X``: True where the ``city`` column equals 'sj'."""
    is_sj_row = X.city.eq('sj')
    return is_sj_row

def pre_process(X, trees = False):
    """
    Build the model-ready feature matrix from a raw feature frame.

    * Drops the columns ``city``, ``year``, ``weekofyear`` and
      ``week_start_date``, which are not used as model features.
    * Standardizes the remaining columns, unless ``trees`` is true.
    * Appends twelve one-hot month columns ``month_1`` .. ``month_12`` taken
      from ``week_start_date``. All twelve are always present, so every frame
      passed through here gets the same column layout, and every column label
      is a string.

    The input frame is left untouched; a new DataFrame sharing its index is
    returned.
    """
    # Month number = characters 5-6 of the 'YYYY-MM-DD' date string
    months = X['week_start_date'].str[5:7].astype(int)

    # drop() without inplace returns a new frame, so the caller keeps its columns
    features = X.drop(columns=['city', 'year', 'weekofyear', 'week_start_date'])

    # Standardizing the data.
    # NOTE: the scaler is fit on every row passed in, not on a training subset.
    if not trees:
        scaler = StandardScaler()
        features = pd.DataFrame(
            scaler.fit_transform(features),
            index=features.index,
            columns=features.columns,
        )

    # Month one hot features. A fixed 1..12 category list yields a column for
    # every month even when some month is absent from this particular frame.
    month_dtype = pd.CategoricalDtype(categories=list(range(1, 13)))
    month_features = pd.get_dummies(months.astype(month_dtype), prefix='month')

    # Alternatively use months as a discrete feature:
    #   features.join(months.rename('month'))

    return features.join(month_features)

def seperate_cities_data(X, is_sj):
    """
    Split ``X`` row-wise with the boolean mask ``is_sj``.

    Returns a 2-tuple: the rows where the mask is True, then the rows where
    it is False. Index labels are preserved.
    """
    return tuple(X.loc[row_mask] for row_mask in (is_sj, ~is_sj))

def get_y_labels(X_sj, X_iq, y):
    """
    Pick the ``total_cases`` labels belonging to each city's feature rows.

    Labels are looked up by index label, so ``y`` must share its index with
    the frames ``X_sj`` and ``X_iq`` were cut from.

    Returns ``(y_sj, y_iq)``.
    """
    total_cases = y.total_cases
    return tuple(total_cases.loc[city_rows.index] for city_rows in (X_sj, X_iq))

def split(X_sj, X_iq, y_sj, y_iq):
    """
    Train/test split for each city, without shuffling.

    Each city's features and labels go through ``train_test_split`` with
    ``shuffle=False`` and its default test size, so row order is kept.

    Returns ``(sj_split_data, iq_split_data)``; each element is whatever
    ``train_test_split`` returns for that city.
    """
    city_pairs = ((X_sj, y_sj), (X_iq, y_iq))
    sj_split_data, iq_split_data = (
        train_test_split(features, labels, shuffle = False)
        for features, labels in city_pairs
    )
    return sj_split_data, iq_split_data

def process(X, y = None, train = True, trees = False):
    """
    Run the whole preparation pipeline on a raw feature frame.

    Parameters
    ----------
    X : DataFrame of raw features, including the ``city`` column.
    y : optional labels frame with a ``total_cases`` column. ``None`` (the
        default) or an empty object means "no labels".
    train : when labels are given, True returns per-city train/test splits and
        False returns the full per-city data.
    trees : True skips imputation and is forwarded to ``pre_process``, which
        then skips standardization.

    Returns
    -------
    * no labels:            ``(X_sj, X_iq)``
    * labels, train=False:  ``(X_sj, X_iq, y_sj, y_iq)``
    * labels, train=True:   ``(sj_split, iq_split)`` as produced by ``split``

    The caller's ``X`` is never modified.
    """
    # Work on a private copy: the helpers below may modify the frame they are
    # given (impute does so by design), and the caller's raw data must survive
    # so the cell can be re-run without reloading the CSV.
    X = X.copy()

    is_sj = city_indices(X)
    if not trees:
        impute(X)
    X = pre_process(X, trees)

    #X = remove_outliers(X)
    X_sj, X_iq = seperate_cities_data(X, is_sj)
    if y is None or y.empty:
        return X_sj, X_iq

    y_sj, y_iq = get_y_labels(X_sj, X_iq, y)
    if not train:
        return X_sj, X_iq, y_sj, y_iq

    return split(X_sj, X_iq, y_sj, y_iq)

#### Random Model (constant mean-prediction baseline)

In [18]:
# Default pipeline (train=True, trees=False): process() returns one train/test split per city,
# unpacked here as features-train, features-test, labels-train, labels-test.
data = process(X,y)
(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

NameError: name 'StandardScaler' is not defined

In [None]:
def random(Y_test, Y_train):
    """
    Baseline score: mean absolute error obtained by predicting the mean of
    ``Y_train`` for every entry of ``Y_test``.
    """
    baseline_value = np.mean(Y_train)
    constant_prediction = np.full(len(Y_test), baseline_value)
    return mean_absolute_error(Y_test, constant_prediction)

In [None]:
# Baseline MAE for sj: the sj training mean predicted for every sj test row.
random(Y_sj_test, Y_sj_train)

In [None]:
# Baseline MAE for iq: the iq training mean predicted for every iq test row.
random(Y_iq_test, Y_iq_train)

In [None]:
# Pool both cities' labels (sj first, then iq).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
Y_test = pd.concat([Y_sj_test, Y_iq_test])
Y_train = pd.concat([Y_sj_train, Y_iq_train])

In [None]:
# Baseline MAE when one pooled training mean is predicted for both cities.
random(Y_test, Y_train)

#### Generalised Model

In [13]:
def general_model(clf, data):
    """
    Fit ``clf`` separately on each city and score the pooled test predictions.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is refit for the second city, so on return it holds the iq fit.
    data : pair ``(sj_split, iq_split)``, each unpacking to
        ``X_train, X_test, Y_train, Y_test``.

    Returns
    -------
    Mean absolute error over both cities' test rows, after the predictions
    have been truncated to integers and clipped at zero.
    """
    (X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

    # Each prediction is taken before the estimator is refit for the other city
    clf.fit(X_sj_train, Y_sj_train)
    Y_sj_pred = clf.predict(X_sj_test)

    clf.fit(X_iq_train, Y_iq_train)
    Y_iq_pred = clf.predict(X_iq_test)

    Y_pred = np.concatenate([Y_sj_pred, Y_iq_pred])
    # whole, non-negative case counts
    Y_pred = Y_pred.astype(int).clip(0)
    # pd.concat replaces Series.append, which was removed in pandas 2.0
    Y_test = pd.concat([Y_sj_test, Y_iq_test])

    return mean_absolute_error(Y_test, Y_pred)

#### Baseline Model

In [18]:
# Fresh copies of the raw training data for the baseline run
# (earlier cells may have modified the previously loaded X in place).
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")

In [19]:
# Default pipeline (train=True, trees=False): per-city train/test splits.
data = process(X,y)

In [20]:
# Baseline regressor: random forest with default settings apart from the 'mae' split criterion.
# NOTE(review): no random_state is passed, so the score below will vary between runs;
# newer scikit-learn releases appear to spell this criterion 'absolute_error' — confirm for the installed version.
model = RandomForestRegressor(criterion='mae')

In [21]:
# Pooled test MAE of the baseline forest across both cities.
general_model(model, data)

22.35164835164835

#### Cross validation

In [24]:
from sklearn.model_selection import cross_val_score

from sklearn.metrics import make_scorer

In [22]:
# A new, unfitted random forest for cross-validation.
rf = RandomForestRegressor(criterion='mae')

# Unpack the per-city splits held in `data` so the cells below can use them by name.
(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

In [25]:
# 5-fold cross-validated MAE on the sj training split (one value per fold).
cross_val_score(rf, X_sj_train, Y_sj_train, cv = 5, scoring=make_scorer(mean_absolute_error))

array([31.08014184, 50.52198582, 26.65214286, 43.595     , 30.33142857])

In [26]:
# 5-fold cross-validated MAE on the iq training split (one value per fold).
cross_val_score(rf, X_iq_train, Y_iq_train, cv = 5, scoring=make_scorer(mean_absolute_error))

array([7.01153846, 7.29871795, 7.69615385, 6.05641026, 4.63717949])

#### Negative Binomial Regression

In [29]:
from NegBinRegression import negativeBinomialRegression

In [None]:
# NOTE(review): `medpar` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel; if it did run it would also rebind `X` to the
# patsy result. Looks like a leftover from an example — confirm and remove.
import patsy
X = patsy.dmatrices('los~type2+type3+hmo+white', medpar)

In [34]:
import patsy

In [37]:
# Wrap the sj training features as a patsy design matrix for the negative binomial fit.
x = patsy.dmatrix(X_sj_train)

In [38]:
# Wrap the sj training labels the same way.
# NOTE: this rebinds `y`, the name other cells use for the labels DataFrame read from CSV.
y = patsy.dmatrix(Y_sj_train)

In [40]:
# Fit using the helper imported from the local NegBinRegression module; it returns two
# objects, unpacked as `res` and `mod`.
# NOTE(review): the output recorded for this cell shows NaN parameters — the fit did not
# produce usable estimates.
res, mod = negativeBinomialRegression(x, y)

[1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.1]
endog mean: 38.00854700854701 log endog mean: 3.6378110557129406
exog mean -0.09766890013618673
=== params in loglike === [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
=== mu in loglike === [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan na

  mu = np.exp(np.dot(self.exog, beta))[:,None]
  const_arr = a1*mu*(a1+y)/(mu+a1)**2
  const_arr = a1*mu*(a1+y)/(mu+a1)**2
  dldpda = np.sum(mu*exog*(y-mu)*da1/(mu+a1)**2 , axis=0)
  dldpda = np.sum(mu*exog*(y-mu)*da1/(mu+a1)**2 , axis=0)
  dldpda = np.sum(mu*exog*(y-mu)*da1/(mu+a1)**2 , axis=0)
  dldpda = np.sum(mu*exog*(y-mu)*da1/(mu+a1)**2 , axis=0)
  np.log(a1) - np.log(a1+mu) - (a1+y)/(a1+mu) + 1)
  (y - mu)/(mu + a1)**2)).sum()
  (y - mu)/(mu + a1)**2)).sum()
  mu = np.exp(np.dot(self.exog, beta))[:,None]
  dparams = exog*a1*(y-mu)/(a1+mu)
  dparams = exog*a1*(y-mu)/(a1+mu)
  dalpha = (digamma(a1) - digamma(y+a1) + np.log(1+alpha*mu) + \
  alpha* (y-mu)/(1+alpha*mu)).sum() / alpha**2
  oldparams) > tol)):


In [42]:
# Design matrix for the sj test features, built the same way as the training matrix.
x_test = patsy.dmatrix(X_sj_test)

In [41]:
# sj test labels as a design matrix (no later cell in the visible notebook reads `y_test`).
y_test = patsy.dmatrix(Y_sj_test)

In [44]:
# Predict on the sj test features; the recorded output is all NaN, consistent with the
# NaN parameters logged by the fit above.
res.predict(x_test)

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

#### Tuned parameters

In [49]:
# Tuned XGBoost regressor, scored with general_model on the per-city splits in `data`.
# (The name `rf` is reused by the grid-search cell below, so it is kept.)
# we find gamma=0 and max_depth=3, default parameters to be best, rest tuned parameters can be seen below
rf = XGBRegressor(n_estimators=55, learning_rate=0.01 ,n_jobs=-1, subsample=0.9, colsample_bytree=0.6, colsample_bylevel=0.1, min_child_weight=5, reg_alpha=0.1) 

general_model(rf, data)

12.39010989010989

In [37]:
# One-parameter-at-a-time search: every candidate in `grid` is applied to `rf`
# and scored with general_model; the lowest pooled MAE wins.
#'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
grid = {'colsample_bylevel' : [i/10.0 for i in range(1,11)]}

# Start from +inf / None so the first candidate always becomes the incumbent,
# whatever the scale of the error, and `best_grid` is always defined.
best_error = float('inf')
best_grid = None

for i, g in enumerate(ParameterGrid(grid), start=1):
    rf.set_params(**g)
    error = general_model(rf, data)
    # save if best
    if error < best_error:
        best_error = error
        best_grid = g

    # progress marker every 5 candidates
    if (i % 5 == 0):
        print (i)

# Leave `rf` configured with the winning parameters rather than the last candidate tried
if best_grid is not None:
    rf.set_params(**best_grid)

print ("Best Error: %0.5f" % best_error)
print ("Grid:", best_grid)

5
10
Best Error: 12.31215
Grid: {'colsample_bylevel': 0.5}


#### Submission

In [25]:
# Reload the raw files so the submission pipeline starts from freshly read data.
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")
# Same hyperparameters as the XGBRegressor in the "Tuned parameters" section.
# NOTE(review): the grid-search output recorded above reports colsample_bylevel=0.5 as best,
# while 0.1 is used here — confirm which value is intended.
model = XGBRegressor(n_estimators=55, learning_rate=0.01 ,n_jobs=-1, subsample=0.9, colsample_bytree=0.6, colsample_bylevel=0.1, min_child_weight=5, reg_alpha=0.1)

In [27]:
# Full per-city training data (no hold-out) and the matching test features,
# both prepared with trees=True.
X_sj, X_iq, y_sj, y_iq = process(X, y, train = False, trees=True)
X_sj_test, X_iq_test = process(test, trees=True)

# One model object, refit per city; each prediction is taken before the next fit.
Y_pred_sj = model.fit(X_sj, y_sj).predict(X_sj_test)
Y_pred_iq = model.fit(X_iq, y_iq).predict(X_iq_test)

# sj predictions first, then iq
Y_pred = np.concatenate((Y_pred_sj, Y_pred_iq))

In [29]:
# Build the submission file: identifying columns from the test set plus predicted cases.
test = pd.read_csv("dengue_features_test.csv")
submission = test.loc[:,['city', 'year', 'weekofyear']].copy()

# Attach each city's predictions by row label rather than by position, so the
# result stays correct even if the test file does not list every sj row before
# every iq row.
predicted_cases = pd.concat([
    pd.Series(Y_pred_sj, index=X_sj_test.index),
    pd.Series(Y_pred_iq, index=X_iq_test.index),
])
# whole, non-negative case counts (same post-processing as general_model)
submission['total_cases'] = predicted_cases.astype(int).clip(lower=0)
assert submission['total_cases'].notna().all(), "every test row needs a prediction"
submission.to_csv('submission_4.csv', index=False)