 - Edit the split and generalised model functions to make sure they are submission friendly
 - Check the conistency of final 'data' variable with the initial labels in the y csv

#### importing the relevant packages and loading the data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

from sklearn.preprocessing import StandardScaler

In [2]:
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

#### Pre Processing

In [None]:
def impute(X):
    
    # remove the colum that has ~20% null values, also ranks low on feature importance
    X.drop(['ndvi_ne'], axis=1, inplace=True)
    
    # Filling the rest using linear interpolation
    X.interpolate(inplace=True)

In [4]:
def mape(Y_test, Y_pred, epsilon = 1):
    return np.mean(np.abs((Y_test - Y_pred + epsilon) / (Y_test + epsilon))) * 100

In [5]:
def extract_month(s):
    return int(s[5:7])

In [6]:
def city_indices(X):
    # city boolean encoding
    return X.city == 'sj'

In [7]:
def pre_process(X):
    """
    Extracts the month out of date and converts it to a one hot
    Standardizes the numerical features
    """
    
    #Extracting month from the date
    months = X.week_start_date.apply(extract_month)

    # Removing the dates info for now
    X.drop(['city', 'year', 'weekofyear', 'week_start_date'], axis=1, inplace=True)

    # Standardizing the data
    scaler = StandardScaler()
    X[X.columns] = scaler.fit_transform(X)

    sliced_months = months.loc[X.index]
    # Month one hot features
    month_features = pd.get_dummies(sliced_months)
    X = X.join(month_features)

    # Alternatively use months as a discrete feature
    #X = X.join(sliced_months)
    
    return X

In [8]:
def seperate_cities_data(X, is_sj):

    # Seperating the cities data
    X_sj = X[is_sj]
    X_iq = X[~is_sj]
    
    return X_sj, X_iq

In [9]:
def get_y_labels(X_sj, X_iq, y):    
    
    y = y.total_cases    
    y_sj = y.loc[X_sj.index]
    y_iq = y.loc[X_iq.index]
    
    return y_sj, y_iq

In [10]:
def split(X_sj, X_iq, y_sj, y_iq):

    # train and test split
    sj_split_data = train_test_split(X_sj, y_sj, shuffle = False)
    iq_split_data = train_test_split(X_iq, y_iq, shuffle = False)

    return sj_split_data, iq_split_data

In [11]:
def process(X, y = pd.Series(), train = True):
    
    is_sj = city_indices(X)
    impute(X)
    X = pre_process(X)
    X_sj, X_iq = seperate_cities_data(X, is_sj)
    if y.empty:
        return X_sj, X_iq
    
    y_sj, y_iq = get_y_labels(X_sj, X_iq, y)
    if not train:
        return X_sj, X_iq, y_sj, y_iq
    
    return split(X_sj, X_iq, y_sj, y_iq)

#### Random Model

In [12]:
data = process(X,y)
(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data

In [13]:
def random(Y_test, Y_train):
    y_p = np.full(len(Y_test), np.mean(Y_train))
    return mean_absolute_error(Y_test, y_p)

In [14]:
random(Y_sj_test, Y_sj_train)

28.027284681130833

In [15]:
random(Y_iq_test, Y_iq_train)

8.287337278106508

In [16]:
Y_test = Y_sj_test.append(Y_iq_test)
Y_train = Y_sj_train.append(Y_iq_train)

In [17]:
random(Y_test, Y_train)

20.337127158555734

#### Generalised Model

In [18]:
def general_model(clf, data):
    
    (X_sj_train, X_sj_test, Y_sj_train, Y_sj_test), (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test) = data
    
    clf.fit(X_sj_train, Y_sj_train)
    Y_sj_pred = clf.predict(X_sj_test)
    
    clf.fit(X_iq_train, Y_iq_train)
    Y_iq_pred = clf.predict(X_iq_test)
    
    Y_pred = np.concatenate([Y_sj_pred, Y_iq_pred])
    Y_test = Y_sj_test.append(Y_iq_test)
    
    return mean_absolute_error(Y_test, Y_pred)

#### Baseline Model

In [19]:
model = LinearRegression()
general_model(model, data)

19.25986624297253

#### L1

In [27]:
clf = linear_model.Lasso(alpha=0.05)
general_model(clf, data)



19.238198314117902

#### L2

In [28]:
clf = linear_model.Ridge(alpha=4)
general_model(clf, data)

19.087904918451816

#### Regularization Tuning

In [61]:
L2 = {}

for a in range(1,100):
    clf = linear_model.Ridge(alpha=a)
    L2[a] = general_model(clf, data)

In [16]:
[(alpha, error) for (alpha, error) in L2.items() if error < 14.86]

[(1, 14.687581912697645),
 (2, 14.622358088021334),
 (3, 14.607366245546181),
 (4, 14.606565202114174),
 (5, 14.61353195495241),
 (6, 14.627340871336601),
 (7, 14.644838994453519),
 (8, 14.669290462590853),
 (9, 14.69855046231257),
 (10, 14.726929676611656),
 (11, 14.754221499902739),
 (12, 14.780390719825304),
 (13, 14.806221660564697),
 (14, 14.830985065652046),
 (15, 14.854850263024398)]

In [21]:
L1 = {}
a= 0.01

while a < 1:
    a += 0.01
    clf = linear_model.Lasso(alpha=a)
    L1[a] = general_model(clf, data)



In [22]:
[(alpha, error) for (alpha, error) in L1.items() if error < 14.86]

[(0.02, 14.808001363111028),
 (0.03, 14.782203358865385),
 (0.04, 14.766329230508985),
 (0.05, 14.7643004006773),
 (0.060000000000000005, 14.769201361201388),
 (0.07, 14.781579139317401),
 (0.08, 14.79964857952635),
 (0.09, 14.812615176341575),
 (0.09999999999999999, 14.82207681766081),
 (0.10999999999999999, 14.833752066805804),
 (0.11999999999999998, 14.85098125835665)]

#### Submission

In [33]:
X = pd.read_csv("dengue_features_train.csv")
test = pd.read_csv("dengue_features_test.csv")
model = LinearRegression()

In [34]:
X_sj, X_iq, y_sj, y_iq = process(X, y, train = False)
X_sj_test, X_iq_test = process(test)

model.fit(X_sj, y_sj)
Y_pred_sj = model.predict(X_sj_test)

model.fit(X_iq, y_iq)
Y_pred_iq = model.predict(X_iq_test)

Y_pred = np.concatenate([Y_pred_sj, Y_pred_iq])

In [36]:
submission = test.loc[:,['city', 'year', 'weekofyear']].copy()
submission['total_cases'] = Y_pred.astype(int).clip(0)
submission.to_csv('submission_3.csv', index=False)

#### Fine Tuning

In [71]:
y.shape

(1199,)

In [84]:
mae_dict = {}
mape_dict = {}

# Recursive Feature Elimination
from sklearn.feature_selection import RFE

n = 21
# create a base classifier used to evaluate a subset of attributes
model = LinearRegression()
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")

X = pre_process(X)
y = y.total_cases.loc[X.index]

mask = X['is_sj']
# create the RFE model and select n attributes
rfe = RFE(model, n)
rfe = rfe.fit(X, y)

y = pd.read_csv("dengue_labels_train.csv")
X_Select = X.loc[:, rfe.support_]
X_Select['is_sj'] = mask
data = split(X_Select, y)

mae_dict[n] = general_model(model, data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [77]:
[(alpha, error) for (alpha, error) in mae_dict.items() if error < 14.86]

[(18, 14.34780865838194)]

In [85]:
[(alpha, error) for (alpha, error) in mae_dict.items() if error < 14.86]

[(21, 14.336964060404789)]

In [79]:
rfe.get_support()

array([False, False, False, False, False,  True,  True,  True, False,
       False, False,  True, False,  True,  True,  True, False, False,
       False, False,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True])

In [86]:
X.columns[rfe.support_]

Index([                              'ndvi_nw',
                                     'ndvi_se',
                       'reanalysis_air_temp_k',
                       'reanalysis_avg_temp_k',
                 'reanalysis_dew_point_temp_k',
        'reanalysis_relative_humidity_percent',
       'reanalysis_specific_humidity_g_per_kg',
                           'reanalysis_tdtr_k',
                          'station_avg_temp_c',
                                             1,
                                             3,
                                             4,
                                             5,
                                             6,
                                             7,
                                             8,
                                             9,
                                            10,
                                            11,
                                            12,
                                       '

In [83]:
X.columns[rfe.support_]

Index([                'reanalysis_air_temp_k',
                       'reanalysis_avg_temp_k',
                 'reanalysis_dew_point_temp_k',
        'reanalysis_relative_humidity_percent',
       'reanalysis_specific_humidity_g_per_kg',
                           'reanalysis_tdtr_k',
                          'station_avg_temp_c',
                                             1,
                                             3,
                                             4,
                                             5,
                                             6,
                                             8,
                                             9,
                                            10,
                                            11,
                                            12,
                                       'is_sj'],
      dtype='object')