#### Importing the relevant packages and loading the data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

from sklearn.preprocessing import StandardScaler

from scipy import stats

In [2]:
# Load the training features, the training labels and the test features
# from CSV files in the working directory.
X, y, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

#### Pre Processing

In [3]:
def impute(X):
    """
    Fills missing feature values of X in place.

    Drops the 'ndvi_ne' column, then linearly interpolates every remaining
    numeric column down the rows. Non-numeric columns are left as they are.
    X is modified in place and nothing is returned.
    """

    # remove the column that has ~20% null values, also ranks low on feature importance
    X.drop(['ndvi_ne'], axis=1, inplace=True)

    # Filling the rest using linear interpolation.
    # Only numeric columns are interpolated: calling interpolate on a frame
    # that still contains object/string columns is deprecated in recent
    # pandas releases, and those columns have nothing to interpolate anyway.
    # NOTE(review): with pandas' default settings, NaNs that precede the first
    # valid value of a column stay unfilled - confirm the data has none.
    numeric_columns = X.select_dtypes(include='number').columns
    if len(numeric_columns):
        X[numeric_columns] = X[numeric_columns].interpolate()

In [4]:
def remove_outliers(df, threshold=3.6):
    """
    Drops the rows of df that contain an outlier in any column.

    A row is kept only when the absolute z-score (scipy.stats.zscore, computed
    per column) of every one of its values is strictly below `threshold`.

    df: numeric DataFrame.
    threshold: z-score cut-off; the default 3.6 is the value that used to be
        hard-coded here.

    Returns the rows of df that pass the check.
    """
    z_scores = np.abs(stats.zscore(df))
    keep_row = (z_scores < threshold).all(axis=1)
    return df[keep_row]

In [5]:
def mape(Y_test, Y_pred, epsilon = 1):
    """
    Mean absolute percentage error, in percent.

    Y_test: true values.
    Y_pred: predicted values, in the same order as Y_test.
    epsilon: added to the denominator only, so that true values of zero do
        not cause a division by zero.

    Returns mean(|Y_test - Y_pred| / |Y_test + epsilon|) * 100.
    """
    # Compare positionally, like the other metrics used in this notebook.
    Y_test = np.asarray(Y_test, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)

    # epsilon must not appear in the numerator: otherwise a perfect prediction
    # would be reported with a non-zero error.
    return np.mean(np.abs((Y_test - Y_pred) / (Y_test + epsilon))) * 100

In [6]:
def extract_month(s):
    """
    Returns characters 5-6 of s as an integer, i.e. the month of a date
    string laid out like 'YYYY-MM-DD'.
    """
    month_digits = s[slice(5, 7)]
    return int(month_digits)

In [7]:
def city_indices(X):
    """
    Returns a boolean Series that is True for the rows of X whose
    'city' value is 'sj' and False for all other rows.
    """
    # city boolean encoding
    is_sj = X['city'].eq('sj')
    return is_sj

In [8]:
def pre_process(X):
    """
    Builds the model feature matrix from a features frame.

    Extracts the month out of week_start_date and converts it to a one hot,
    drops the identifier/date columns and standardizes the remaining
    numerical features. The frame passed in is not modified.

    Returns a new DataFrame, indexed like X, holding the standardized
    features followed by twelve indicator columns 'month_1' ... 'month_12'.
    """

    # Extracting month from the date
    months = X.week_start_date.apply(extract_month)

    # Removing the columns not required for the regression. drop() returns a
    # new frame here, so the caller's data stays intact.
    features = X.drop(['city', 'year', 'weekofyear', 'week_start_date'], axis=1)

    # Standardizing the data
    # NOTE(review): the scaler is fit on whichever frame is passed in, so the
    # training and test frames are each scaled with their own statistics, and
    # the fit happens before any train/test split - confirm this is intended.
    scaler = StandardScaler()
    features = pd.DataFrame(
        scaler.fit_transform(features),
        index=features.index,
        columns=features.columns,
    )

    # Month one hot features.
    # The columns get string names (recent scikit-learn versions reject frames
    # whose column names mix ints and strings) and are always the same twelve,
    # so two frames covering different months still produce matching columns.
    month_columns = ['month_{}'.format(m) for m in range(1, 13)]
    month_features = (
        pd.get_dummies(months, prefix='month', dtype=float)
        .reindex(columns=month_columns, fill_value=0.0)
    )

    return features.join(month_features)

In [9]:
def seperate_cities_data(X, is_sj):
    """
    Splits X by the boolean mask is_sj.

    Returns (rows where is_sj is True, rows where is_sj is False).
    """
    # Separating the cities data
    is_iq = ~is_sj
    return X.loc[is_sj], X.loc[is_iq]

In [10]:
def get_y_labels(X_sj, X_iq, y):
    """
    Picks the total_cases labels that belong to each city's feature rows.

    The rows are matched through the index labels of X_sj and X_iq.
    Returns (labels for X_sj, labels for X_iq).
    """
    total_cases = y.total_cases
    return total_cases.loc[X_sj.index], total_cases.loc[X_iq.index]

In [11]:
def split(X_sj, X_iq, y_sj, y_iq):
    """
    Splits each city's features and labels into train and test parts.

    shuffle=False keeps the rows in their original order, so the test part
    is the tail of each city's data.

    Returns (sj_split, iq_split); each is the list produced by
    train_test_split: [X_train, X_test, y_train, y_test].
    """
    # train and test split
    city_pairs = ((X_sj, y_sj), (X_iq, y_iq))
    sj_split_data, iq_split_data = (
        train_test_split(features, labels, shuffle = False)
        for features, labels in city_pairs
    )

    return sj_split_data, iq_split_data

In [12]:
def process(X, y = None, train = True):
    """
    Runs the full preparation pipeline on a features frame.

    X: features frame as read from the CSV. It is copied first, so the
        caller's frame is left untouched and the call can be repeated.
    y: optional labels frame with a total_cases column. When omitted (or
        empty) only features are returned.
    train: when labels are given, True returns per-city train/test splits
        and False returns the unsplit per-city data.

    Returns
        (X_sj, X_iq)                      if no labels are given,
        (X_sj, X_iq, y_sj, y_iq)          if labels are given and train is False,
        (sj_split_data, iq_split_data)    otherwise, as produced by split().
    """

    # impute() edits its argument in place; working on a copy keeps that
    # side effect inside this function.
    X = X.copy()

    is_sj = city_indices(X)
    impute(X)
    X = pre_process(X)
    X_sj, X_iq = seperate_cities_data(X, is_sj)

    # None replaces the former pd.Series() default, which was created once at
    # definition time; an explicitly passed empty object is still accepted.
    if y is None or y.empty:
        return X_sj, X_iq

    y_sj, y_iq = get_y_labels(X_sj, X_iq, y)
    if not train:
        return X_sj, X_iq, y_sj, y_iq

    return split(X_sj, X_iq, y_sj, y_iq)

#### Random Model (constant mean-prediction baseline)

In [13]:
# Build the per-city train/test splits and unpack them for the cells below.
data = process(X, y)
(
    (X_sj_train, X_sj_test, Y_sj_train, Y_sj_test),
    (X_iq_train, X_iq_test, Y_iq_train, Y_iq_test),
) = data

In [14]:
def random(Y_test, Y_train):
    """
    Scores a constant predictor that always outputs the mean of Y_train.

    Returns the mean absolute error of that constant against Y_test.
    """
    train_mean = np.mean(Y_train)
    constant_prediction = np.repeat(train_mean, len(Y_test))
    return mean_absolute_error(Y_test, constant_prediction)

In [15]:
# Mean-prediction baseline error on the 'sj' test split.
random(Y_test=Y_sj_test, Y_train=Y_sj_train)

28.027284681130833

In [16]:
# Mean-prediction baseline error on the 'iq' test split.
random(Y_test=Y_iq_test, Y_train=Y_iq_train)

8.287337278106508

In [17]:
# Pool the labels of both cities so the baseline can be scored on all rows.
# pd.concat is used because Series.append was removed in pandas 2.0.
Y_test = pd.concat([Y_sj_test, Y_iq_test])
Y_train = pd.concat([Y_sj_train, Y_iq_train])

In [18]:
# Mean-prediction baseline error on the pooled labels of both cities.
random(Y_test=Y_test, Y_train=Y_train)

20.337127158555734

#### Generalised Model

In [19]:
def general_model(clf, data):
    """
    Fits clf separately on each city and scores the pooled predictions.

    clf: estimator exposing fit(X, y) and predict(X). The same object is
        refit for every city, so after the call it holds the fit of the
        last city in data.
    data: iterable of per-city splits (X_train, X_test, Y_train, Y_test),
        as returned by process(); the Y parts are pandas Series.

    Returns the mean absolute error over the test rows of all cities.
    """
    predictions = []
    actuals = []
    for X_train, X_test, Y_train, Y_test in data:
        clf.fit(X_train, Y_train)
        # Predict before the next iteration refits the estimator.
        predictions.append(clf.predict(X_test))
        actuals.append(Y_test)

    Y_pred = np.concatenate(predictions)
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    Y_test = pd.concat(actuals)

    return mean_absolute_error(Y_test, Y_pred)

#### Baseline Model

In [20]:
# Plain linear regression, fitted per city, as the reference score.
model = LinearRegression()
general_model(clf=model, data=data)

19.25986624297253

#### L1

In [21]:
# Lasso (L1-regularised) regression with alpha=0.46.
clf = linear_model.Lasso(alpha=0.46)
general_model(clf=clf, data=data)

18.786187851464504

#### L2

In [22]:
# Ridge (L2-regularised) regression with alpha=27.
clf = linear_model.Ridge(alpha=27)
general_model(clf=clf, data=data)

18.709347266859464

#### Submission

In [115]:
# Read fresh copies of the three CSV files for the final fit.
X, y, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

# Ridge with the same alpha as in the L2 experiment.
model = linear_model.Ridge(alpha=27)

In [116]:
# Use all labelled rows for the final fit (train = False skips the hold-out split).
X_sj, X_iq, y_sj, y_iq = process(X, y, train = False)
X_sj_test, X_iq_test = process(test)

# One fit per city; predict before the estimator is refit for the next city.
model.fit(X_sj, y_sj)
Y_pred_sj = model.predict(X_sj_test)

model.fit(X_iq, y_iq)
Y_pred_iq = model.predict(X_iq_test)

# Put the predictions back into the row order of the test file. A plain
# sj-then-iq concatenation only lines up with the file when all 'sj' rows
# come before all 'iq' rows; sorting by the original row index does not
# depend on that.
Y_pred = pd.concat([
    pd.Series(Y_pred_sj, index=X_sj_test.index),
    pd.Series(Y_pred_iq, index=X_iq_test.index),
]).sort_index().values

In [117]:
# Re-read the test file to get the untouched identifier columns.
test = pd.read_csv("dengue_features_test.csv")
submission = test.loc[:,['city', 'year', 'weekofyear']].copy()

# Case counts are whole numbers: round to the nearest integer (a bare int cast
# truncates toward zero and biases every prediction downward), then floor
# negative predictions at zero.
submission['total_cases'] = np.rint(Y_pred).astype(int).clip(0)
submission.to_csv('submission_3.csv', index=False)