#### importing the relevant packages and loading the data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

from sklearn.preprocessing import StandardScaler

In [11]:
X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

#### Pre Processing

In [3]:
def mape(Y_test, Y_pred, epsilon = 1):
    return np.mean(np.abs((Y_test - Y_pred + epsilon) / (Y_test + epsilon))) * 100

In [4]:
def extract_month(s):
    return int(s[5:7])

In [5]:
def pre_process(X, train = True):
    """
    Extracts the month out of date and converts it to a one hot
    Standardizes the numerical features
    Converts the city column to a boolean feature
    """

    #Extracting month from the date
    months = X.week_start_date.apply(extract_month)

    # Removing the dates info for now
    X.drop(['city', 'year', 'weekofyear', 'week_start_date'], axis=1, inplace=True)
    
    if train:
        # Dropping the rows with null values for now
        X.dropna(inplace=True)
    else:
        X.fillna(0,inplace=True)

    # Standardizing the data
    scaler = StandardScaler()
    X[X.columns] = scaler.fit_transform(X)

    sliced_months = months.loc[X.index]
    # Month one hot features
    month_features = pd.get_dummies(sliced_months)
    X = X.join(month_features)

    # Alternatively use months as a discrete feature
    #X = X.join(sliced_months)
    
    return X

In [6]:
def split(X,y):
    X = pre_process(X)

    # Selecting y values corresponding to the filtered X values
    y = y.loc[X.index]

    return train_test_split(X, y, shuffle = False)

#### Seperating based on cities

In [12]:
X_sj = X[X['city'] == 'sj']
y_sj = y.total_cases.loc[X_sj.index]
X_iq = X[X['city'] == 'iq']
y_iq = y.total_cases.loc[X_iq.index]

In [13]:
X_sj_train, X_sj_test, Y_sj_train, Y_sj_test = split(X_sj, y_sj)
X_iq_train, X_iq_test, Y_iq_train, Y_iq_test = split(X_iq, y_iq)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See 

#### Random Model

In [11]:
def random(Y_test, Y_train):
    y_p = np.full(len(Y_test), np.mean(Y_train))
    return mean_absolute_error(Y_test, y_p)

In [12]:
random(Y_sj_test, Y_sj_train)

24.830204657727585

In [13]:
random(Y_iq_test, Y_iq_train)

8.296562290529545

#### Baseline Model

In [14]:
def baseline(X_train, X_test, Y_train ,Y_test):
    lr = LinearRegression()

    lr.fit(X_train, Y_train)
    Y_pred = lr.predict(X_test)

    return mean_absolute_error(Y_test, Y_pred)

In [15]:
baseline(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test)

19.363682226286087

In [16]:
baseline(X_iq_train, X_iq_test, Y_iq_train, Y_iq_test)

7.94571153359583

#### Feature Selection using RFE
<font color='red'>This part needs editting</font>

In [None]:
from sklearn.feature_selection import RFE

# create a base classifier used to evaluate a subset of attributes
model = LinearRegression()

# create the RFE model and select n attributes
rfe = RFE(model, 13)
rfe = rfe.fit(X, y)
X_Select = X.loc[:, rfe.support_]

# Split based on the newly selected features and train and test again
X_train, X_test, Y_train, Y_test = train_test_split(X_Select, y, shuffle = False)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

mean_absolute_error(Y_test, Y_pred)

#### L1

In [27]:
def L1(X_train, X_test, Y_train, Y_test):
    clf = linear_model.Lasso(alpha=.5)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    return mean_absolute_error(Y_test, Y_pred)

In [28]:
L1(X_iq_train, X_iq_test, Y_iq_train, Y_iq_test)

8.030400711939572

In [29]:
L1(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test)

20.361085695096747

#### L2

In [32]:
def L2(X_train, X_test, Y_train, Y_test):
    clf = linear_model.Ridge(alpha = 44)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    return mean_absolute_error(Y_test, Y_pred)

In [33]:
L2(X_iq_train, X_iq_test, Y_iq_train, Y_iq_test)

8.07570486399053

In [34]:
L2(X_sj_train, X_sj_test, Y_sj_train, Y_sj_test)

19.818615949346707

#### Submission
<font color='red'>This part needs editting</font>

In [53]:
def submission(clf, rfe, filename = 'submission_1.csv'):
    test = pd.read_csv("dengue_features_test.csv")
    submission = test[['city', 'year', 'weekofyear']]
    X_test = pre_process(test, train =False)
    X_test = X_test.loc[:, rfe.support_]
    submission['total_cases'] = clf.predict(X_test).astype(int).clip(0)
    submission.to_csv(filename, index=False)

In [56]:
submission(clf, rfe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
