In [196]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LinearRegression

In [242]:
# Functions
def kf_splitting(data, n_splits_):
    kf = KFold(n_splits=n_splits_)
    df_split = []
    for name, group in data.groupby(["Store", "Dept"]):
        group = group.reset_index(drop=True)
        trains_x = []
        trains_y = []
        tests_x = []
        tests_y = []
        if group.shape[0] <= 5:
            f = np.array(range(5))
            np.random.shuffle(f)
            group['fold'] = f[:group.shape[0]]
            continue
        fold = 0
        for train_index, test_index in kf.split(group):
            group.loc[test_index, 'fold'] = fold
            fold += 1
        df_split.append(group)
    df_split = pd.concat(df_split).reset_index(drop=True)
    return df_split

def gridsearch_wrapper(model, grid, refit_score, skfold_count, X_train, X_test, y_train, y_test):
        """
        fits a GridSearchCV classifier using refit_score for optimization
        prints classifier performance metrics
        """
        model = model()
        skf = KFold(n_splits=10)
        # scoring=scorers,
        grid_search = GridSearchCV(
            model, param_grid, 
            refit=refit_score,
            cv=skf, return_train_score=True, n_jobs=-1
        )
        grid_search.fit(X_train.values, y_train.values)

        # make the predictions
        y_pred = grid_search.predict(X_test.values)

        print('Best params for {}'.format(refit_score))
        print(grid_search.best_params_)

        # confusion matrix on the test data.
        print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(
            refit_score
        ))
        print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                     columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
        return grid_search, grid_search.best_params_

In [226]:
# Read in
df_featurespace = pd.read_parquet("output/01_featurespace.parquet")
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv") 

In [253]:
# Model List
model_list = [Lasso, LinearRegression, RandomForestRegressor, ExtraTreesRegressor, KNeighborsRegressor]
n_splits_ = 4
scorers = {
    'mae': make_scorer(mean_absolute_error)
}

In [254]:
param_grids = {
    'RandomForestRegressor': {
        'bootstrap': [True],
        'max_features': ['auto'],
        'max_leaf_nodes': [None],
        'min_samples_leaf': [4, 8],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [10, 35],
        'n_jobs': [-1],
        'warm_start': [False]
    },
    'ExtraTreesRegressor': {
        'bootstrap': [True],
        'criterion': ['mse'],
        'max_features': ['auto'],
        'max_leaf_nodes': [None],
        'min_samples_leaf': [4, 8],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [10, 35],
        'n_jobs': [-1],
        'oob_score': [False],
        'random_state': [42],
        'verbose': [0],
        'warm_start': [False]
    },
    "KNeighborsRegressor": {
        'n_neighbors': [10, 12, 15]
    },
    "LinearRegression": {
        'normalize': [True],
        'fit_intercept': [True]
    },
    "Lasso": {
        "alpha": [0.1, 0.4, 0.6],
        "fit_intercept": [True],
        "normalize": [True],
        "selection": ['random']
    }
}

In [229]:
# Merge to train
df_train.drop(labels=['IsHoliday'], axis=1, inplace=True)
df_train_fs = pd.merge(
    df_train,
    df_featurespace,
    how='left',
    on=['Store', 'Date'],
    validate='m:1'
)

# Merge to test
df_test.drop(labels=['IsHoliday'], axis=1, inplace=True)
df_test_fs = pd.merge(
    df_test,
    df_featurespace,
    how='left',
    on=['Store', 'Date'],
    validate='m:1'
)
df_train_fs.drop(labels=['Date'], axis=1, inplace=True)
df_test_fs.drop(labels=['Date'], axis=1, inplace=True)

In [230]:
df_train_fs.columns

Index(['Store', 'Dept', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'IsHoliday', 'Type', 'Size', 'Month', 'isSpring',
       'isSummer', 'isFall', 'isWinter', 'cpiUpperGroup', 'unemploymentAbove9',
       'store_month_mean_temp', 'temp_diff_store_month_mean',
       'store_temp_rlg_max', 'temperature_cv', 'upcoming_holiday',
       'MarkDown1_null_ind', 'MarkDown1_zero_ind', 'MarkDown1_log',
       'MarkDown2_null_ind', 'MarkDown2_zero_ind', 'MarkDown2_log',
       'MarkDown3_null_ind', 'MarkDown3_zero_ind', 'MarkDown3_log',
       'MarkDown4_null_ind', 'MarkDown4_zero_ind', 'MarkDown4_log',
       'MarkDown5_null_ind', 'MarkDown5_zero_ind', 'MarkDown5_log'],
      dtype='object')

In [231]:
df_test_fs.columns

Index(['Store', 'Dept', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'IsHoliday', 'Type', 'Size', 'Month', 'isSpring', 'isSummer', 'isFall',
       'isWinter', 'cpiUpperGroup', 'unemploymentAbove9',
       'store_month_mean_temp', 'temp_diff_store_month_mean',
       'store_temp_rlg_max', 'temperature_cv', 'upcoming_holiday',
       'MarkDown1_null_ind', 'MarkDown1_zero_ind', 'MarkDown1_log',
       'MarkDown2_null_ind', 'MarkDown2_zero_ind', 'MarkDown2_log',
       'MarkDown3_null_ind', 'MarkDown3_zero_ind', 'MarkDown3_log',
       'MarkDown4_null_ind', 'MarkDown4_zero_ind', 'MarkDown4_log',
       'MarkDown5_null_ind', 'MarkDown5_zero_ind', 'MarkDown5_log'],
      dtype='object')

In [232]:
# Split train
df_split = kf_splitting(df_train_fs, n_splits_)

In [255]:
best_model = None
error_cv = 0
best_error = np.iinfo(np.int32).max
print(model_list)
for model_ in model_list:

    # Get train / test split for GridSearch
    test = df_train_fs.loc[:, 'Weekly_Sales'].astype(float)
    train = df_train_fs.drop(labels=['Weekly_Sales'], axis=1, inplace=False)
    X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.33, random_state=42)
    
    # Param Grid and Searcg
    print("First Model :: {}".format(str(model_.__name__)))
    param_grid_ = param_grids[str(model_.__name__)]
    print(param_grid_)
    print(model_)
    gs, gs_best_p_ = gridsearch_wrapper(model=model_, 
                                        grid=param_grid_, 
                                        refit_score='mae',
                                        skfold_count=10,
                                        X_train=X_train, 
                                        X_test=X_test, 
                                        y_train=y_train, 
                                        y_test=y_test)
    results = pd.DataFrame(grid_search_clf.cv_results_)
    
    for fold in range(5):
    
        # Split to train and test
        dataset_train = df_split.loc[df_split['fold'] != fold]
        dataset_test = df_split.loc[df_split['fold'] == fold]
        train_y = dataset_train['weeklySales']
        train_x = dataset_train.drop(columns=['weeklySales', 'fold'])
        test_y = dataset_test['weeklySales']
        test_x = dataset_test.drop(columns=['weeklySales', 'fold'])
        print(" ---- ---- ---- ")
        print("Dataset train and test shapes :: current iteration")
        print(dataset_train.shape, dataset_test.shape)
    
        # Train / Test Model
        #predicted, model = train_and_predict(train_x, train_y, test_x)
        model_.fit(train_x, train_y)
        yhat = model_.predict(test_x)
    
        weights = test_x['isHoliday'].replace(True, 5).replace(False, 1)
        error = mean_absolute_error(test_y, yhat, weights)
        error_cv += error
        print(fold, error)
        if error < best_error:
            print('Find best model')
            best_error = error
            best_model = model
    error_cv /= 5

[<class 'sklearn.linear_model.coordinate_descent.Lasso'>, <class 'sklearn.linear_model.base.LinearRegression'>, <class 'sklearn.ensemble.forest.RandomForestRegressor'>, <class 'sklearn.ensemble.forest.ExtraTreesRegressor'>, <class 'sklearn.neighbors.regression.KNeighborsRegressor'>]
First Model :: Lasso
{'alpha': [0.1, 0.4, 0.6], 'fit_intercept': [True], 'normalize': [True], 'selection': ['random']}
<class 'sklearn.linear_model.coordinate_descent.Lasso'>


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').