In [112]:
import datetime as dt
import gc
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder

In [113]:
# Functions
def kf_splitting(data, n_splits_):
    """Assign a cross-validation fold id to every row, per (Store, Dept) group.

    Each group is re-indexed from 0 and split on its own, so the folds are
    balanced inside every group rather than over the frame as a whole.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the grouping columns ``Store`` and ``Dept``.
    n_splits_ : int
        Number of folds. Fold ids are the integers ``0 .. n_splits_ - 1``.

    Returns
    -------
    pandas.DataFrame
        All grouped rows of ``data``, concatenated group by group with a
        fresh ``0..n-1`` index and an added integer ``fold`` column.

    Notes
    -----
    Groups with fewer rows than ``n_splits_`` cannot be split by ``KFold``.
    Their rows are kept and each is placed in a distinct fold drawn at random
    from NumPy's global RNG (seed it beforehand for reproducible folds).
    """
    kf = KFold(n_splits=n_splits_)
    df_split = []
    for _, group in data.groupby(["Store", "Dept"]):
        group = group.reset_index(drop=True)
        n_rows = group.shape[0]
        if n_rows < n_splits_:
            # Too few rows for KFold: spread them over distinct random folds
            # instead of discarding the group.
            folds = np.random.permutation(n_splits_)[:n_rows]
        else:
            folds = np.empty(n_rows, dtype=int)
            # Only the test indices matter: each row is in exactly one test fold.
            for fold, (_, test_index) in enumerate(kf.split(group)):
                folds[test_index] = fold
        group['fold'] = folds
        df_split.append(group)

    if not df_split:
        # pd.concat raises on an empty list; return an empty frame that still
        # carries the expected columns.
        return data.iloc[0:0].assign(fold=np.array([], dtype=int)).reset_index(drop=True)

    return pd.concat(df_split).reset_index(drop=True)

def gridsearch_wrapper(model, grid, refit_score, skfold_count,
                       X_train=None, y_train=None, X_test=None, y_test=None,
                       scoring=None):
    """Run a cross-validated grid search for a regressor and report the result.

    Parameters
    ----------
    model : estimator class or estimator instance
        Regressor to tune. A class is instantiated with its default arguments.
    grid : dict, list of dict, or None
        Parameter grid for ``GridSearchCV``. ``None`` is treated as an empty
        grid, i.e. only the estimator's current parameters are evaluated.
    refit_score : str
        Key of ``scoring`` used to pick and refit the best parameter set.
    skfold_count : int
        Number of ``KFold`` splits used inside the grid search.
    X_train, y_train, X_test, y_test : array-like, optional
        Data to fit on and to report on. Any that is omitted is read from the
        notebook-level variable of the same name.
    scoring : dict, optional
        Scorers for ``GridSearchCV``. Defaults to the notebook-level
        ``scorers`` variable.

    Returns
    -------
    tuple
        ``(grid_search, grid_search.best_params_)``.

    Raises
    ------
    NameError
        If a data argument or ``scoring`` is omitted and no notebook-level
        variable of that name exists.
    """
    def _from_notebook(value, name):
        # Explicit arguments win; otherwise fall back to the notebook namespace.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "gridsearch_wrapper needs '{0}': pass it as an argument or "
                "define a notebook-level variable named '{0}'".format(name)
            ) from None

    X_train = _from_notebook(X_train, 'X_train')
    y_train = _from_notebook(y_train, 'y_train')
    X_test = _from_notebook(X_test, 'X_test')
    y_test = _from_notebook(y_test, 'y_test')
    scoring = _from_notebook(scoring, 'scorers')

    # Accept both RandomForestRegressor and RandomForestRegressor().
    estimator = model() if isinstance(model, type) else model
    param_grid = grid if grid is not None else {}

    # Plain KFold: stratification is only defined for class labels, not for
    # a continuous regression target.
    kf = KFold(n_splits=skfold_count)
    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                               scoring=scoring, refit=refit_score,
                               cv=kf, return_train_score=True, n_jobs=-1)
    grid_search.fit(np.asarray(X_train), np.asarray(y_train))

    # Predictions of the refit best estimator on the held-out data.
    y_pred = grid_search.predict(np.asarray(X_test))

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # A confusion matrix is undefined for continuous targets; report MAE.
    print('\nMAE of {} optimized for {} on the test data:'.format(
        type(estimator).__name__, refit_score
    ))
    print(mean_absolute_error(np.asarray(y_test), y_pred))
    return grid_search, grid_search.best_params_

In [114]:
# Load inputs: the feature-space parquet file and the train/test CSV files.
FEATURESPACE_PATH = "output/01_featurespace.parquet"
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

df_featurespace = pd.read_parquet(FEATURESPACE_PATH)
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [115]:
# Model List
# Candidate regressor classes (not instances); each one's parameter grid is
# looked up by its class name.
model_list = [RandomForestRegressor, ExtraTreesRegressor, Lasso, LinearRegression, KNeighborsRegressor]
# Number of cross-validation folds passed to kf_splitting.
n_splits_ = 4
# MAE is an error, so lower is better. greater_is_better=False makes the
# scorer return the negated MAE; without it a grid search refit on 'mae'
# would select the parameter set with the largest error.
scorers = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False)
}

In [116]:
def _tree_ensemble_grid():
    """Return a fresh copy of the search grid shared by both tree ensembles."""
    return {
        'bootstrap': [True],
        'criterion': ['mse'],
        'max_features': ['auto'],
        'max_leaf_nodes': [None],
        'min_samples_leaf': [2, 4, 8],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [10, 35, 45],
        'n_jobs': [-1],
        'oob_score': [False],
        'random_state': [42],
        'verbose': [0],
        'warm_start': [False],
    }


# Hyper-parameter search grids, keyed by estimator class name.
# NOTE(review): 'criterion': 'mse', 'max_features': 'auto' and
# LinearRegression's 'normalize' were removed in recent scikit-learn
# releases; confirm the installed version still accepts them.
param_grids = dict(
    RandomForestRegressor=_tree_ensemble_grid(),
    ExtraTreesRegressor=_tree_ensemble_grid(),
    KNeighborsRegressor={'n_neighbors': [10, 12, 15]},
    LinearRegression={
        'normalize': [True],
        'fit_intercept': [True],
        'copy_X': [True],
    },
)

In [117]:
# Left-join the feature-space rows onto the train and test frames by
# (Store, Date). validate='m:1' makes pandas raise if the feature-space
# frame holds duplicate (Store, Date) keys.
_merge_kwargs = dict(how='left', on=['Store', 'Date'], validate='m:1')

# Merge to train
df_train_fs = df_train.merge(df_featurespace, **_merge_kwargs)

# Merge to test
df_test_fs = df_test.merge(df_featurespace, **_merge_kwargs)

In [118]:
# Split train
# kf_splitting groups the merged training rows by (Store, Dept) and returns
# them with a 'fold' column; n_splits_ is the fold count given to its KFold.
df_split = kf_splitting(df_train_fs, n_splits_)

In [None]:
# Model selection: tune each candidate with a grid search, score the tuned
# model with holiday-weighted MAE over the pre-assigned folds, and keep the
# candidate with the lowest mean fold error.
best_model = None
error_cv = 0
best_error = np.inf

# NOTE(review): these column names are taken from the original cell; confirm
# they exist in df_split after the merge.
TARGET_COL = 'weeklySales'
HOLIDAY_COL = 'isHoliday'

features_all = df_split.drop(columns=[TARGET_COL, 'fold'])
target_all = df_split[TARGET_COL]

# gridsearch_wrapper reads X_train / y_train / X_test / y_test from the
# notebook namespace, so define them here rather than relying on kernel state.
X_train, X_test, y_train, y_test = train_test_split(
    features_all, target_all, test_size=0.2, random_state=42
)

for model_ in model_list:

    # Param grid and search ({} when the model has no grid entry)
    param_grid_ = param_grids.get(model_.__name__, {})
    gs, gs_best_p_ = gridsearch_wrapper(model=model_,
                                        grid=param_grid_,
                                        refit_score='mae',
                                        skfold_count=10)
    results = pd.DataFrame(gs.cv_results_)

    # Collected per model so one model's errors never leak into the next.
    fold_errors = []
    for fold in range(n_splits_):

        # Split to train and test
        dataset_train = df_split.loc[df_split['fold'] != fold]
        dataset_test = df_split.loc[df_split['fold'] == fold]
        if dataset_train.empty or dataset_test.empty:
            continue
        train_y = dataset_train[TARGET_COL]
        train_x = dataset_train.drop(columns=[TARGET_COL, 'fold'])
        test_y = dataset_test[TARGET_COL]
        test_x = dataset_test.drop(columns=[TARGET_COL, 'fold'])
        print(" ---- ---- ---- ")
        print("Dataset train and test shapes :: current iteration")
        print(dataset_train.shape, dataset_test.shape)

        # Train / Test Model: model_ is a class, so build a fresh instance
        # with the tuned parameters for every fold.
        model = model_(**gs_best_p_)
        model.fit(train_x, train_y)
        yhat = model.predict(test_x)

        # Holiday rows count five times as much as ordinary rows.
        weights = np.where(test_x[HOLIDAY_COL].astype(bool), 5, 1)
        error = mean_absolute_error(test_y, yhat, sample_weight=weights)
        fold_errors.append(error)
        print(fold, error)

    if not fold_errors:
        continue

    # Compare models on the mean fold error, not on a single fold.
    error_cv = float(np.mean(fold_errors))
    print(model_.__name__, error_cv)
    if error_cv < best_error:
        print('Find best model')
        best_error = error_cv
        best_model = model_(**gs_best_p_)

# Refit the winning configuration on every training row.
if best_model is not None:
    best_model.fit(features_all, target_all)

In [None]:
# LassoLARSCV