In [12]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

In [2]:
# Functions

In [8]:
# Load inputs: the feature-space parquet from output/ and the train/test CSVs
# from data/. Paths are relative to the notebook's working directory.
df_featurespace = pd.read_parquet("output/01_featurespace.parquet")
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [9]:
# Attach the feature-space columns to the train and test frames.
# - how='left' keeps every row of the left-hand (train/test) frame.
# - validate='m:1' makes pandas raise if the feature table has more than one
#   row for the same (Store, Date) key, so the join cannot multiply rows.
_feature_join = dict(how='left', on=['Store', 'Date'], validate='m:1')
df_train_fs = df_train.merge(df_featurespace, **_feature_join)
df_test_fs = df_test.merge(df_featurespace, **_feature_join)

In [None]:
# Split / KFold
# Give every training row a cross-validation fold id in 0..n_splits-1.
# Folds are assigned independently inside each (Store, Dept) group, so each
# fold receives a slice of every group that is large enough to be split.
kf = KFold(n_splits=5)
# Seeded generator so the random fold assignment of small groups is
# reproducible under Restart & Run All.
fold_rng = np.random.default_rng(42)
df_split = []
for _, group in df_train_fs.groupby(["Store", "Dept"]):
    group = group.reset_index(drop=True)
    n_rows = group.shape[0]
    if n_rows <= kf.n_splits:
        # Too few rows to populate every fold with KFold: put each row in its
        # own, randomly chosen, distinct fold instead.
        folds = fold_rng.permutation(kf.n_splits)[:n_rows]
    else:
        # KFold without shuffling yields contiguous positional test indices;
        # the index was reset above, so positions address rows directly.
        folds = np.empty(n_rows, dtype=int)
        for fold, (_train_index, test_index) in enumerate(kf.split(group)):
            folds[test_index] = fold
    # Small groups are kept as well. Previously a `continue` skipped this
    # append, so their rows were assigned a fold and then silently dropped.
    df_split.append(group.assign(fold=folds))

df_split = pd.concat(df_split).reset_index(drop=True)

In [None]:
# Cross-validate over the fold ids stored in df_split['fold'].
# For each fold: hold that fold out, fit on the rest, score the hold-out with
# a weighted MAE (holiday rows weigh 5, other rows 1), and remember the model
# with the lowest fold error. error_cv ends up as the mean error over folds.
n_folds = 5
best_model = None
error_cv = 0
best_error = np.inf  # any finite error beats this sentinel
for fold in range(n_folds):

    # Split to train and test
    is_holdout = df_split['fold'] == fold
    dataset_train = df_split.loc[~is_holdout]
    dataset_test = df_split.loc[is_holdout]
    # NOTE(review): assumes df_split has 'weeklySales' and 'isHoliday' columns
    # with exactly this casing - confirm against the merged frame.
    train_y = dataset_train['weeklySales']
    train_x = dataset_train.drop(columns=['weeklySales', 'fold'])
    test_y = dataset_test['weeklySales']
    test_x = dataset_test.drop(columns=['weeklySales', 'fold'])
    print(" ---- ---- ---- ")
    print("Dataset train and test shapes :: current iteration")
    print(dataset_train.shape, dataset_test.shape)

    # Train / Test Model
    # This call was commented out, which left `predicted` and `model`
    # undefined below. It is expected to return (predictions, fitted model).
    # NOTE(review): train_and_predict is not defined in the cells visible
    # here - it must be defined (e.g. in the "Functions" cell) before running.
    predicted, model = train_and_predict(train_x, train_y, test_x)

    # Holiday rows count five times as much as ordinary rows.
    weights = test_x['isHoliday'].map({True: 5, False: 1})
    # sample_weight must be passed by keyword in current scikit-learn.
    error = mean_absolute_error(test_y, predicted, sample_weight=weights)
    error_cv += error
    print(fold, error)
    if error < best_error:
        print('Find best model')
        best_error = error
        best_model = model
error_cv /= n_folds