This is a competition on Kaggle https://www.kaggle.com/c/favorita-grocery-sales-forecasting.
<br><br>
#### 1 Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.tabular import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import forest
from IPython.display import display
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

from sklearn import metrics

In [3]:
PATH = 'data/grocery-sales/'
!ls {PATH}

holidays_events.csv  oil.csv		    stores.csv	train.csv
items.csv	     sample_submission.csv  test.csv	transactions.csv


<br><br>

#### 2 Read Data

When reading CSV files with **low_memory = False**, pandas will use as much memory as it wants, and the system can then run out of memory regardless of how much memory you have.
<br><br>
To limit the amount of space when the system is reading in the data, we create a dictionary to regulate the data type of each column in the training dataframe.
<br><br>
`object` is a very general and memory-heavy dtype. The *onpromotion* column contains both NaN values and string values, and we want to convert them to booleans.
So, in this case, we first have to read the column with this general dtype.

In [4]:
# Explicit per-column dtypes passed to read_csv, chosen to keep memory usage down.
# 'onpromotion' stays 'object' because the column mixes NaN and string values.

types = {'id': 'int64',
        'item_nbr': 'int32',
        'store_nbr': 'int8',
        'unit_sales': 'float32',
        'onpromotion': 'object'}

In [5]:
# %%time
# df_all = pd.read_csv(f'{PATH}train.csv', parse_dates = ['date'], dtype = types,
#                     infer_datetime_format = True)

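# NOTE: map the strings first and fill NaN afterwards. Filling with the boolean False
# first leaves values that are not keys of the string map, so map() turns them into NaN
# and astype(bool) then turns those NaN into True.
# df_all.onpromotion = df_all.onpromotion.map({'False': False, 'True': True})
# df_all.onpromotion = df_all.onpromotion.fillna(False).astype(bool)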

# df_all.to_feather('tmp/raw_groceries')

# df_all.describe(include='all')

In [6]:
# Load the cached frame (the commented-out CSV-loading cell writes it to 'tmp/raw_groceries').
df_all = pd.read_feather('tmp/raw_groceries')

''' The competition requires us to replace the negative values with 0.
Function clip() brings numbers smaller than the lower bound (the second parameter) to the lower bound.
In this case, it transforms negative values to 0 '''

# Clip negative sales to 0, then replace unit_sales with log(1 + unit_sales) via np.log1p
# (the original note states the log transform is required by the competition).
df_all.unit_sales = np.log1p(np.clip(df_all.unit_sales, 0, None))

In [7]:
# Keep only the rows whose index label is >= 100,000,000 before expanding the date column.
# Memory errors were hit in this step (presumably when run on the full frame -- TODO confirm).
# add_datepart() yields the Year/Month/.../Elapsed columns visible in the cell output.

df_all = df_all.truncate(before=100000000)
%time add_datepart(df_all, 'date')

CPU times: user 26 s, sys: 4.05 s, total: 30 s
Wall time: 7.07 s


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
100000000,100000000,18,1047786,1.609438,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000001,100000001,18,1047790,2.772589,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000002,100000002,18,1050142,1.609438,False,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000003,100000003,18,1052563,3.637586,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000004,100000004,18,1053943,2.995732,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000005,100000005,18,1053944,2.302585,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000006,100000006,18,1053945,2.397895,False,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000007,100000007,18,1054129,0.693147,False,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000008,100000008,18,1057033,3.091043,True,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000
100000009,100000009,18,1057034,0.693147,False,2016,12,50,15,3,350,False,False,False,False,False,False,1481760000


In [8]:
def split_vals(a, n):
    '''Split `a` at position `n` and return copies of the two parts as (a[:n], a[n:]).'''
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [9]:
# Read the test set with the same dtype map as the training data.
# In the cells shown here it is only used for its row count (len(df_test)).
df_test = pd.read_csv(f'{PATH}test.csv', parse_dates = ['date'], dtype = types,
                     infer_datetime_format = True)

In [10]:
# Hold out the last len(df_test) rows of df_all as the validation set, so that the
# validation set has as many rows as the test set; everything before it is training data.

n_valid = len(df_test)
n_trn = len(df_all) - n_valid

train, valid = split_vals(df_all, n_trn)

In [11]:
''' Since all values are numeric now, we do not have to convert them to categorical values.
If there are string values, we convert them using train_cats().
Also, we apply the same categorical convertion to the validation set as the training set
using apply_cats().
'''
# train_cats(raw_train)
# apply_cats(raw_valid, raw_train)

' Since all values are numeric now, we do not have to convert them to categorical values.\nIf there are string values, we convert them using train_cats().\nAlso, we apply the same categorical convertion to the validation set as the training set\nusing apply_cats().\n'

In [12]:
# Copied from the old version of fastai (fastai.structured) 
# https://github.com/fastai/fastai/blob/master/old/fastai/structured.py
def numericalize(df, col, name, max_n_cat):
    ''' Replace a non-numeric column of `df` with its integer category codes plus 1.

    Because of the +1 shift, missing values are encoded as 0 instead of -1.
    Numeric columns are left untouched. When `max_n_cat` is given, only columns
    with more than `max_n_cat` categories are converted (`col` must then be
    categorical, since `col.cat` is read).'''
    # Numeric columns need no encoding.
    if is_numeric_dtype(col):
        return
    if max_n_cat is None or len(col.cat.categories) > max_n_cat:
        df[name] = pd.Categorical(col).codes + 1
    
    
def fix_missing(df, col, name, na_dict):
    ''' Fill missing values of a numeric column in place and record the fill value.

    A numeric column is processed when it contains missing values or when `name`
    is already a key of `na_dict`. In that case a boolean `<name>_na` column
    flagging the missing rows is added to `df`, the missing values are replaced by
    `na_dict[name]` if present (otherwise by the column median), and the fill value
    is stored in `na_dict[name]`. Non-numeric columns are ignored.

    Returns `na_dict` (the same object, possibly updated).'''
    if not is_numeric_dtype(col):
        return na_dict

    missing_mask = pd.isnull(col)
    if missing_mask.sum() or (name in na_dict):
        df[name + '_na'] = missing_mask
        if name in na_dict:
            fill_value = na_dict[name]
        else:
            fill_value = col.median()
        df[name] = col.fillna(fill_value)
        na_dict[name] = fill_value
    return na_dict


def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    ''' Turn a dataframe into an all-numeric frame plus an optional target array.

    Processing happens on a copy of `df` (or on `get_sample(df, subset)` when
    `subset` is given).

    Parameters:
        df: source dataframe.
        y_fld: name of the target column, or None. A non-numeric target is replaced
            by its category codes; the column is removed from the returned frame.
        skip_flds: columns to drop before processing. The caller's list is never
            modified.
        ignore_flds: columns set aside untouched and re-attached (first) at the end.
        do_scale: when true, `scale_vars(df, mapper)` is applied and the resulting
            mapper is appended to the return value.
        na_dict: mapping column name -> fill value for missing data; a copy is
            updated and returned. When a non-empty dict is passed, `_na` indicator
            columns for columns that were not already keys of it are dropped.
        preproc_fn: optional callable invoked as `preproc_fn(df)` on the working copy.
        max_n_cat: forwarded to `numericalize`.
        subset: forwarded to `get_sample` when truthy.
        mapper: forwarded to `scale_vars` when `do_scale` is true.

    Returns:
        `[df, y, na_dict]`, or `[df, y, na_dict, mapper]` when `do_scale` is true.
        `y` is None when `y_fld` is None.
    '''
    # NOTE(review): get_sample and scale_vars are not defined in the visible part of
    # this notebook; `subset=` / `do_scale=True` only work if they are importable
    # from elsewhere -- confirm before using those options.
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        # Build a new list instead of using `+=`, which would append y_fld to the
        # caller's own skip_flds list and leak into later calls.
        skip_flds = list(skip_flds) + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        # Keep only the `_na` indicators for the columns named in the supplied
        # na_dict, so the output columns match the frame that produced it.
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    # Any remaining non-numeric columns are one-hot encoded (with a NaN indicator).
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

def set_rf_samples(n):
    ''' Monkey-patch `forest._generate_sample_indices` so that every call draws
    `n` random indices from [0, n_samples) instead of using the library default.

    The replacement seeds itself through `forest.check_random_state(rs)`.'''
    def _draw_indices(rs, n_samples):
        return forest.check_random_state(rs).randint(0, n_samples, n)

    forest._generate_sample_indices = _draw_indices

In [21]:
# Call proc_df() to split off the target and apply the missing-value handling and
# other transformations. The training na_dict is reused for the validation set so
# that both frames are filled with the same values and end up with the same columns.
trn, y, nas = proc_df(train, 'unit_sales')
val, y_val, _ = proc_df(valid, 'unit_sales', na_dict=nas)

<br> <br>

#### 3 Models

In [22]:
def rmse(x, y):
    '''Return the root mean squared difference between `x` and `y`.'''
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())


def print_score(m):
    ''' Print the scores of fitted model `m` as a list:
    [train RMSE, validation RMSE, train score, validation score],
    followed by `m.oob_score_` when the model has that attribute.

    Reads the notebook globals `x`, `y` (training) and `val`, `y_val` (validation).'''
    train_rmse = rmse(m.predict(x), y)
    valid_rmse = rmse(m.predict(val), y_val)
    train_score = m.score(x, y)
    valid_score = m.score(val, y_val)

    scores = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [23]:
# Patch the forest's index sampler (see set_rf_samples) to draw 1,000,000 indices per call.
set_rf_samples(1_000_000)

In [25]:
''' By default, the random forest will do this conversion internally.
If we use several decision trees, this conversion will be performed for several times.
If we do this conversion by ourselves, we only need to do once;
therefore, we can save a lot of time.
'''
# One-off conversion of the processed training frame to a float32 array.
# print_score() reads this global `x` as the training input.
x = np.array(trn, dtype=np.float32)

In [28]:
# 20 trees, each leaf holding at least 100 samples; n_jobs=-1 uses all available cores.
# NOTE(review): no random_state is passed, so the fitted forest and the scores
# printed afterwards can differ from run to run.
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=100, n_jobs=-1)
m.fit(x, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=100, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
# Prints [train RMSE, validation RMSE, train score, validation score]
# (plus the OOB score if the model exposes one).
print_score(m)

[0.7513051788835897, 0.7524448969443109, 0.2675414401080748, 0.2464653966808745]


<br> <br> 
In this example, even if **min_samples_leaf** is reduced to 3 or even 1, the score does not improve much. Therefore, random forest does not work well on this dataset.
<br> <br> <br> 
A random forest does nothing except create a bunch of binary splits. However, if we look at the dataset, we notice that *unit_sales* logically has little relationship with merely the date, the store number, and whether the item is on promotion. In general, the data we use to train the model has little relation to the value we are required to predict.