### **Constructing the pipelines**

**Pipelines work flow**

* Get X and y matrix
* Drop columns 
* Fill missing values in MarkDowns columns with "0"
* Temperature to categorial (5) bins
* IsHoliday, Type : Convert to categorical numbers
* Add Month feature


### **Importing libraries**

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# Model Selection
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

In [0]:
from google.colab import drive

drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### **Getting the data**

In [0]:
# Path shortcut to files
path = '/content/drive/My Drive/GITHUB REPO/Walmart_forecasting_ML'
features = pd.read_csv(f"{path}/walmart_data/features.csv")
stores = pd.read_csv(f"{path}/walmart_data/stores.csv")
main_set = pd.read_csv(f"{path}/walmart_data/train.csv")

main_set = main_set.merge(stores, how='left', on='Store')
main_set = main_set.merge(features, how='left')

train_set = main_set[main_set['Date']<='2011-12-31']
test_set = main_set[main_set['Date']>'2011-12-31']

pred_set = pd.read_csv(f"{path}/walmart_data/test.csv")
pred_set = pred_set.merge(stores, how='left', on='Store')
pred_set = pred_set.merge(features, how='left')

In [0]:
# Sample ouput
sample = pd.read_csv(f"{path}/walmart_data/sampleSubmission.csv")

In [0]:
train_set.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,,,,,211.350143,8.106


In [0]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294132 entries, 0 to 421526
Data columns (total 16 columns):
Store           294132 non-null int64
Dept            294132 non-null int64
Date            294132 non-null object
Weekly_Sales    294132 non-null float64
IsHoliday       294132 non-null bool
Type            294132 non-null object
Size            294132 non-null int64
Temperature     294132 non-null float64
Fuel_Price      294132 non-null float64
MarkDown1       23700 non-null float64
MarkDown2       20154 non-null float64
MarkDown3       23116 non-null float64
MarkDown4       21042 non-null float64
MarkDown5       23994 non-null float64
CPI             294132 non-null float64
Unemployment    294132 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 36.2+ MB


In [0]:
train_set.columns

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Type', 'Size',
       'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'],
      dtype='object')

### **Data Transformers**

In [0]:
def get_XY(data=None, target=None):
    """
    Return X, y

    Parameters
    ---
    data : DataFrame Object, default None
        data to extract target label.
    target : string, default None
        label of the target column.
    """
    y = data[[target]]
    X = data.drop(labels=target, axis=1)
    return X, y


class DropColumns(BaseEstimator, TransformerMixin):
    """
    Drop specific columns
    """
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.drop(labels=self.columns, axis=1)
        return X


class MonthAdder(BaseEstimator, TransformerMixin):
    """
    Add Month feature to dataframe
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.loc[:, "Date"] = pd.to_datetime(X.loc[:, "Date"])
        X.loc[:, "Month"] = X.loc[:, "Date"].dt.month
        return X


class FillConstant(BaseEstimator, TransformerMixin):
    """
    Fill missing values with constant.
    """
    def __init__(self, columns=None, fill_value=0):
        self.columns = columns
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        columns = self.columns
        X.loc[:, columns] = X[columns].fillna(self.fill_value)
        return X


class ContinuousToCat(BaseEstimator, TransformerMixin):
    """
    Convert continuous attribute to categorical
    """
    def __init__(self, bins, column=None):
        self.bins = bins
        self.column = column
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        label = f"{self.column}_cat"
        X[label] = pd.cut(X[self.column], self.bins, right=True)
        X.drop(labels=self.column, inplace=True, axis=1)
        keys = X[label].unique()
        X[label] = X[label].map(dict(zip(keys, range(len(keys)))))
        return X


class Scaler(BaseEstimator, TransformerMixin):
    """
    Scale specific columns
    """
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = self.columns
        scaler = StandardScaler()
        X[features] = scaler.fit_transform(X[features])
        return X

class Imputer(BaseEstimator, TransformerMixin):
    """
    Impute specific columns with strat
    """
    def __init__(self, columns=None, strategy=None):
        self.columns = columns
        self.strategy = strategy

    def fit(self , X, y=None):
        return self

    def transform(self, X):
        columns = self.columns
        imputer = SimpleImputer(strategy=self.strategy)
        X[columns] = imputer.fit_transform(X[columns])
        return X

In [0]:
# getting X and y sets
X_train, y_train_p = get_XY(data=train_set, target="Weekly_Sales")
X_test, y_test_p = get_XY(data=test_set, target="Weekly_Sales")

#### **Pipeline**

In [0]:
cols = list(X_train[X_train.columns.difference(["IsHoliday", "Type"])].columns)
cols_std = ["Size", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "CPI", "Unemployment"]

pipeline = Pipeline([("fillZero", FillConstant(columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"])),
                     ("tempToCat", ContinuousToCat(5, column="Temperature")),
                     ("addFeature", MonthAdder()), 
                     ("drop", DropColumns(columns="Date")),
                     ("scaler", Scaler(columns=cols_std))])

full_pipeline = ColumnTransformer([("cat", OrdinalEncoder(), ["IsHoliday", "Type"]),
                                   ("pipe", pipeline, cols)])

In [0]:
# Prepared X_train and X_test Sets
X_train_p = full_pipeline.fit_transform(X_train)
X_test_p = full_pipeline.fit_transform(X_test)

In [0]:
pd.DataFrame(X_test_p).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127438 entries, 0 to 127437
Data columns (total 15 columns):
0     127438 non-null object
1     127438 non-null object
2     127438 non-null object
3     127438 non-null object
4     127438 non-null object
5     127438 non-null object
6     127438 non-null object
7     127438 non-null object
8     127438 non-null object
9     127438 non-null object
10    127438 non-null object
11    127438 non-null object
12    127438 non-null object
13    127438 non-null object
14    127438 non-null object
dtypes: object(15)
memory usage: 14.6+ MB


### Model Selection

In [0]:
class ModelEvaluation(object):
    """"
    Parameters
    ---
    models : list type, default None
        models to be train.
    metric : callable function, default None
        Assumes to use absolute error with weights.
    weight: array 1D type, default None
        for weighted estimation.
    """
    def __init__(self, models=None, metric=None, weight=None):
        self.models = models
        self.metric = metric
        self.weight = weight

    def get_score(self, y, y_pred):
        score = self.metric(y, y_pred, sample_weight=self.weight)
        print(f"Metric: {score}")
        return score

    def evaluateCV(self, X, y, cv='warn'):
        """
        Return DataFrame Object with score results with cross-validation

        Parameters
        ---
        X : Dense matrix or DataFrame object
            Matrix to be train.
        y : Dense matrix or DataFrame object
            Data to be predicted.
        cv : integer type, default "warn"
            cross-validation to be set.
        """
        names = self.get_model_names()
        data = pd.DataFrame(index=names, columns=[self.metric.__name__])
        for i, model in enumerate(self.models):
            print(f"Setting up: {names[i]}")
            y_predict = cross_val_predict(model, X, y, cv=cv)
            score = self.get_score(y, y_predict)
            data.iloc[i,0] = score
            print(f"{names[i]}: DONE! [{i+1}/{len(names)}]\n")
        return data.sort_values(self.metric.__name__, ascending=False)

    def evaluateTest(self, X, y, X_test, y_test):
        names = self.get_model_names()
        data = pd.DataFrame(index=names, columns=[self.metric.__name__])
        for i, model in enumerate(self.models):
            estimator = model
            print(f"Setting up: {names[i]}")
            estimator.fit(X, y)
            print(f"Fitted!")
            y_predict = estimator.predict(X_test)
            score = self.get_score(y_test, y_predict)
            data.iloc[i,0] = score
            print(f"{names[i]}: DONE! [{i+1}/{len(names)}]\n")
        return data.sort_values(self.metric.__name__, ascending=False)
    
    def get_model_names(self):
        result = []
        for model in self.models:
            result.append(model.__class__.__name__)
        return result

In [0]:
import warnings

warnings.simplefilter("ignore")

**Evaluation of models (not CV)**

In [0]:
weights = test_set["IsHoliday"].map({True: 5, False: 1})

models = [RandomForestRegressor(random_state=42),
          DecisionTreeRegressor(random_state=42),
          AdaBoostRegressor(random_state=42),
          XGBRegressor(random_state=42),
          LinearRegression()]

evaluator = ModelEvaluation(models=models, metric=mean_absolute_error, weight=weights)
evaluator.evaluateTest(X_train_p, y_train_p, X_test_p, y_test_p)

#### **Testing with cross validation**

In [0]:
# Shuffle data
train_set, test_set = train_test_split(main_set, test_size=0.3, random_state=42)

In [0]:
X_train, y_train_p = get_XY(data=train_set, target="Weekly_Sales")
X_test, y_test_p = get_XY(data=test_set, target="Weekly_Sales")

weights = X_train["IsHoliday"].map({True: 5, False: 1})

In [0]:
# Prepared train and test data
X_train_p = full_pipeline.fit_transform(X_train)
X_test_p = full_pipeline.fit_transform(X_test)

In [0]:
weights = X_train["IsHoliday"].map({True: 5, False: 1})

evaluator = ModelEvaluation(models=models, metric=mean_absolute_error, weight=weights)
evaluator.evaluateCV(X_train_p, y_train_p, cv=5)

Setting up: RandomForestRegressor
Metric: 2001.6962472724197
Metric: 2001.6962472724197
RandomForestRegressor: DONE! [1/5]

Setting up: DecisionTreeRegressor
Metric: 2446.923474235179
Metric: 2446.923474235179
DecisionTreeRegressor: DONE! [2/5]

Setting up: AdaBoostRegressor
Metric: 21030.32234125208
Metric: 21030.32234125208
AdaBoostRegressor: DONE! [3/5]

Setting up: XGBRegressor
Metric: 7112.50752298703
Metric: 7112.50752298703
XGBRegressor: DONE! [4/5]

Setting up: LinearRegression
Metric: 14760.71597497116
Metric: 14760.71597497116
LinearRegression: DONE! [5/5]



Unnamed: 0,mean_absolute_error
AdaBoostRegressor,21030.3
LinearRegression,14760.7
XGBRegressor,7112.51
DecisionTreeRegressor,2446.92
RandomForestRegressor,2001.7


### **Random Forest Regressor**

#### **1st hyper-parameters tunning**

Number of iterations: **15**

In [0]:
param_grid_rand_reg = {
    "n_estimators": range(18 , 25, 2),
    "max_depth": range(25, 40, 3),
    "min_samples_split": range(2, 8),
    "min_samples_leaf": range(2, 7),
    "max_features": ["auto", "sqrt", "log2", None],
    "bootstrap": [True, False],
}

grid_search = RandomizedSearchCV(RandomForestRegressor(), param_grid_rand_reg, scoring='neg_mean_absolute_error', cv=4, return_train_score=True, n_iter=15)

In [0]:
grid_search.fit(X_train_p, y_train_p)

RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [0]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 28,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 20}

In [0]:
grid_search.best_score_

-1719.9645712787956

In [0]:
grid_search.cv_results_['mean_test_score']

array([-2031.72058374, -7777.73154559, -7726.60537201, -1777.4953301 ,
       -2026.66739129, -1780.15341493, -1721.95242901, -2022.18845703,
       -1754.63498339, -7759.73116357, -1955.37933443, -1719.96457128,
       -1955.3991988 , -2010.70525538, -1950.54520091])

In [0]:
random_best = grid_search.best_estimator_ 

In [0]:
random_best.fit(X_train_p, y_train_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=28,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=6,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
weights = X_test["IsHoliday"].map({True: 5, False: 1})
evaluate = ModelEvaluation(models=[random_best], metric=mean_absolute_error, weight=weights)
evaluate.evaluateTest(X_train_p, y_train_p, X_test_p, y_test_p)

Setting up: RandomForestRegressor
Fitted!
Metric: 2044.7933116930292
RandomForestRegressor: DONE! [1/1]



Unnamed: 0,mean_absolute_error
RandomForestRegressor,2044.79


##### **Prediction**

In [0]:
X_main, y_main_p = get_XY(data=main_set, target="Weekly_Sales")
X_main_p = full_pipeline.fit_transform(X_main)
X_prediction_p = full_pipeline.fit_transform(pred_set)

###### 1st attempt prediction




In [0]:
params = {'bootstrap': True,
 'max_depth': 28,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 20}
random_1st = RandomForestRegressor(**params)

In [0]:
random_1st.fit(X_main_p, y_main_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=28,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=6,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
X_prediction_p

array([[0.0, 0.0, 1.1703895648518932, ..., 0.5690954894375756, 0, 11],
       [0.0, 0.0, 1.170595711028827, ..., 0.5690954894375756, 1, 11],
       [0.0, 0.0, 1.1709473344560106, ..., 0.5690954894375756, 0, 11],
       ...,
       [0.0, 1.0, -1.3159013267511108, ..., -1.3178958191834687, 1, 7],
       [0.0, 1.0, -1.3159013267511108, ..., -1.3178958191834687, 2, 7],
       [0.0, 1.0, -1.3159013267511108, ..., -1.3178958191834687, 1, 7]],
      dtype=object)

In [0]:
# get predictions
y_predict = random_1st.predict(X_prediction_p)

**Submit**

In [0]:
sample['Weekly_Sales'] = y_predict
sample.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,24687.015963
1,1_1_2012-11-09,24837.893428
2,1_1_2012-11-16,24201.357463
3,1_1_2012-11-23,23599.712969
4,1_1_2012-11-30,23010.02276


In [0]:
sample.to_csv(path_or_buf=f"{path}/walmart_data/1stAttempt.csv", index=False)

**Score** : *5356.12302*

###### 2nd attempt prediction

fill missing values in CPI and Unemployment with median

In [0]:
# Change pipeline
cols = list(X_train[X_train.columns.difference(["IsHoliday", "Type"])].columns)
cols_std = ["Size", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "CPI", "Unemployment"]

pipeline = Pipeline([("fillZero", FillConstant(columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], fill_value=0)),
                     ('imputeMedian', Imputer(columns=['CPI', 'Unemployment'], strategy='median')), # Impute missing values with median
                     ("tempToCat", ContinuousToCat(5, column="Temperature")),
                     ("addFeature", MonthAdder()), 
                     ("drop", DropColumns(columns="Date")),
                     ("scaler", Scaler(columns=cols_std))])

full_pipeline = ColumnTransformer([("cat", OrdinalEncoder(), ["IsHoliday", "Type"]),
                                   ("pipe", pipeline, cols)])

In [0]:
X_main, y_main_p = get_XY(data=main_set, target="Weekly_Sales")
X_main_p = full_pipeline.fit_transform(X_main)
X_prediction_p = full_pipeline.fit_transform(pred_set)

In [0]:
params = {'bootstrap': True,
 'max_depth': 28,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 20}

random_2nd = RandomForestRegressor(**params)

In [0]:
random_2nd.fit(X_main_p, y_main_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=28,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=6,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
y_predict = random_2nd.predict(X_prediction_p)
y_predict

array([20876.37420952, 22039.57061488, 21953.20985218, ...,
         541.55954008,   639.56963651,   626.11845317])

In [0]:
sample['Weekly_Sales'] = y_predict
sample.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,20876.37421
1,1_1_2012-11-09,22039.570615
2,1_1_2012-11-16,21953.209852
3,1_1_2012-11-23,24882.239237
4,1_1_2012-11-30,24706.548125


In [0]:
sample.to_csv(f"{path}/walmart_data/2ndAttempt.csv", index=False)

**Score**: 4690.63219

###### 3rd attempt prediction

In [0]:
X_main.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,True,A,151315,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,False,A,151315,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,False,A,151315,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,False,A,151315,46.5,2.625,,,,,,211.350143,8.106


In [0]:
# Change pipeline
cols = list(X_train[X_train.columns.difference(["IsHoliday", "Type", "CPI", "Unemployment"])].columns)
cols_std = ["Size", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

pipeline = Pipeline([("fillZero", FillConstant(columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], fill_value=0)),
                     ("tempToCat", ContinuousToCat(5, column="Temperature")),
                     ("addFeature", MonthAdder()),
                     ("drop", DropColumns(columns="Date")),
                     ("scaler", Scaler(columns=cols_std))])

full_pipeline = ColumnTransformer([("cat", OrdinalEncoder(), ["IsHoliday", "Type"]), 
                                   ("pipe", pipeline, cols)])

In [0]:
X_main, y_main_p = get_XY(data=main_set, target="Weekly_Sales")
X_main_p = full_pipeline.fit_transform(X_main)
X_prediction_p = full_pipeline.fit_transform(pred_set)

In [0]:
params = {'bootstrap': True,
 'max_depth': 28,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 20}

random_3rd = RandomForestRegressor(**params)

In [0]:
random_3rd.fit(X_main_p, y_main_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=28,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=6,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
y_predict = random_3rd.predict(X_prediction_p)
y_predict

array([21615.75684286, 20122.10598155, 20721.87881071, ...,
         804.9716371 ,   890.15464067,   773.82550972])

In [0]:
sample["Weekly_Sales"] = y_predict
sample.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,21615.756843
1,1_1_2012-11-09,20122.105982
2,1_1_2012-11-16,20721.878811
3,1_1_2012-11-23,22192.522327
4,1_1_2012-11-30,19568.569821


In [0]:
sample.to_csv(f"{path}/walmart_data/3rdAttempt.csv", index=False)
print("Done!")

Done!


**Score** : 3894.00083

#### **2nd hyper-parameter tunning**

Number of iterations: **35**

In [0]:
param_grid_rand_reg = {
    "n_estimators": range(10, 120, 2),
    "max_depth": range(3, 70, 2),
    "min_samples_split": range(2, 21, 2),
    "min_samples_leaf": range(2, 21, 2),
    "max_features": ["auto", "sqrt", "log2", None],
    "bootstrap": [True, False],
}

grid_search = RandomizedSearchCV(RandomForestRegressor(), param_grid_rand_reg, scoring='neg_mean_absolute_error', cv=4, return_train_score=True, n_iter=35)

In [0]:
start = time.time()
grid_search.fit(X_train_p, y_train_p)
stop = time.time()

print(f"Execution time: {stop - start}")
print(grid_search)

Execution time: 9731.921213388443
RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                               

In [0]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 53,
 'max_features': None,
 'min_samples_leaf': 6,
 'min_samples_split': 14,
 'n_estimators': 78}

In [0]:
grid_search.best_score_

-1761.7064458605826

In [0]:
grid_search.cv_results_['mean_test_score']

array([ -1963.91425527,  -5113.93054128,  -1945.34430238,  -1967.23094648,
        -2867.63686725,  -8088.95411879,  -9424.91398465,  -2044.75410625,
       -10668.54719881,  -8729.38028159,  -7597.362017  ,  -1972.1442478 ,
        -9445.26235097,  -1764.34450242,  -1958.2039217 ,  -2103.21243829,
        -1994.34213719,  -7743.55210516,  -7654.34280525,  -3467.45997166,
        -8114.82592141,  -1773.7364137 , -12388.65048037,  -2621.34434841,
        -7876.53268647,  -6565.20469818,  -1761.70644586,  -2074.7064629 ,
        -1970.41973501,  -7590.95470415,  -8448.74130211, -11363.6364038 ,
        -8219.17021418,  -1958.17829412, -13592.99154927])

In [0]:
params = {'bootstrap': True,
 'max_depth': 53,
 'max_features': None,
 'min_samples_leaf': 6,
 'min_samples_split': 14,
 'n_estimators': 78}

random_2nd = RandomForestRegressor(**params)

In [0]:
weights = X_test["IsHoliday"].map({True: 5, False: 1})

evaluate = ModelEvaluation(models=[random_2nd], metric=mean_absolute_error, weight=weights)
evaluate.evaluateTest(X_train_p, y_train_p, X_test_p, y_test_p)

Setting up: RandomForestRegressor
Fitted!
Metric: 2099.221680557113
RandomForestRegressor: DONE! [1/1]



Unnamed: 0,mean_absolute_error
RandomForestRegressor,2099.22


##### Prediction

###### 1st attempt prediction

In [0]:
cols = list(X_train[X_train.columns.difference(["IsHoliday", "Type", "CPI", "Unemployment"])].columns)
cols_std = ["Size", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

pipeline = Pipeline([("fillZero", FillConstant(columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], fill_value=0)),
                     ("tempToCat", ContinuousToCat(5, column="Temperature")),
                     ("addFeature", MonthAdder()),
                     ("drop", DropColumns(columns="Date")),
                     ("scaler", Scaler(columns=cols_std))])

full_pipeline = ColumnTransformer([("cat", OrdinalEncoder(), ["IsHoliday", "Type"]), 
                                   ("pipe", pipeline, cols)])

In [0]:
X_main, y_main_p = get_XY(data=main_set, target="Weekly_Sales")
X_main_p = full_pipeline.fit_transform(X_main)
X_prediction_p = full_pipeline.fit_transform(pred_set)

In [0]:
params = {'bootstrap': True,
 'max_depth': 53,
 'max_features': None,
 'min_samples_leaf': 6,
 'min_samples_split': 14,
 'n_estimators': 78}

random_1st = RandomForestRegressor(**params)

In [0]:
random_1st.fit(X_main_p, y_main_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=53,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=6, min_samples_split=14,
                      min_weight_fraction_leaf=0.0, n_estimators=78,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
y_predict = random_1st.predict(X_prediction_p)
y_predict

array([22228.75794368, 21378.04530063, 22253.95794493, ...,
         739.14820125,   819.30690439,   822.2699435 ])

In [0]:
sample["Weekly_Sales"] = y_predict
sample.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,22228.757944
1,1_1_2012-11-09,21378.045301
2,1_1_2012-11-16,22253.957945
3,1_1_2012-11-23,23853.155336
4,1_1_2012-11-30,22776.820989


In [0]:
sample.to_csv(f"{path}/walmart_data/1stAttempt (2ndTunning).csv", index=False)
print("Done!")

Done!


#### **3rd hyper-parameters tunning**

Number of iterations : **45**

In [0]:
cols = list(main_set[X_train.columns.difference(["IsHoliday", "Type", "CPI", "Unemployment"])].columns)
cols_std = ["Size", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

pipeline = Pipeline([("fillZero", FillConstant(columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], fill_value=0)),
                     ("tempToCat", ContinuousToCat(5, column="Temperature")),
                     ("addFeature", MonthAdder()),
                     ("drop", DropColumns(columns="Date")),
                     ("scaler", Scaler(columns=cols_std))])

full_pipeline = ColumnTransformer([("cat", OrdinalEncoder(), ["IsHoliday", "Type"]), 
                                   ("pipe", pipeline, cols)])

In [0]:
X_train, y_train_p = get_XY(data=train_set, target="Weekly_Sales")
X_test, y_test_p = get_XY(data=test_set, target="Weekly_Sales")

X_train_p = full_pipeline.fit_transform(X_train)
X_test_p = full_pipeline.fit_transform(X_test)

In [0]:
param_grid_rand_reg = {
    "n_estimators": range(10, 120, 2),
    "max_depth": range(3, 70, 2),
    "min_samples_split": range(2, 21, 2),
    "min_samples_leaf": range(2, 21, 2),
    "max_features": ["auto", "sqrt", "log2", None],
    "bootstrap": [True, False],
}

grid_search = RandomizedSearchCV(RandomForestRegressor(), param_grid_rand_reg, scoring='neg_mean_absolute_error', cv=4, return_train_score=True, n_iter=45)

In [0]:
start = time.time()
grid_search.fit(X_train_p, y_train_p)
stop = time.time()

print(f"Execution time: {stop - start}")
print(grid_search)

Execution time: 8717.646938562393
RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                               

In [0]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 53,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 92}

In [0]:
grid_search.best_score_

-1691.005120532815

In [0]:
grid_search.cv_results_["mean_test_score"]

array([ -6683.93121836,  -8792.28317267,  -6318.42433407,  -2343.32027552,
        -6675.20074204,  -2072.76113342,  -1997.17846247,  -8168.23064027,
        -7300.22750764,  -2128.25453773,  -1910.96917514,  -6371.46712398,
        -7232.22217873,  -2007.88661436,  -2097.80844353,  -6403.76882913,
        -1715.35550457,  -1942.62998443,  -9493.08831036,  -7437.50947218,
        -7034.46901334,  -1990.67703589, -10310.97209487,  -1691.00512053,
        -6939.16339389,  -1910.58406147,  -7303.13493348,  -7144.90211236,
        -1966.63921755,  -6621.16191811,  -6575.77136341,  -6249.68220836,
        -7556.57581709,  -1766.10792744,  -1982.96887534,  -6891.12364461,
        -1945.05715272,  -7017.89854367,  -6355.79086896,  -6936.28357638,
        -9492.80536452,  -1819.48528776,  -7135.78767779,  -1945.06262058,
        -4912.77569202])

In [0]:
params = {'bootstrap': True,
 'max_depth': 53,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 92}

random_3rd = RandomForestRegressor(**params)

In [0]:
weights = X_test["IsHoliday"].map({True: 5, False: 1})

evaluate = ModelEvaluation(models=[random_3rd], metric=mean_absolute_error, weight=weights)
evaluate.evaluateTest(X_train_p, y_train_p, X_test_p, y_test_p)

Setting up: RandomForestRegressor
Fitted!
Metric: 2096.4112208087604
RandomForestRegressor: DONE! [1/1]



Unnamed: 0,mean_absolute_error
RandomForestRegressor,2096.41


##### Prediction

In [0]:
X_main, y_main_p = get_XY(data=main_set, target="Weekly_Sales")
X_main_p = full_pipeline.fit_transform(X_main)
X_prediction_p = full_pipeline.fit_transform(pred_set)

###### 1st attempt prediction

In [0]:
params = {'bootstrap': True,
 'max_depth': 53,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 92}

random_1st = RandomForestRegressor(**params)

In [0]:
random_1st.fit(X_main_p, y_main_p)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=53,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=6,
                      min_weight_fraction_leaf=0.0, n_estimators=92,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
y_predict = random_1st.predict(X_prediction_p)
y_predict

array([23192.80802775, 21777.23394062, 22118.8993823 , ...,
         795.0157566 ,   791.14823766,   829.01162313])

In [0]:
sample["Weekly_Sales"] = y_predict
sample.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,23192.808028
1,1_1_2012-11-09,21777.233941
2,1_1_2012-11-16,22118.899382
3,1_1_2012-11-23,24442.363596
4,1_1_2012-11-30,22676.112774


In [0]:
sample.to_csv(f"{path}/walmart_data/1stAttempt (3rdTunning).csv", index=False)
print("Done!")

Done!


**Score** : 3883.08964