#### Author: João Pedro Mantoan
#### LinkedIn: http://linkedin.com/in/jo%C3%A3o-pedro-mantoan

# Introduction

The objective of this competition is to forecast the sales for departments in Walmart stores based on historical sales data for 45 Walmart stores located in different regions. 

# Importing dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
import random

# Loading the datasets

In [None]:
# Load the competition tables from the Kaggle input directory.
# pandas reads the zipped CSVs directly (compression is inferred from the extension).
train=pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
test=pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')
features=pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
# Submission template; its Weekly_Sales column is overwritten with predictions at the end.
sample_sub=pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip')
stores=pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')

# Data preparation, exploration and cleaning

In [None]:
# Enrich the features table with the matching row of the stores table.
feature_store = pd.merge(features, stores, on='Store', how='inner')

In [None]:
# Preview the first rows of the combined features/stores table.
feature_store.head()

In [None]:
# Attach the store/feature columns to every training row.
merge_keys = ['Store', 'Date', 'IsHoliday']
train = pd.merge(train, feature_store, on=merge_keys, how='inner')
train.head()

In [None]:
# Attach the same store/feature columns to every test row.
test = pd.merge(
    test,
    feature_store,
    on=['Store', 'Date', 'IsHoliday'],
    how='inner',
)
test.head()

In [None]:
# (rows, columns) of each table after the merges above.
features.shape, train.shape, stores.shape, test.shape, sample_sub.shape

### Train and test data types

In [None]:
# Column dtypes of the merged training table.
train.dtypes

In [None]:
# Column dtypes of the merged test table.
test.dtypes

#### Converting the Date column from string to datetime

In [None]:
# Parse the Date strings into datetime64 values in both tables.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

### Descriptive statistics and null values verification

In [None]:
# Summary statistics for everything except the date, flag and identifier columns.
# `drop` already returns a new frame, so `train` itself is left untouched.
excluded_columns = ['Date', 'IsHoliday', 'Type', 'Store']
train.drop(columns=excluded_columns).describe().round(2)

In [None]:
# Number of missing values per column in the training table.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test table.
test.isnull().sum()

#### There are some null values in multiple variables that will be treated after further analysis

#### Checking for negative sales 
These are probably errors, so the affected rows will be removed from the training data.

In [None]:
# Show the training rows whose Weekly_Sales is negative.
train[train['Weekly_Sales'] < 0]

In [None]:
# Remove, in place, every training row with negative Weekly_Sales.
negative_rows = train.index[train['Weekly_Sales'] < 0]
train.drop(index=negative_rows, inplace=True)

### Encoding categorical data

In [None]:
# Integer code for each store type; an unknown type raises KeyError.
sup_dict = {'A': 1, 'B': 2, 'C': 3}
train['Type'] = [sup_dict[store_type] for store_type in train['Type']]
train.Type.unique()

In [None]:
# Apply the same store-type codes to the test table (KeyError on an unknown type).
test['Type'] = [sup_dict[store_type] for store_type in test['Type']]
test.Type.unique()

In [None]:
# Turn the holiday flag into 0/1: values equal to False become 0, everything else 1.
for frame in (train, test):
    frame['IsHoliday'] = np.where(frame['IsHoliday'] == False, 0, 1)

### Splitting Date into Year, Month, Week, Day

This allows a better understanding of the relationship between the target and the date info

In [None]:
# Calendar features derived from the Date column of the training table.
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
# `Series.dt.week` is deprecated (removed in pandas 2.0). `isocalendar().week`
# gives the same ISO week number but as UInt32, so cast back to a plain int64.
train['Week'] = train['Date'].dt.isocalendar().week.astype('int64')
train['Day'] = train['Date'].dt.day

In [None]:
# Calendar features derived from the Date column of the test table.
test['Year'] = test['Date'].dt.year
test['Month'] = test['Date'].dt.month
# `Series.dt.week` is deprecated (removed in pandas 2.0). `isocalendar().week`
# gives the same ISO week number but as UInt32, so cast back to a plain int64.
test['Week'] = test['Date'].dt.isocalendar().week.astype('int64')
test['Day'] = test['Date'].dt.day

#### Plotting the mean weekly sales for each of the three years of data

In [None]:
# One line per year: mean Weekly_Sales across all stores/departments for each week number.
plt.figure(figsize=(20, 10))
for year in (2010, 2011, 2012):
    weekly_mean = train.loc[train['Year'] == year].groupby('Week')['Weekly_Sales'].mean()
    # x/y are passed by keyword: recent seaborn releases reject positional data vectors.
    sns.lineplot(x=weekly_mean.index, y=weekly_mean.values, label=str(year))

plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend()
plt.show()

#### Reordering the variables in the dataset for better visualisation of the correlation between them and the target

In [None]:
# Shared column order for both tables; the target goes last in the training table.
ordered_features = [
    'Store', 'Dept', 'Date', 'Year', 'Month', 'Week', 'Day', 'IsHoliday',
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Type', 'Size',
]
train = train[ordered_features + ['Weekly_Sales']]
test = test[ordered_features]


#### Dropping the Date column since it is now split into multiple columns


In [None]:
# `DataFrame.drop` returns a new frame, so the result must be assigned back;
# without the assignment the Date column was never actually removed.
train = train.drop(columns='Date')
test = test.drop(columns='Date')

### Plotting the Pearson correlation between the continuous variables


In [None]:
# Pearson correlation of the remaining columns with each other and with the target.
# 'Date' is excluded as well (ignored if it is already gone) so that `.corr()`
# only ever sees numeric columns.
corr = train.drop(
    columns=['Year', 'Month', 'Week', 'Day', 'IsHoliday', 'Type', 'Date'],
    errors='ignore',
).corr()
# `Styler.set_precision` has been removed from pandas; `format(precision=...)` replaces it.
corr.style.background_gradient(cmap='coolwarm').format(precision=3)

#### Treating any remaining null values in the datasets

In [None]:
# Work on copies so the null handling below leaves `train` and `test` unchanged.
train_dataset = train.copy()
test_dataset = test.copy()

In [None]:
# Missing values per column before imputation.
train_dataset.isnull().sum()

In [None]:
# Impute every missing numeric value with its column mean.
# `numeric_only=True` keeps the mean well-defined if a non-numeric column
# (e.g. a datetime or string column) is still present in the frame.
train_dataset.fillna(train_dataset.mean(numeric_only=True), inplace=True)

In [None]:
# Missing values per column after the mean imputation.
train_dataset.isnull().sum()

In [None]:
# Missing values per column in the test copy.
test_dataset.isnull().sum()

In [None]:
# Shape of the subset of test rows where Unemployment is missing.
test[test['Unemployment'].isnull()].shape

In [None]:
# Maximum, minimum and mean of the observed Unemployment values in the test copy.
test_dataset['Unemployment'].max(), test_dataset['Unemployment'].min(), test_dataset['Unemployment'].mean()

### CPI and Unemployment rate plot

In [None]:
# Distinct years present in the training and test tables.
print(train.Year.unique(), test.Year.unique())

In [None]:
# Mean CPI per week number in the training data, one line per year.
plt.figure(figsize=(20, 10))
for year in (2010, 2011, 2012):
    cpi_by_week = train.loc[train['Year'] == year].groupby('Week')['CPI'].mean()
    # x/y are passed by keyword: recent seaborn releases reject positional data vectors.
    sns.lineplot(x=cpi_by_week.index, y=cpi_by_week.values, label=str(year))

plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend()
plt.show()

In [None]:
# Mean CPI per week number in the test data, one line per year.
plt.figure(figsize=(20, 10))
for year in (2012, 2013):
    cpi_by_week = test.loc[test['Year'] == year].groupby('Week')['CPI'].mean()
    # x/y are passed by keyword: recent seaborn releases reject positional data vectors.
    sns.lineplot(x=cpi_by_week.index, y=cpi_by_week.values, label=str(year))

plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend()
plt.show()

In [None]:
# Mean Unemployment per week number in the training data, one line per year.
plt.figure(figsize=(20, 10))
for year in (2010, 2011, 2012):
    unemployment_by_week = train.loc[train['Year'] == year].groupby('Week')['Unemployment'].mean()
    # x/y are passed by keyword: recent seaborn releases reject positional data vectors.
    sns.lineplot(x=unemployment_by_week.index, y=unemployment_by_week.values, label=str(year))

plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend()
plt.show()

In [None]:
# Largest observed Unemployment value in the test table.
test['Unemployment'].max()

In [None]:
# Mean Unemployment per week number in the test data, one line per year.
plt.figure(figsize=(20, 10))
for year in (2012, 2013):
    unemployment_by_week = test.loc[test['Year'] == year].groupby('Week')['Unemployment'].mean()
    # x/y are passed by keyword: recent seaborn releases reject positional data vectors.
    sns.lineplot(x=unemployment_by_week.index, y=unemployment_by_week.values, label=str(year))

plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend()
plt.show()

The training data shows the Unemployment rate tending to decrease towards the end of the year (probably explained by temporary employment for the Christmas season), and the missing values in the test dataset fall between the middle and the end of 2013, so it makes sense to fill them with the minimum unemployment rate. For CPI the opposite holds: it tends to increase as the year approaches its end, so its missing values will be filled with the maximum.

In [None]:
# Fill missing test values with the observed extremes: lowest Unemployment, highest CPI.
lowest_unemployment = test_dataset['Unemployment'].min()
highest_cpi = test_dataset['CPI'].max()
test_dataset['Unemployment'] = test_dataset['Unemployment'].fillna(lowest_unemployment)
test_dataset['CPI'] = test_dataset['CPI'].fillna(highest_cpi)

In [None]:
# Missing values per column after filling CPI and Unemployment.
test_dataset.isnull().sum()

In [None]:
# Impute the remaining missing numeric values with their column means.
# `numeric_only=True` keeps the mean well-defined if a non-numeric column
# (e.g. a datetime or string column) is still present in the frame.
test_dataset.fillna(test_dataset.mean(numeric_only=True), inplace=True)

## Definition of the Weighted Mean Absolute Error function 
It's the one that will be used to evaluate the selected model's performance.

In [None]:
def WMAE(dataset, real, predicted):
    """Return the weighted mean absolute error, rounded to 2 decimals.

    Rows whose ``IsHoliday`` value is truthy get weight 5, all others weight 1.

    Parameters
    ----------
    dataset : object exposing an ``IsHoliday`` column (e.g. a DataFrame)
        One flag per observation, in the same row order as ``real``.
    real, predicted : array-like
        True and predicted values, in the same row order as ``dataset``.

    The three inputs are matched by position, not by pandas index labels.
    Mixing a Series with one index and a frame with another would otherwise
    be label-aligned by pandas and silently yield NaN terms (and a wrong
    score) instead of an error.
    """
    is_holiday = np.asarray(dataset.IsHoliday).astype(bool)
    weights = np.where(is_holiday, 5, 1)
    abs_error = np.abs(np.asarray(real, dtype=float) - np.asarray(predicted, dtype=float))
    return np.round(np.sum(weights * abs_error) / np.sum(weights), 2)

In [None]:
# Model inputs (all engineered features) and the regression target.
feature_columns = [
    'Store', 'Dept', 'Year', 'Month', 'Week', 'Day', 'IsHoliday',
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Type', 'Size',
]
X = train_dataset[feature_columns]
y = train_dataset['Weekly_Sales']

# Model Selection
Random Forest Regressor and XGBoost Regressor are commonly selected for forecasting tasks because of their state-of-the-art performance, so they will be evaluated as baseline models.

In [None]:
# Baseline regressors: both are seeded (random_state=0) and use n_jobs=-1.
# NOTE(review): the 'reg:tweedie' objective presumably relies on the target being
# non-negative (negative Weekly_Sales rows were dropped earlier) — confirm.
xgb_model = xgb.XGBRegressor(random_state = 0, objective = 'reg:tweedie', n_jobs=-1)
rf_model = RandomForestRegressor(random_state = 0, n_jobs=-1)

In [None]:
# Display name -> estimator, in the order the baselines are evaluated.
models = {}
models['XGBoost'] = xgb_model
models['Random Forest'] = rf_model

In [None]:
# Accumulates one metrics dict per baseline run (appended to by baseline_model_evaluation).
model_stats = []

In [None]:
# 70/30 hold-out split; the fixed seed makes the split (and everything fitted on it)
# reproducible between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Fit every baseline model on the training part of the hold-out split.
# NOTE(review): baseline_model_evaluation below fits each model again on its own
# split, so this fit looks redundant — confirm before removing.
for name, model in models.items():
    model.fit(X_train, y_train)

In [None]:
def baseline_model_evaluation(name, model, X, y, test_size=0.3, random_state=0):
    """Fit ``model`` on a hold-out split of ``X``/``y`` and record RMSE and R2.

    Parameters
    ----------
    name : str
        Label stored in the ``Model`` column of the results.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : feature table and target passed to ``train_test_split``.
    test_size : float, default 0.3
        Fraction of the rows held out for scoring.
    random_state : int or None, default 0
        Seed for the split. A fixed seed means every model is scored on the
        same hold-out rows, so their metrics are directly comparable.

    Returns
    -------
    pandas.DataFrame
        All entries accumulated so far in the module-level ``model_stats`` list,
        including the one appended by this call.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Model fit
    model.fit(X_train, y_train)
    # Model predict
    predicted = model.predict(X_test)
    # Metrics
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test, predicted)), 3)
    r2 = metrics.r2_score(y_test, predicted)

    model_stats.append({'Model': name, 'RMSE': rmse, 'R2': r2})
    print(f'Model: {name} | RMSE: {rmse} | R2 {r2}')

    return pd.DataFrame(model_stats)

In [None]:
# Score each baseline; results accumulate in `model_stats` and are printed per model.
for model_name in models:
    baseline_model_evaluation(name=model_name, model=models[model_name], X=X, y=y)

### Feature importance plot

In [None]:
# Rank the Random Forest's feature importances and plot the ten largest.
importance_df_rf = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_,
})
importance_df_rf = importance_df_rf.sort_values('importance', ascending=False)
top_ten = importance_df_rf.head(10)
plt.figure(figsize=(12, 6))
plt.title('Feature Importance')
sns.barplot(x='importance', y='feature', data=top_ten);

In [None]:
# Keep the ten most important features, plus IsHoliday, which WMAE needs for its weights.
columns = list(importance_df_rf.feature.head(10).values)
# Append only when missing: a duplicated 'IsHoliday' column would make
# `X.IsHoliday` a two-column frame and break the WMAE weight computation.
if 'IsHoliday' not in columns:
    columns.append('IsHoliday')
columns

In [None]:
# Rebuild the model inputs from the selected feature subset; the target is unchanged.
X = train_dataset.loc[:, columns]
y = train_dataset.loc[:, 'Weekly_Sales']

#### Since the Random Forest baseline outperformed XGBoost, it is the model that will be optimized.

# Model Optimization

A helper function was created to test different hyperparameter combinations for the Random Forest Regressor and record the WMAE of each one, so that the combination with the lowest WMAE can be selected. This manual search is used because GridSearchCV and RandomizedSearchCV could not be used for hyperparameter tuning here.

In [None]:
# Accumulates one metrics dict per hyperparameter trial (appended to by random_forest_evaluation).
result = []

In [None]:
def random_forest_evaluation(n_estimators: int, max_depth: int, max_features: int, min_samples_split: int, min_samples_leaf: int, random_state: int = 0):
    """Train one Random Forest configuration on a hold-out split and log its metrics.

    Uses the module-level ``X`` and ``y``; ``X`` must contain an ``IsHoliday``
    column because ``WMAE`` reads its weights from the hold-out features.

    Parameters
    ----------
    n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf : int
        Forwarded unchanged to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed for both the train/test split and the forest. Seeding the split
        means every configuration is scored on the same hold-out rows, so
        differences in WMAE reflect the hyperparameters rather than split noise.

    Returns
    -------
    pandas.DataFrame
        All trials accumulated so far in the module-level ``result`` list,
        including the one appended by this call.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    RF = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        n_jobs=-1,
        random_state=random_state,
    )
    RF.fit(X_train, y_train)
    predicted = RF.predict(X_test)

    # Competition metric first, then the standard regression metrics.
    error = WMAE(X_test, y_test, predicted)
    mae = metrics.mean_absolute_error(y_test, predicted)
    mse = metrics.mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, predicted)
    print(f'N_estimators: {n_estimators} | Max_Depth {max_depth} | Max_Features {max_features} | Min_Samples_Leaf {min_samples_leaf} | Min_Samples_Split {min_samples_split} | WMAE: {error} | RMSE: {rmse} | R2: {r2}' )
    result.append({'N_estimators': n_estimators,'Max_Depth': max_depth, 'Max_Features': max_features, 'Min_Samples_Leaf': min_samples_leaf, 'Min_Samples_Split': min_samples_split, 'WMAE': error, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2})
    return pd.DataFrame(result)

In [None]:
# Random search: 200 trials over n_estimators, max_depth and max_features.
# A dedicated seeded generator makes the sampled configurations reproducible
# without touching the global `random` state.
rng = random.Random(0)
n_features = len(X.columns)
for _ in range(200):
    random_forest_evaluation(
        n_estimators=rng.choice(range(20, 200, 10)),
        max_depth=rng.choice(range(20, 60, 2)),
        max_features=rng.choice(range(7, n_features)),
        min_samples_split=2,
        min_samples_leaf=2,
    )

In [None]:
# Tabulate every trial and show them from best (lowest WMAE) to worst.
results = pd.DataFrame(data=result)
results.sort_values('WMAE')

In [None]:
# Competition test features, restricted to the same columns the final model is trained on.
X_test = test_dataset[columns]

In [None]:
# Refit the best configuration (lowest WMAE) on all training rows and write the submission.
best_params = results.sort_values(by='WMAE', ascending=True).head(1)
# Take the single row as a Series: calling int() on a one-element Series is deprecated.
best_row = best_params.iloc[0]
RF = RandomForestRegressor(
    n_estimators=int(best_row['N_estimators']),
    max_depth=int(best_row['Max_Depth']),
    max_features=int(best_row['Max_Features']),
    min_samples_leaf=int(best_row['Min_Samples_Leaf']),
    min_samples_split=int(best_row['Min_Samples_Split']),
    n_jobs=-1,  # same parallelism setting as the other forests in this notebook
    random_state=0,
)
RF.fit(X, y)
y_predict = RF.predict(X_test)
# Predictions are assigned by position, so the test rows must still be in the
# same order as the rows of the sample submission.
sample_sub['Weekly_Sales'] = y_predict
sample_sub.to_csv('submission_4.csv', index=False)