# Modeling and forecasting consumer activity in retail

Group: J41322c - Stebenkov Andrey (email: a.stebenkov75@yandex.ru, tg: @FGksjp67)

This notebook is a baseline solution for the Kaggle competition "Rossmann Store Sales".

<a id="top"></a>
<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#FF206E; border:0' role="tab" aria-controls="home"><center>Table of contents</center></h3>


* **Data preparation**

* **Models:**

    * Linear Regression
    * Random Forest
    * LightGBM
    * XGBoost
    * CatBoost


In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import itertools

import os
# Seed for reproducible random operations (used by the train/validation split).
RANDOM_STATE = 1518

# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

In [None]:
def str_to_date(date):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    the expected format.
    """
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return parsed.date()

In [None]:
# save preds from the model to csv
def make_submission(model, name, test_df=None):
    """Write a submission CSV with the model's test-set predictions.

    Parameters
    ----------
    model : object exposing ``predict``
        Expected to have been fitted on log-transformed sales: its
        predictions are mapped back to the sales scale with ``np.exp``.
    name : str
        Output file stem; the file is written to ``"<name>.csv"``.
    test_df : pandas.DataFrame, optional
        Prepared test features, including the ``Id`` column. Defaults to
        the notebook-level ``df_test_store``.

    Returns
    -------
    None
        The only effect is the CSV file (columns ``Id`` and ``Sales``).
    """
    if test_df is None:
        # Looked up at call time so the globally prepared test frame is used.
        test_df = df_test_store

    # `Id` only labels the submission rows; it is not a model feature.
    # The keyword form is required because pandas 2.x no longer accepts the
    # axis as a second positional argument of `drop`.
    pred = model.predict(test_df.drop(columns=['Id']))

    submission = pd.DataFrame({'Id': test_df['Id'],
                               'Sales': np.exp(pred)})
    submission.to_csv("{}.csv".format(name), index=False)

## Data preparation

The provided data is in three files: train.csv, store.csv, test.csv. 

We start with train.csv and store.csv.

In [None]:
# The Date column is ISO formatted (YYYY-MM-DD), so pandas' built-in parser
# handles it directly. The deprecated `date_parser` callback is not needed and
# would fall back to a slow per-row strptime call.
df_train = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/train.csv',
    sep=',',
    parse_dates=['Date'],
    low_memory=False,
)

In [None]:
# Store-level attributes; joined to the sales rows on the Store column later.
df_store = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/store.csv', low_memory=False
)

We begin by dropping the rows of closed stores and the rows with zero sales;

In [None]:
# Remove every zero-sales row, whether the store was closed (Open == 0) or
# open (Open == 1) on that day, then renumber the index from 0.
df_train = (
    df_train
    .loc[~(df_train['Sales'].eq(0) & df_train['Open'].isin([0, 1]))]
    .reset_index(drop=True)
)

The Date feature is decomposed into Year, Month and Day and then removed from the dataset;

In [None]:
# Replace the Date column with its calendar components (Day, Month, Year).
df_train = df_train.assign(
    Day=df_train['Date'].dt.day,
    Month=df_train['Date'].dt.month,
    Year=df_train['Date'].dt.year,
).drop(columns='Date')

Next, the missing values in store.csv are filled. CompetitionDistance is filled with its median value; the other features are filled with 0;

In [None]:
# Fill the gaps with one frame-level call. `df[col].fillna(..., inplace=True)`
# acts on a temporary column object and no longer updates the parent frame
# under pandas Copy-on-Write, so the result is assigned back explicitly.
# CompetitionDistance gets its median; every other column gets 0.
df_store = df_store.fillna({
    'CompetitionDistance': df_store['CompetitionDistance'].median(),
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 0,
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 0,
    'PromoInterval': 0,
})

The store information and the training data are merged into one dataframe;

In [None]:
# Left join: keep every training row and attach its store's attributes.
df = df_train.merge(df_store, on='Store', how='left')

df.head()

Change the type of these features from object to category;

In [None]:
# Cast the four label-valued columns to the pandas category dtype.
df = df.astype({
    'StateHoliday': 'category',
    'Assortment': 'category',
    'StoreType': 'category',
    'PromoInterval': 'category',
})

CompetitionOpenSinceYear and CompetitionOpenSinceMonth describe the same thing (when the competitor opened), so they are merged into one variable, CompetitionOpenSince;

In [None]:
# Months elapsed between the competitor's opening and the sale date:
# 12 * (year difference) + (month difference). The year difference must be
# parenthesised; `12 * Year - SinceYear` multiplies only the sale year and
# produces values in the tens of thousands.
# Rows where both opening fields are 0 (the fill value for missing) get 0.
df['CompetitionOpenSince'] = np.where(
    (df['CompetitionOpenSinceMonth'] == 0) & (df['CompetitionOpenSinceYear'] == 0),
    0,
    12 * (df['Year'] - df['CompetitionOpenSinceYear'])
    + (df['Month'] - df['CompetitionOpenSinceMonth']),
)

del df['CompetitionOpenSinceYear']
del df['CompetitionOpenSinceMonth']

The StateHoliday feature is converted into a binary variable: the values {a, b, c} become 1, everything else becomes 0;

In [None]:
# Collapse the holiday codes into a flag ('0' -> 0; 'a', 'b', 'c' -> 1).
# `pop` removes StateHoliday from the frame and returns it for mapping.
df['is_holiday_state'] = df.pop('StateHoliday').map(
    {'0': 0, 'a': 1, 'b': 1, 'c': 1}
)

The features Assortment, StoreType and PromoInterval are one-hot encoded with the get_dummies function;

In [None]:
# One-hot encode the categorical columns, giving each its own prefix.
df = pd.get_dummies(
    df,
    columns=['Assortment', 'StoreType', 'PromoInterval'],
    prefix={
        'Assortment': 'is_Assortment',
        'StoreType': 'is_StoreType',
        'PromoInterval': 'is_PromoInterval',
    },
)

In [None]:
# Make column names identifier-like: every non-alphanumeric character
# becomes an underscore.
df.columns = [
    "".join(ch if ch.isalnum() else "_" for ch in str(label))
    for label in df.columns
]

Finally, the same steps are applied to test.csv.

In [None]:
# The Date column is ISO formatted (YYYY-MM-DD), so pandas' built-in parser
# handles it directly. The deprecated `date_parser` callback is not needed and
# would fall back to a slow per-row strptime call.
df_test = pd.read_csv(
    '/kaggle/input/rossmann-store-sales/test.csv',
    sep=',',
    parse_dates=['Date'],
    low_memory=False,
)

In [None]:
# Strip every character outside [A-Za-z0-9_] from the column names.
# The vectorised string accessor is used because the `re` module is never
# imported in this notebook, so calling `re.sub` raised NameError.
df_test.columns = df_test.columns.str.replace(r'[^A-Za-z0-9_]+', '', regex=True)

In [None]:
# Treat rows with a missing Open flag as open (1). The result is assigned back
# because an in-place fillna on a column selection does not reliably update
# the parent frame under pandas Copy-on-Write.
df_test['Open'] = df_test['Open'].fillna(1)

In [None]:
# Replace the Date column with its calendar components (Day, Month, Year).
df_test = df_test.assign(
    Day=df_test['Date'].dt.day,
    Month=df_test['Date'].dt.month,
    Year=df_test['Date'].dt.year,
).drop(columns='Date')

In [None]:
# Left join: keep every test row and attach its store's attributes.
df_test_store = df_test.merge(df_store, on='Store', how='left')

In [None]:
# Cast the four label-valued columns to the pandas category dtype.
df_test_store = df_test_store.astype({
    'StateHoliday': 'category',
    'Assortment': 'category',
    'StoreType': 'category',
    'PromoInterval': 'category',
})

In [None]:
# Months elapsed between the competitor's opening and the sale date:
# 12 * (year difference) + (month difference). The year difference must be
# parenthesised; `12 * Year - SinceYear` multiplies only the sale year and
# produces values in the tens of thousands.
# Rows where both opening fields are 0 (the fill value for missing) get 0.
df_test_store['CompetitionOpenSince'] = np.where(
    (df_test_store['CompetitionOpenSinceMonth'] == 0) & (df_test_store['CompetitionOpenSinceYear'] == 0),
    0,
    12 * (df_test_store['Year'] - df_test_store['CompetitionOpenSinceYear'])
    + (df_test_store['Month'] - df_test_store['CompetitionOpenSinceMonth']),
)

del df_test_store['CompetitionOpenSinceYear']
del df_test_store['CompetitionOpenSinceMonth']

In [None]:
# Collapse the holiday codes into a flag ('0' -> 0; 'a', 'b', 'c' -> 1).
# `pop` removes StateHoliday from the frame and returns it for mapping.
df_test_store['is_holiday_state'] = df_test_store.pop('StateHoliday').map(
    {'0': 0, 'a': 1, 'b': 1, 'c': 1}
)

In [None]:
# One-hot encode the categorical columns, giving each its own prefix.
df_test_store = pd.get_dummies(
    df_test_store,
    columns=['Assortment', 'StoreType', 'PromoInterval'],
    prefix={
        'Assortment': 'is_Assortment',
        'StoreType': 'is_StoreType',
        'PromoInterval': 'is_PromoInterval',
    },
)

In [None]:
# Store the holiday flag as a plain 64-bit integer column.
df_test_store = df_test_store.astype({'is_holiday_state': 'int64'})

In [None]:
# Make column names identifier-like: every non-alphanumeric character
# becomes an underscore.
df_test_store.columns = [
    "".join(ch if ch.isalnum() else "_" for ch in str(label))
    for label in df_test_store.columns
]

Split the data into features and target;

In [None]:
# Model inputs: everything except the target (Sales) and Customers.
features = df.drop(columns=['Customers', 'Sales'])
# The models are trained on the natural log of sales.
targets = np.log(df['Sales'])

Finally, split the data into training and validation parts (80% / 20%).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

And don't forget the evaluation metric.

In [None]:
def rmspe(true, pred, log_space=True):
    """Root mean square percentage error (RMSPE).

    Computes ``sqrt(mean(((true - pred) / true) ** 2))`` over the entries
    whose true value is non-zero. The previous body returned the plain RMSE
    of its arguments, which is not a percentage error.

    Parameters
    ----------
    true, pred : array-like
        Ground-truth and predicted values of equal length.
    log_space : bool, default True
        When True, both inputs are treated as natural-log values and mapped
        back with ``np.exp`` before scoring. This matches the notebook,
        whose targets are ``np.log(Sales)``. Pass False for values that are
        already on the original scale.

    Returns
    -------
    float
        The RMSPE, or ``nan`` when no true value is non-zero.
    """
    actual = np.asarray(true, dtype=float)
    predicted = np.asarray(pred, dtype=float)
    if log_space:
        actual, predicted = np.exp(actual), np.exp(predicted)

    # A percentage error is undefined where the true value is zero.
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    pct_err = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return float(np.sqrt(np.mean(pct_err ** 2)))

## Models

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline with default settings, fitted on log-sales.
lin_reg = LinearRegression()

lin_reg.fit(X=X_train, y=y_train)

In [None]:
# Predict log(Sales) for the validation rows with the linear model.
y_pred = lin_reg.predict(X_val)

In [None]:
# Validation error of the linear model; both arguments are log(Sales) values.
rmspe(y_val, y_pred)

In [None]:
# Predict the test set with the linear model and write linreg.csv.
make_submission(lin_reg, 'linreg')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest with the notebook-wide RANDOM_STATE. Without it, the
# bootstrap samples and feature choices differ on every run, so the
# validation score and the submission file are not reproducible.
rfr = RandomForestRegressor(random_state=RANDOM_STATE)

rfr.fit(X_train, y_train)

In [None]:
# Predict log(Sales) for the validation rows with the random forest.
y_pred = rfr.predict(X_val)

In [None]:
# Validation error of the random forest; both arguments are log(Sales) values.
rmspe(y_val, y_pred)

In [None]:
# Predict the test set with the random forest and write rand_forest.csv.
make_submission(rfr, 'rand_forest')

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

# Import the estimator class directly. The earlier form bound the module to
# the name `lgbm` and then overwrote it with the model, so running this cell
# a second time failed (the model object has no `LGBMRegressor` attribute).
lgbm = LGBMRegressor()

lgbm.fit(X_train, y_train)

In [None]:
# Predict log(Sales) for the validation rows with the LightGBM model.
y_pred = lgbm.predict(X_val)

In [None]:
# Validation error of the LightGBM model; both arguments are log(Sales) values.
rmspe(y_val, y_pred)

In [None]:
# Predict the test set with the LightGBM model and write lgbm.csv.
make_submission(lgbm, 'lgbm')

### XGBoost

In [None]:
import xgboost as xgb

# 'reg:squarederror' is the current name of the squared-error objective;
# the old alias 'reg:linear' is deprecated in XGBoost.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')

xg_reg.fit(X_train, y_train)

In [None]:
# Predict log(Sales) for the validation rows with the XGBoost model.
y_pred = xg_reg.predict(X_val)

In [None]:
# Validation error of the XGBoost model; both arguments are log(Sales) values.
rmspe(y_val, y_pred)

In [None]:
# Predict the test set with the XGBoost model and write xgboost.csv.
make_submission(xg_reg, 'xgboost')

### Catboost

In [None]:
from catboost import CatBoostRegressor

# Gradient-boosted trees with CatBoost's default settings, fitted on log-sales.
cbr = CatBoostRegressor()

cbr.fit(X=X_train, y=y_train)

In [None]:
# Predict log(Sales) for the validation rows with the CatBoost model.
y_pred = cbr.predict(X_val)

In [None]:
# Validation error of the CatBoost model; both arguments are log(Sales) values.
rmspe(y_val, y_pred)

In [None]:
# Predict the test set with the CatBoost model and write catboost.csv.
make_submission(cbr, 'catboost')