### Importing the necessary libraries

In [2]:
import os
import warnings

# Suppress all warnings for the rest of the session (set before the heavier imports run).
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold,train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the CSV paths read later live under this mount point.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


Next, we import the data.

In [3]:
# Locations of the pre-processed train / test CSVs on the mounted Drive.
train_csv_path = '/content/gdrive/My Drive/kaggle/rossmann-store-sales/store_train.csv'
test_csv_path = '/content/gdrive/My Drive/kaggle/rossmann-store-sales/store_test.csv'

processed_train_store = pd.read_csv(train_csv_path)
processed_test_store = pd.read_csv(test_csv_path)

Let's have a look at what our data looks like.

In [4]:
# Preview the first five rows of the training frame.
processed_train_store.head(n=5)

Unnamed: 0,Store,Date,day,month,year,DayOfWeek,WeekOfYear,Sales,Customers,Open,...,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,SalePerCustomer
0,1,2015-07-31,31,7,2015,5,31,5263,555,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.482883
1,1,2015-07-30,30,7,2015,4,31,5020,546,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.194139
2,1,2015-07-29,29,7,2015,3,31,4782,523,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.143403
3,1,2015-07-28,28,7,2015,2,31,5011,560,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,8.948214
4,1,2015-07-27,27,7,2015,1,31,6102,612,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.970588


In [5]:
# Preview the first two rows of the test frame.
processed_test_store.head(n=2)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,day,month,year,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,o,0,17,9,2015,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0
1,857,1,3,2015-09-16,1.0,1,o,0,16,9,2015,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0


In [6]:
# Report (rows, columns) for both frames.
train_shape = processed_train_store.shape
test_shape = processed_test_store.shape
print("Shape of processed_train_store data is: ", train_shape)
print("Shape of processed_test_store data is: ", test_shape)

('Shape of processed_train_store data is: ', (844338, 23))
('Shape of processed_test_store data is: ', (41088, 20))


In [7]:
# One-column table listing the dtype of every training column.
train_dtypes = processed_train_store.dtypes
processed_train_store_dtypes = train_dtypes.to_frame(name="Data_type")
processed_train_store_dtypes

Unnamed: 0,Data_type
Store,int64
Date,object
day,int64
month,int64
year,int64
DayOfWeek,int64
WeekOfYear,int64
Sales,int64
Customers,int64
Open,int64


In [8]:
# Count the distinct (non-null) values of every training column.
# A single vectorised `DataFrame.nunique()` call replaces the row-by-row `.loc`
# enlargement of an empty frame: it yields an integer-typed column and avoids
# leaking a loop variable into the notebook namespace.
unique_values = processed_train_store.nunique().to_frame(name='Unique Values')

In [None]:
# Display the per-column distinct-value counts held in `unique_values`.
unique_values

Unnamed: 0,Unique Values
Store,1115
Date,942
day,31
month,12
year,3
DayOfWeek,7
WeekOfYear,52
Sales,21733
Customers,4083
Open,1


Since our data is already preprocessed, the next step is to extract additional features from the `Date` variable that will be useful in our modelling.
* `Processed_train_store`
Since the day, month and year have already been extracted for this dataframe, we will just extract the week, the quarter and the season.

In [9]:
# Parse the 'Date' strings (year-month-day) into datetime values, overwriting the column.
iso_date_format = "%Y-%m-%d"
train_date_strings = processed_train_store["Date"]
processed_train_store["Date"] = pd.to_datetime(train_date_strings, format=iso_date_format)

In [10]:
# We consider four seasons, keyed by calendar month:
#   Spring runs from March 1 to May 31;
#   Summer runs from June 1 to August 31;
#   Fall (autumn) runs from September 1 to November 30; and
#   Winter runs from December 1 to February 28 (February 29 in a leap year).
season_by_month = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
}

# Extracting. Every feature is derived from the parsed 'Date' column, so this cell
# no longer depends on whether the lowercase 'month' column has been renamed yet
# and can be re-run safely.
train_dates = processed_train_store["Date"].dt

# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0. Prefer the ISO
# calendar accessor when it exists and cast to int64 so the column keeps the dtype
# `.dt.week` used to produce (assumes 'Date' has no missing values - TODO confirm).
if hasattr(train_dates, "isocalendar"):
    processed_train_store["Week"] = train_dates.isocalendar().week.astype("int64")
else:  # pandas < 1.1
    processed_train_store["Week"] = train_dates.week
processed_train_store["Quarter"] = train_dates.quarter

# Months missing from the lookup (e.g. a missing date) fall back to the string "None",
# matching the innermost default of the nested np.where chain this replaces.
processed_train_store["Season"] = train_dates.month.map(season_by_month).fillna("None")

In [11]:
# Capitalise the date-part column names, modifying the training frame in place.
train_column_renames = dict(year="Year", day="Day", month="Month")
processed_train_store.rename(columns=train_column_renames, inplace=True)

* `Processed_test_store`
Since the day, month and year have already been extracted for this dataframe, we will just extract the week, the quarter and the season.

In [12]:
# Capitalise the date-part column names, modifying the test frame in place.
test_column_renames = dict(year="Year", day="Day", month="Month")
processed_test_store.rename(columns=test_column_renames, inplace=True)

In [13]:
# Parse the 'Date' strings (year-month-day) into datetime values, overwriting the column.
iso_date_format = "%Y-%m-%d"
test_date_strings = processed_test_store["Date"]
processed_test_store["Date"] = pd.to_datetime(test_date_strings, format=iso_date_format)

In [14]:
# We consider four seasons, keyed by calendar month:
#   Spring runs from March 1 to May 31;
#   Summer runs from June 1 to August 31;
#   Fall (autumn) runs from September 1 to November 30; and
#   Winter runs from December 1 to February 28 (February 29 in a leap year).
season_by_month = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
}

# Extracting. Week, quarter and season are all derived from the parsed 'Date' column,
# so the cell has a single input and does not depend on the renamed 'Month' column.
test_dates = processed_test_store["Date"].dt

# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0. Prefer the ISO
# calendar accessor when it exists and cast to int64 so the column keeps the dtype
# `.dt.week` used to produce (assumes 'Date' has no missing values - TODO confirm).
if hasattr(test_dates, "isocalendar"):
    processed_test_store["Week"] = test_dates.isocalendar().week.astype("int64")
else:  # pandas < 1.1
    processed_test_store["Week"] = test_dates.week
processed_test_store["Quarter"] = test_dates.quarter

# Months missing from the lookup (e.g. a missing date) fall back to the string "None",
# matching the innermost default of the nested np.where chain this replaces.
processed_test_store["Season"] = test_dates.month.map(season_by_month).fillna("None")

In [15]:
# How many test rows fall in each calendar quarter?
test_quarters = processed_test_store["Quarter"]
test_quarters.value_counts()

3    41088
Name: Quarter, dtype: int64

In [16]:
# How many training rows fall in each calendar quarter?
train_quarters = processed_train_store["Quarter"]
train_quarters.value_counts()

1    252549
2    244396
3    192308
4    155085
Name: Quarter, dtype: int64

In [17]:
# Names of all object-dtype columns in the training frame.
object_typed_frame = processed_train_store.select_dtypes(include=["object"])
categorical_cols = list(object_typed_frame.columns)
categorical_cols

['StateHoliday', 'StoreType', 'Assortment', 'Season']

In [18]:
# Names of all int64 / float64 columns in the training frame.
numeric_typed_frame = processed_train_store.select_dtypes(include=["int64", 'float64'])
num_cols = list(numeric_typed_frame.columns)
num_cols

['Store',
 'Day',
 'Month',
 'Year',
 'DayOfWeek',
 'WeekOfYear',
 'Sales',
 'Customers',
 'Open',
 'Promo',
 'SchoolHoliday',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval',
 'SalePerCustomer',
 'Week',
 'Quarter']

In [19]:
# Name of the column to be predicted, kept as a one-element list.
target = ["Sales"]

In [20]:
# Columns passed through the numeric branch of the preprocessor.
# NOTE(review): 'Customers' does not appear among the test frame's columns shown
# earlier in this notebook, so a model using it could not score that frame - confirm intent.
numeric_columns = [
    "Customers",
    "Open",
    "Promo",
    "Promo2",
    "SchoolHoliday",
    "CompetitionDistance",
]

# Columns treated as categories by the preprocessor (the integer-coded calendar
# fields are included here rather than in the numeric list).
categorical_columns = [
    "DayOfWeek",
    "Quarter",
    "Month",
    "Year",
    "StoreType",
    "Assortment",
    "Season",
    "StateHoliday",
]

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: missing values are replaced with a constant fill value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode; categories not seen during fitting are ignored instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch and bundle the result.
column_branches = [
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [22]:
# Separate the 'Sales' column (what we predict) from every other column.
targets = processed_train_store["Sales"]
features = processed_train_store.drop(columns=["Sales"])

In [23]:
# Keep 80% of the rows for training and hold out the rest; the fixed seed makes the split repeatable.
split_kwargs = dict(train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(features, targets, **split_kwargs)


In [24]:
# Random forest with 100 trees and a fixed seed for reproducibility.
# n_jobs=-1 builds the trees on all available cores; with random_state fixed the
# fitted forest is the same, it just trains faster on this large training set.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)

In [25]:
# Chain the column preprocessing and the random forest into one estimator,
# so both are fitted and applied together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('rf_model', rf_model),
]
my_pipeline = Pipeline(steps=pipeline_steps)


In [None]:
# Fit the preprocessing steps and the forest on the training split.
my_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Run the held-out features through the fitted pipeline (preprocessing + forest).
# The original cell referenced `X_test_test`, a name that is never defined; the
# split cell produces `X_test`, which is also what `y_test` is paired with.
preds = my_pipeline.predict(X_test)


In [None]:
# Evaluate the model: mean absolute error between held-out sales and predictions.
# `mean_absolute_error` comes from sklearn.metrics and is imported in the notebook's
# imports cell; previously it was never imported, so this cell raised NameError.
score = mean_absolute_error(y_test, preds)
print('MAE:', score)