In [1]:
import numpy as np 
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys, os 
sys.path.append(os.path.abspath(os.path.join('../scripts/')))
# Import utility scripts
from data_cleaner import *

In [4]:
util = Clean_Data()

Utility Functions Imported!!!


In [5]:
def get_date_features(df,col):
    df['Year'] = df[col].dt.year
    df['Month'] = df[col].dt.month
    df['Day'] = df[col].dt.day
    df.drop(columns=[col], axis=1,inplace=True)

In [6]:
store_db = util.read_data('../data/store.csv')
train_db = util.read_data('../data/train.csv')

In [31]:
db = pd.merge(left=train_db,right=store_db,on='Store',how='inner').head(3000)

In [38]:
y = db[['Sales']]

In [67]:
X = db.drop(columns=['Open','CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Sales','Customers','PromoInterval'])

In [68]:
util.to_datetime(X,'Date','%Y-%m-%d')

In [69]:
get_date_features(X,'Date')

# Splitting to test and train sets

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Preprocessor

In [20]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [77]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_features = X.select_dtypes(include=['object']).columns.to_list()

In [80]:
numeric_features


['Store',
 'DayOfWeek',
 'Promo',
 'SchoolHoliday',
 'CompetitionDistance',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'Year',
 'Month',
 'Day']

In [72]:
def encode(df,columns):
    df = pd.get_dummies(df,columns=columns)
    return df

In [78]:
X = encode(X,categorical_features)

In [81]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant', fill_value=0))
      ,('scaler', StandardScaler())
])


In [82]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
]) 


# Estimator

In [83]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',RandomForestRegressor())
           ])

In [84]:
rf_model = pipeline.fit(X_train, y_train)
print (rf_model)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                          

# Model Accuracy

MSE is chosen as loss function because the dataset contains outliers that are important to the business

In [85]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
predictions = rf_model.predict(X_test)
print (f'r2_score : {r2_score(y_test, predictions)}\n\
RMSE:{np.sqrt(mean_squared_error(y_test, predictions))}\
 ')



r2_score : 0.811216185092343
RMSE:1277.8256103363635 


# Feature Importance

In [57]:
rf_model.steps[1][1].feature_importances_


array([0.1577928 , 0.04638751, 0.54124672, 0.13206146, 0.00538943,
       0.0106833 , 0.01273608, 0.01301683, 0.00942466, 0.0052261 ,
       0.03068109, 0.03535402])