In [1]:
import numpy as np 
import pandas as pd
import streamlit as st

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys, os 
sys.path.append(os.path.abspath(os.path.join('../scripts/')))
# Import utility scripts
from data_cleaner import *

In [4]:
util = Clean_Data()

Utility Functions Imported!!!


In [5]:
def get_date_features(df,col):
    df[col] = pd.to_datetime(df[col])
    df['Year'] = df[col].dt.year
    df['Month'] = df[col].dt.month
    df['Day'] = df[col].dt.day
    df['WeekOfYear'] = df[col].dt.weekofyear
    df.drop(columns=[col], axis=1,inplace=True)

In [6]:
store_db = util.read_data('../data/store.csv')
train_db = util.read_data('../data/train.csv')

In [7]:
db = pd.merge(left=train_db,right=store_db,on='Store',how='inner')

In [10]:
# Process data for dashboard
train = db[(db.Open != 0) & (db.Sales>0)]
def process(df):
    
    #Replacing null values for CompetitionOpenSinceMonth,CompetitionOpenSinceYear
    util.fill_null('CompetitionOpenSinceMonth',df,df['CompetitionOpenSinceMonth'].mean())
    util.fill_null('CompetitionOpenSinceYear',df,df['CompetitionOpenSinceYear'].mean())
    util.fill_null('CompetitionDistance',df,df['CompetitionDistance'].mean())
    df.fillna(0, inplace=True)
    
    #label encode categorical_features  
    mapping = {'0':0, 'a':1, 'b':2, 'c':3}
    df.StateHoliday.replace(mapping, inplace=True)
    df.StoreType.replace(mapping, inplace=True)
    df.Assortment.replace(mapping, inplace=True)
    
    #Get date features  
    get_date_features(df, 'Date')
    
    #Calculate competitor open time in months
    df['CompetitionOpenMonths'] = 12 * (df.Year - df.CompetitionOpenSinceYear) + \
    (df.Month - df.CompetitionOpenSinceMonth)
    df['CompetitionOpenMonths'] = df['CompetitionOpenMonths'].apply(lambda x: x if x > 0 else 0)

    # calculate promo2 open time in months
    df['Promo2OpenMonths'] = 12 * (df.Year - df.Promo2SinceYear) + \
        (df.WeekOfYear - df.Promo2SinceWeek) / 4.0
    df['Promo2OpenMonths'] = df['Promo2OpenMonths'].apply(lambda x: x if x > 0 else 0)
    
    #Check if month in promo2 month
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    df['month_str'] = df.Month.map(month2str)
    def check(row):
        if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
        else:
            return 0
        
    df['IsPromoMonth'] =  df.apply(lambda row: check(row),axis=1)  
    # select the features we need
    features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpenMonths', 'Promo2OpenMonths', 'IsPromoMonth']
    X = df[features]
    y = df[['Sales']]
    
    return X,y
    
X,y = process(train)


In [26]:
y.reset_index().drop(columns=['index'], inplace =True)
X.reset_index().drop(columns=['index'], inplace =True)

# Splitting to test and train sets

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Preprocessor

In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [29]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.to_list()

In [30]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant', fill_value=0))
      ,('scaler', StandardScaler())
])


In [31]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
]) 


# Estimator

In [32]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',RandomForestRegressor())
           ])

In [None]:
rf_model = pipeline.fit(X_train, y_train)
print (rf_model)

# Model Accuracy

MSE is chosen as loss function because the dataset contains outliers that are important to the business

In [96]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
predictions = rf_model.predict(X_test)
print (f'r2_score : {r2_score(y_test, predictions)}\n\
RMSE:{np.sqrt(mean_squared_error(y_test, predictions))}\
 ')



r2_score : 0.9447550063827611
RMSE:691.2508413990057 


# Feature Importance

In [57]:
rf_model.steps[1][1].feature_importances_


array([0.1577928 , 0.04638751, 0.54124672, 0.13206146, 0.00538943,
       0.0106833 , 0.01273608, 0.01301683, 0.00942466, 0.0052261 ,
       0.03068109, 0.03535402])