# Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import mlflow
import mlflow.sklearn

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# from pandas / scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Initiating Logging

In [3]:
import logging
# Route INFO-and-above records from the root logger to modellingSteps.log
# (a relative path, so the file lands in the current working directory).
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers — confirm the log file is actually being written in this kernel.
logging.basicConfig(filename='modellingSteps.log', level=logging.INFO)
logging.info('This log file records the steps for modelling for this project')

In [4]:
def log(text:str):
    """Record one modelling step on the root logger at INFO level.

    Args:
        text: Message describing the step being performed.

    Returns:
        None.
    """
    logging.log(logging.INFO, text)

# Importing Scripts

In [5]:
import sys, os 
# Make the sibling ../scripts directory importable. The relative path is
# resolved against the current working directory, not the notebook file.
sys.path.append(os.path.abspath(os.path.join('../scripts/')))
log('Import utility scripts')
# NOTE(review): the wildcard import hides which names data_cleaner provides;
# presumably it supplies Clean_Data, used in the next cell — verify.
from data_cleaner import *

In [6]:
# Helper object used below for read_data() and to_datetime().
# presumably Clean_Data comes from the data_cleaner wildcard import — verify.
util = Clean_Data()

Utility Functions Imported!!!


# Importing Data

In [7]:
log('Import Train Dataset to db')

In [8]:
# Load the training CSV through the project helper and preview two rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index written out with the CSV — confirm whether it should be read as
# the index instead of a data column.
db = util.read_data('../data/trainData.csv')
db.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,Day
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0,0,7,31
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0,0,7,30


In [9]:
log('Import Test Dataset to test_db')

In [10]:
# Load the test CSV through the same helper and preview two rows.
# NOTE(review): unlike the training file, the preview has an 'Id' column and
# no 'Sales' or 'Customers' columns; it also carries 'Unnamed: 0'.
test_db = util.read_data('../data/testData.csv')
test_db.head(2)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0,0
1,857,1,3,2015-09-16,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0,0


# Data Preprocessing:

In [11]:
log('Commence Data Processing:')

In [12]:
log('Convert Date columns in db and test_db to Datetime type')
# The return values are discarded, so this relies on the helper converting the
# column in place — presumably it does, since `.dt` is used on 'Date' right
# after; verify against data_cleaner.
util.to_datetime(test_db,'Date','%Y-%m-%d')
util.to_datetime(db,'Date','%Y-%m-%d')

In [13]:
def get_date_features(df,col):
    """Expand a datetime column into calendar parts, modifying ``df`` in place.

    Writes ``Year``, ``Month`` and ``Day`` columns (overwriting any that
    already exist) from ``df[col]`` and then removes ``col`` itself.

    Args:
        df: DataFrame to modify.
        col: Name of a datetime-typed column in ``df``.

    Returns:
        None. The frame passed in is mutated.
    """
    for label, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
        df[label] = getattr(df[col].dt, part)
    df.drop(columns=[col], inplace=True)

In [14]:
log('Get date features from date column in db and test_db')
# Both frames are modified in place: Year/Month/Day are written and 'Date' is
# removed, so re-running this cell without reloading the data raises KeyError
# on 'Date'.
get_date_features(test_db,'Date')
get_date_features(db,'Date')
test_db.head(2)

Unnamed: 0,Id,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day
0,1,1,4,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0,0,2015,9,17
1,857,1,3,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0,0,2015,9,16


# Extracting feature columns:

In [15]:
log('Extract feature columns from db to train_db:')

In [17]:
log('Drop Columns:"Unnamed: 0,Customers,Sales,PromoInterval,CompetitionOpenSinceYear,CompetitionOpenSinceMonth"')
# Build the feature frame from db; db itself is left untouched so the Sales
# target can still be read from it afterwards.
#   * Sales is the target and Customers is not present in the test file, so
#     neither can be a feature.
#   * 'Unnamed: 0' is the row counter saved with the CSV (0, 1, ... in the
#     preview). As a feature it only encodes file order, so it is removed too.
#     errors='ignore' keeps the cell working if the CSV is re-exported
#     without that column.
# 'Date' was already removed by get_date_features, so it is not listed here.
train_db = (
    db.drop(columns=['Customers', 'Sales', 'PromoInterval',
                     'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [18]:
train_db.head(2)

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Promo2SinceWeek,Promo2SinceYear,Month,Day,Year
0,1,5,1,1,0,1,c,a,1270.0,0,0.0,0,7,31,2015
1,1,4,1,1,0,1,c,a,1270.0,0,0.0,0,7,30,2015


In [19]:
log('Drop columns:"Id,PromoInterval,CompetitionOpenSinceYear,CompetitionOpenSinceMonth" in test_db:')
# Remove the Id column plus the promo-interval / competition-since columns.
# NOTE(review): `axis=1` is redundant when `columns=` is given, and the
# 'Unnamed: 0' row counter is still present in the result.
test_db = test_db.drop(columns=['Id','PromoInterval',
                                'CompetitionOpenSinceYear',
                                'CompetitionOpenSinceMonth'], axis=1)

In [20]:
test_db.head(2)

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day
0,1,4,1.0,1,0,0,c,a,1270.0,0,0.0,0,2015,9,17
1,1,3,1.0,1,0,0,c,a,1270.0,0,0.0,0,2015,9,16


In [21]:
log('Extract Sales column from db into y for target variable')
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = db[['Sales']]
y.head(2)

Unnamed: 0,Sales
0,5263
1,5020


# Encoding categorical data

In [22]:
log('Encode Categorical variables in train_db and test_db:')

In [23]:
def encode(df,columns):
    """One-hot encode the listed columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to replace with indicator columns.

    Returns:
        A new DataFrame in which every column in ``columns`` has been
        replaced by ``<column>_<value>`` indicator columns.
    """
    return pd.get_dummies(data=df, columns=columns)

In [24]:
log('Encode Columns:"StoreType,Assortment,StateHoliday" in train_db.')
# Replace the three categorical feature columns with indicator columns.
# Re-running this cell on the already-encoded frame fails because the source
# columns no longer exist.
train_db = encode(train_db,['StoreType','Assortment','StateHoliday'])

In [25]:
log('Encode Columns:"StoreType,Assortment,StateHoliday" in test_db and align columns to train_db.')
# Encode the same three categoricals as the training frame. Leaving
# StateHoliday raw here would give test_db a different set of columns from the
# one the scaler and model are fitted on.
# StateHoliday is cast to str first so that a mix of 0 and '0' values cannot
# produce two indicator columns with the same name.
test_db = encode(test_db.astype({'StateHoliday': str}),
                 ['StoreType', 'Assortment', 'StateHoliday'])
# Force exactly the training layout: a category level missing from the test
# file becomes an all-zero column, any column the training frame does not have
# is dropped, and the column order matches train_db.
test_db = test_db.reindex(columns=train_db.columns, fill_value=0)

# Scaling

In [27]:
log('Start Feature Scaling:')
log('Import StandardScaler from sklearn.preprocessing')
from sklearn.preprocessing import StandardScaler
# Scaler for the feature matrix; created unfitted here.
sc_X = StandardScaler()

In [28]:
log('Scale train_db into X')
# Fits the scaler on every row of train_db and returns the standardised array.
# NOTE(review): fitting on the full frame before any train/test split lets
# hold-out rows influence the scaling statistics.
X = sc_X.fit_transform(train_db)

# Split data into train and test set

In [29]:
log('Split data into train and test set:')
log('Import train_test_split from sklearn.model_selection')
log('Split train_db into X_train, X_test and y into Y_train and Y_test in ratio 80:20')
log('Fit StandardScaler on X_train only, then scale X_train and X_test')
from sklearn.model_selection import train_test_split
# Split the unscaled feature frame, then fit the scaler on the training rows
# only, so the hold-out rows contribute nothing to the mean/variance used for
# scaling. The row partition depends only on the row count and random_state.
X_train, X_test, Y_train, Y_test = train_test_split(train_db, y, test_size=0.2, random_state=0)
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# NOTE(review): rows are shuffled across dates; for a sales-forecasting task a
# chronological hold-out may be the more honest evaluation — confirm intent.

# Train model

In [30]:
log('Train model:')
log('Import Linear Regression from sklearn.linear_model')
log('Create instance of Linear Regression in regressor')
from sklearn.linear_model import LinearRegression
# Linear regression with scikit-learn's default settings (no arguments given).
regressor = LinearRegression()

In [31]:
log('Fit regressor to X_train,Y_train')
# Y_train comes from the one-column DataFrame y, so the target is 2-D here.
regressor.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
log('Predict Y_pred from X_test using regressor.predict')
# Predictions for the hold-out rows produced by the split above.
Y_pred = regressor.predict(X_test)

# Check Model Accuracy

In [33]:
log('Check Model Accuracy:')

In [34]:
log('Import rmse,mae,rs2 libraries from sklearn.metrics')
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [35]:
log('Compute rmse,mae,rs2 values')

In [36]:
def eval_metrics(actual,pred):
    """
    Score predictions against the true values.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values, aligned with ``actual``.

    Returns:
        Tuple ``(rmse, mae, r2)``: root mean squared error, mean absolute
        error and coefficient of determination.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [43]:
# Evaluate once and unpack, instead of recomputing all three metrics for each
# placeholder. The printed text is identical to the previous three-call form.
rmse, mae, r2 = eval_metrics(Y_test, Y_pred)
print(f'Sales prediction model: \n \t RMSE:{rmse} \n \t MAE:{mae} \n \t R2:{r2}')

Sales prediction model: 
 	 RMSE:2518.74539978359 
 	 MAE:1756.551981485628 
 	 R2:0.5696415441328841
