In [1]:
import pandas as pd
import numpy as np


# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import os, pickle

In [3]:
#model evaluation
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [4]:
# Folder holding every input CSV. Change this one constant to run the notebook
# outside of the original Colab / Google Drive mount.
DATA_DIR = '/content/drive/MyDrive/timeSeriesAnalysis'

# Target frames.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

# Auxiliary tables that are joined onto `train` further down.
stores = pd.read_csv(os.path.join(DATA_DIR, 'stores.csv'))
holidays_events = pd.read_csv(os.path.join(DATA_DIR, 'holidays_events.csv'))
oil = pd.read_csv(os.path.join(DATA_DIR, 'oil.csv'))
transaction = pd.read_csv(os.path.join(DATA_DIR, 'transactions.csv'))

# Date Transformation

In [5]:
# Parse the raw date strings so the column can be used as a merge key / for .dt access.
train["date"] = pd.to_datetime(train["date"])

In [6]:
# Same datetime conversion for the test frame.
test["date"] = pd.to_datetime(test["date"])

In [7]:
# Convert the transactions date column to datetime64.
transaction['date'] = pd.to_datetime(transaction['date'])

In [8]:
# Convert the oil-price date column to datetime64.
oil['date'] = pd.to_datetime(oil['date'])


In [9]:
# Convert the holidays/events date column to datetime64.
holidays_events['date'] = pd.to_datetime(holidays_events['date'])

# Merging of data sets

In [10]:
# Attach the per-store, per-day transaction counts to the sales rows.
# Inner join (the pd.merge default): rows without a matching transaction record are dropped.
merged_data = train.merge(transaction, how='inner', on=['date', 'store_nbr'])

In [11]:
# Attach holiday/event information by date.
# NOTE(review): this is an inner join, so only dates present in `holidays_events`
# are kept -- confirm that discarding all other dates is intended.
merged_data2 = merged_data.merge(holidays_events, how='inner', on='date')

In [12]:
# Attach the daily oil price by date.
# NOTE(review): inner join -- dates with no row in `oil` are dropped; confirm intended.
merged_data3 = merged_data2.merge(oil, how='inner', on='date')

In [13]:
# Attach store metadata by store number (inner join, the pd.merge default).
merged_data4 = merged_data3.merge(stores, how='inner', on='store_nbr')

In [14]:
# Both the holidays and the stores table contribute a `type` column, which the
# merges suffixed with _x / _y; give the two columns descriptive names.
new_merged_data = merged_data4.rename(
    {"type_x": "holiday_type", "type_y": "store_type"},
    axis="columns",
)

In [15]:
# Persist the merged frame as Parquet, then load it back and give the
# oil-price column a readable name.
new_merged_data.to_parquet('train_data.parquet')
train1 = pd.read_parquet('train_data.parquet').rename(
    columns={'dcoilwtico': 'oil_price'}
)

In [16]:
# Continue working under the `new_merged_data` name (same object as `train1`)
# and make sure the date column is datetime64 after the Parquet round-trip.
new_merged_data = train1
new_merged_data['date'] = pd.to_datetime(new_merged_data['date'])

In [17]:
# Columns that are not used as model features.
columns_to_drop = ['id', 'locale', 'locale_name', 'state', 'description', 'transferred', 'store_type']

# `DataFrame.drop(..., inplace=True)` returns None, so the previous
# `final_data = ...drop(..., inplace=True)` always bound `final_data` to None.
# Drop without `inplace` and keep both names pointing at the reduced frame,
# because the following cells keep working on `new_merged_data`.
final_data = new_merged_data.drop(columns=columns_to_drop)
new_merged_data = final_data

In [18]:
# Derive calendar features from the date column.
date_parts = new_merged_data['date'].dt
new_merged_data['year'] = date_parts.year
new_merged_data['month'] = date_parts.month
new_merged_data['dayofmonth'] = date_parts.day
new_merged_data['dayofweek'] = date_parts.dayofweek




In [19]:
# Move the date column into the index (in place, so the frame no longer has a `date` column).
new_merged_data.set_index(keys='date', inplace=True)

In [20]:
# Preview the first five rows of the prepared frame.
new_merged_data.head(5)

Unnamed: 0_level_0,store_nbr,family,sales,onpromotion,transactions,holiday_type,oil_price,city,cluster,year,month,dayofmonth,dayofweek
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,25,AUTOMOTIVE,0.0,0,770,Holiday,,Salinas,1,2013,1,1,1
2013-01-01,25,BABY CARE,0.0,0,770,Holiday,,Salinas,1,2013,1,1,1
2013-01-01,25,BEAUTY,2.0,0,770,Holiday,,Salinas,1,2013,1,1,1
2013-01-01,25,BEVERAGES,810.0,0,770,Holiday,,Salinas,1,2013,1,1,1
2013-01-01,25,BOOKS,0.0,0,770,Holiday,,Salinas,1,2013,1,1,1


In [21]:
# Separate the target (`sales`) from the feature columns.
Y = new_merged_data['sales']
X = new_merged_data.drop(columns=['sales'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): rows are indexed by date and this split is random, so the
# evaluation set mixes past and future dates -- confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

A pipeline of transformers to handle missing values, numeric features and categorical features.

In [22]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored at transform time.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 --
# confirm against the installed version before upgrading.
categorical_transformer = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
)

A pipeline of column transformers to handle transformation on column basis

In [23]:
# Column groups routed to each transformer.
categorical_feature = ["family", "city", "holiday_type"]
numeric_feature = [
    'store_nbr', 'onpromotion', 'transactions', 'oil_price', 'cluster',
    'year', 'month', 'dayofmonth', 'dayofweek',
]

# Apply each transformer to its column group; any column not listed is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', numeric_transformer, numeric_feature),
        ('categorical_transformer', categorical_transformer, categorical_feature),
    ],
    remainder='drop',
)

### Linear Regression Model

In [24]:
# Ordinary least-squares baseline, fed by the shared preprocessing step.
l_regression = LinearRegression()
l_regression_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', l_regression),
])

In [25]:
# Fit on the training split and predict the held-out split.
LRg_train = l_regression_pipeline.fit(X_train, y_train)
# Absolute values are taken because the log-based metrics below reject negative inputs.
# NOTE(review): abs() turns a prediction of -5 into +5; clipping at 0 may be what is wanted -- confirm.
LR_predicted = np.abs(LRg_train.predict(X_test))

# Evaluation: MSE / MSLE and their square roots (rounded to 2 decimals).
y_test_abs = y_test.abs()
ms_error = mean_squared_error(y_test, LR_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, LR_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)




In [26]:
# Start the model-comparison table with the linear-regression scores.
results = pd.DataFrame({
    'Model': ['Linear Regression'],
    'MSE': [ms_error],
    'MSLE': [m_squared_log_error],
    'RMSE': [root_ms_error],
    'RMSLE': [mslog_error],
})
results

Unnamed: 0,Model,MSE,MSLE,RMSE,RMSLE
0,Linear Regression,742672.604941,9.024186,861.78,3.0


In [None]:
#print(ms_error,m_squared_log_error,root_ms_error,mslog_error)

In [None]:
#to get the list of parameters that can be adjusted
#l_regression_pipeline.get_params()

In [None]:
# hyperparameter tuning with GridSearch cv
#create a dictionary of tuning parameters { ‘tuning parameter’ : ‘possible value’, … }


### Decision Tree Regressor Model

In [27]:
# Decision tree regressor behind the shared preprocessing step.
# A fixed random_state (42, the same seed the other tree-based models in this
# notebook use) makes the fitted tree, and therefore its row in the results
# table, reproducible across runs.
D_tree = DecisionTreeRegressor(random_state=42)
D_tree_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('estimator', D_tree)])

# Absolute-valued targets, used as the reference for the log-based metrics.
y_test_abs = abs(y_test)

In [28]:
# Fit the decision-tree pipeline, then predict the held-out split.
# Predictions are made non-negative for the log-based metrics computed next.
D_tree_train = D_tree_pipeline.fit(X_train, y_train)
D_tree_predicted = np.abs(D_tree_train.predict(X_test))




In [29]:
# Decision-tree scores: MSE / MSLE first, then their rounded square roots.
ms_error = mean_squared_error(y_test, D_tree_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, D_tree_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)


In [30]:
# One-row frame with the decision-tree scores.
model_results = pd.DataFrame(
    [['Decision Tree', ms_error, m_squared_log_error, root_ms_error, mslog_error]],
    columns=['Model', 'MSE', 'MSLE', 'RMSE', 'RMSLE'],
)
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
# produces the same table and works on every pandas version.
results = pd.concat([results, model_results], ignore_index=True)

  results = results.append(model_results, ignore_index = True)


In [None]:
#D_tree_pipeline.get_params()

### XGB Regressor Model

In [31]:
# XGBoost regressor (100 boosting rounds) behind the shared preprocessing step.
xgboost = XGBRegressor(n_estimators=100)
xgboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', xgboost),
])

In [32]:
# Fit the XGBoost pipeline, then predict the held-out split.
# Predictions are made non-negative for the log-based metrics computed next.
xgboos_train = xgboost_pipeline.fit(X_train, y_train)
xgboos_predicted = np.abs(xgboos_train.predict(X_test))




In [33]:
# XGBoost scores: MSE / MSLE first, then their rounded square roots.
ms_error = mean_squared_error(y_test, xgboos_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, xgboos_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)

In [34]:
# One-row frame with the XGBoost scores.
model_results = pd.DataFrame(
    [['XGBoost', ms_error, m_squared_log_error, root_ms_error, mslog_error]],
    columns=['Model', 'MSE', 'MSLE', 'RMSE', 'RMSLE'],
)
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
# produces the same table and works on every pandas version.
results = pd.concat([results, model_results], ignore_index=True)

  results = results.append(model_results, ignore_index = True)


In [None]:
#xgboost_pipeline.get_params()

### Random Forest Regressor Model

In [35]:
# Random forest (20 trees, fixed seed) behind the shared preprocessing step.
R_forest = RandomForestRegressor(n_estimators=20, random_state=42)
R_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', R_forest),
])

In [36]:
# Fit the random-forest pipeline, then predict the held-out split.
# Predictions are made non-negative for the log-based metrics computed next.
R_forest_train = R_forest_pipeline.fit(X_train, y_train)
R_forest_predicted = np.abs(R_forest_train.predict(X_test))



In [37]:
# Random-forest scores: MSE / MSLE first, then their rounded square roots.
ms_error = mean_squared_error(y_test, R_forest_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, R_forest_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)

In [38]:

# One-row frame with the random-forest scores.
model_results = pd.DataFrame(
    [['Random Forest', ms_error, m_squared_log_error, root_ms_error, mslog_error]],
    columns=['Model', 'MSE', 'MSLE', 'RMSE', 'RMSLE'],
)
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
# produces the same table and works on every pandas version.
results = pd.concat([results, model_results], ignore_index=True)

  results = results.append(model_results, ignore_index = True)


In [None]:
#R_forest_pipeline.get_params()

### Gradient Boosting Regressor Model

In [39]:
# Gradient boosting (100 stages, fixed seed) behind the shared preprocessing step.
Gboost = GradientBoostingRegressor(n_estimators=100, random_state=42)
Gboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', Gboost),
])

In [40]:
# Fit the gradient-boosting pipeline and predict the held-out split.
Gboost_train = Gboost_pipeline.fit(X_train, y_train)
Gboost_predicted = Gboost_train.predict(X_test)
# Predictions are made non-negative for the log-based metrics computed next.
Gboost_predicted = abs(Gboost_predicted)

# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed; the previous bare open() was never closed.
with open('gboost_model.pkl', 'wb') as model_file:
    pickle.dump(Gboost_train, model_file)



In [None]:
# Gradient-boosting scores: MSE / MSLE first, then their rounded square roots.
ms_error = mean_squared_error(y_test, Gboost_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, Gboost_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)

In [None]:
# One-row frame with the gradient-boosting scores.
model_results = pd.DataFrame(
    [['Gradient Boost', ms_error, m_squared_log_error, root_ms_error, mslog_error]],
    columns=['Model', 'MSE', 'MSLE', 'RMSE', 'RMSLE'],
)
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
# produces the same table and works on every pandas version.
results = pd.concat([results, model_results], ignore_index=True)

  results = results.append(model_results, ignore_index = True)


In [None]:
#Gboost_pipeline.get_params()

### Extra Trees Regressor Model

In [None]:
# Extra-trees ensemble (100 trees, fixed seed) behind the shared preprocessing step.
xtree = ExtraTreesRegressor(n_estimators=100, random_state=42)
xtree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', xtree),
])

In [None]:
# Fit the extra-trees pipeline, then predict the held-out split.
# Predictions are made non-negative for the log-based metrics computed next.
xtree_train = xtree_pipeline.fit(X_train, y_train)
xtree_predicted = np.abs(xtree_train.predict(X_test))



In [None]:
# Extra-trees scores: MSE / MSLE first, then their rounded square roots.
ms_error = mean_squared_error(y_test, xtree_predicted)
m_squared_log_error = mean_squared_log_error(y_test_abs, xtree_predicted)
root_ms_error = np.sqrt(ms_error).round(2)
mslog_error = np.sqrt(m_squared_log_error).round(2)

In [None]:
# One-row frame with the extra-trees scores.
model_results = pd.DataFrame(
    [['Extral Tree', ms_error, m_squared_log_error, root_ms_error, mslog_error]],
    columns=['Model', 'MSE', 'MSLE', 'RMSE', 'RMSLE'],
)
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
# produces the same table and works on every pandas version.
results = pd.concat([results, model_results], ignore_index=True)
results

  results = results.append(model_results, ignore_index = True)


Unnamed: 0,Model,MSE,MSLE,RMSE,RMSLE
0,Linear Regression,742672.604941,9.024186,861.78,3.0
1,Decision Tree,400394.721847,0.332794,632.77,0.58
2,XGBoost,282005.564196,2.327639,531.04,1.53
3,Random Forest,270107.768847,0.230528,519.72,0.48
4,Gradient Boost,422376.826555,4.259683,649.91,2.06
5,Extral Tree,253894.537162,0.234198,503.88,0.48


In [None]:
#xtree_pipeline.get_params()

In [None]:
import joblib

# Save the fitted extra-trees pipeline to Google Drive; joblib.dump returns the
# list of files it wrote, which is what the cell displays.
XTREE_MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/xtree_train.pkl'
joblib.dump(xtree_train, XTREE_MODEL_PATH)


['/content/drive/MyDrive/Colab Notebooks/xtree_train.pkl']

### Hyperparameter Tuning
The best model in the evaluation above is ExtraTreesRegressor (lowest MSE and RMSE, with an RMSLE tied with Random Forest), so we tune its hyperparameters with GridSearchCV.

In [None]:
param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [2,8,16,32,50]

   }

In [None]:
grid = GridSearchCV( xtree_train, param_grid, refit = True, verbose = 3,n_jobs=-1)

#grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)
grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))
