<br><br><br><br><br><br><br><br><br><br>

<h1>A COMPARISON OF<br>ENSEMBLE LEARNING METHODS<br>IN RETAIL SALES FORECASTING</h1>
<b>by Serhan SÜER</b>

<br><br><br><br><br>

<h1>1. Introduction</h1>

Importing libraries:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import time

from math import sqrt
import statistics
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import category_encoders as ce

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
from sklearn.ensemble import VotingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)

Reading the data files:

In [2]:
train = pd.read_csv("data/train.csv", sep=',', header=0, 
                    names=['store_id', 'department_id', 'date', 'weekly_sales', 'is_holiday'])

features = pd.read_csv("data/features.csv", sep=',', header=0, 
                       names=['store_id', 'date', 'temperature', 'fuel_price', 'markdown_1', 'markdown_2', 
                              'markdown_3', 'markdown_4', 'markdown_5', 'cpi', 'unemployment', 'is_holiday'])

stores = pd.read_csv("data/stores.csv", sep=',', header=0, 
                     names=['store_id', 'type', 'store_size'])

In [None]:
print(train.shape)
print(features.shape)
print(stores.shape)

In [None]:
train.head()

In [None]:
features.head()

In [None]:
stores.head()

Merging all data files:

In [3]:
# Attach the store-level weekly features and the static store attributes to
# every sales record. Left joins keep each row of `train` even when a lookup
# table has no matching entry.
data = (
    train
    .merge(features, on=['store_id', 'date', 'is_holiday'], how='left')
    .merge(stores, on=['store_id'], how='left')
)
# The three source frames are no longer needed once merged.
del train, features, stores

Sorting data by date/store number/department number, moving date column to the beginning and weekly_sales column to the end:

In [4]:
# Order the rows chronologically (then by store and department) and renumber
# the index from zero.
data = data.sort_values(['date', 'store_id', 'department_id']).reset_index(drop=True)
# Put `date` first and the target `weekly_sales` last; every other column
# keeps its relative position.
data = data[['date', *data.columns.drop(['date', 'weekly_sales']), 'weekly_sales']]

In [None]:
data.shape

The merged data contains 421570 rows and 16 columns.

Looking at the first 5 rows of data:

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.apply(lambda x: [x.nunique()])

In [None]:
data.apply(lambda x: [x.unique()])

In [None]:
data.isna().mean()

<br>

***

<br>

<h1>2. Exploratory Data Analysis</h1><br>

<h2>2.1. Univariate Analysis</h2>

In [None]:
# Split the merged frame into categorical and numerical parts for the EDA.
# `cats` is modified in place by later cells (encoding of is_holiday / type /
# date), so take an explicit copy instead of assigning into a slice of `data`.
cats = data[['date', 'store_id', 'department_id', 'is_holiday', 'type']].copy()
# Everything that is not categorical is numerical; missing values are
# replaced with 0 for the descriptive statistics and plots.
nums = data.drop(cats.columns, axis=1).fillna(0)

<br>

<h3>- Categorical</h3>

In [None]:
pd.DataFrame({'Categorical Variables':cats.columns})

In [None]:
def analyze_cats(dataframe, column_name):
    """Print and plot a frequency summary of one categorical column.

    Prints the number of unique values and the sorted unique values, then
    displays a frequency table (value, count, relative frequency) together
    with a chart of the relative frequencies: a pie chart when the column
    has fewer than 5 unique values, otherwise a bar chart.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column to analyse.
    column_name : str
        Name of the column in `dataframe`.
    """
    column = dataframe[column_name]
    separator = '-' * 100

    print(separator + '\n' + 'Number of Unique Values:')
    print(str(column.nunique()) + '\n' + separator)
    print('Unique Values:')
    print(np.sort(column.unique()), '\n' + separator)

    # Count once and derive the shares from the same Series so that values,
    # counts and percentages are guaranteed to stay aligned.
    counts = column.value_counts()
    shares = counts / counts.sum()
    freq_table = pd.DataFrame({column_name.capitalize(): counts.index,
                               'Count': counts.values,
                               'Count%': shares.values})

    plt.figure(figsize=(18, 8))
    if len(column.unique()) < 5:
        # The slices are labelled from the index of `shares`. The previous
        # version passed `unique()` as labels, which is in first-appearance
        # order rather than frequency order and could mislabel the slices.
        axes = shares.plot(kind='pie', autopct='%1.1f%%', startangle=90)
    else:
        axes = shares.plot(kind='bar', legend=True)
    display(freq_table, axes)

<br>

<b>date</b>

In [None]:
analyze_cats(data, 'date')

The dataset consists of weekly sales of Walmart stores from 2010-02-05 to 2012-10-26.  
The data therefore covers a period of about 2 years and 9 months, or 143 weeks.

<br>

<b>store_id</b>

In [None]:
analyze_cats(data, 'store_id')

There are 45 Walmart stores in the dataset.

<br>

<b>department_id</b>

In [None]:
analyze_cats(data, 'department_id')

The dataset contains 81 different departments across the stores.  
Furthermore, while some departments exist in most of the stores, others exist in only a few.

<br>

<b>is_holiday</b>

In [None]:
analyze_cats(data, 'is_holiday')

The output shows that the "is_holiday" variable is highly imbalanced, since 93% of the weekly sales records do not fall in holiday weeks.

<br>

<b>type</b>

In [None]:
analyze_cats(data, 'type')

While roughly half of the weekly sales records belong to Type A stores, almost 40% of them belong to Type B stores, and Type C stores account for only about 10%.

<br>

<h3>- Numerical</h3>

In [None]:
pd.DataFrame({'Numerical Variables':nums.columns})

In [None]:
nums.describe()

In [None]:
pd.DataFrame({'Features':nums.var().index, 'Variance':nums.var().values}).sort_values('Variance')

In [None]:
pd.DataFrame({'Features':(nums.std() / nums.mean()).index, 
              'CV':(nums.std() / nums.mean()).values}).sort_values('CV', ascending=False)

In [None]:
pd.DataFrame({'Features':nums.skew().index, 'Skewness':nums.skew().values}).sort_values('Skewness')

In [None]:
pd.DataFrame({'Features':nums.kurtosis().index, 'Kurtosis':nums.kurtosis().values}).sort_values('Kurtosis')

In [None]:
for i in nums.columns:
    plt.figure(figsize=(12, 2))
    sns.distplot(nums[i])

In [None]:
nums.hist(figsize=(20, 15), bins=20, xlabelsize=9, ylabelsize=9);

In [None]:
for i in nums.columns:
    plt.figure(figsize=(12, 2))
    sns.boxplot(x=nums[i])

<br><br>

<h2>2.2. Bivariate Analysis</h2><br>

<h3>- Categorical & Numerical</h3>

In [None]:
cats['is_holiday'] = cats['is_holiday'].replace({False:0, True:1})

In [None]:
cats['type'] = cats['type'].replace({'A':3, 'B':2, 'C':1})

In [None]:
cats = cats.astype({'date':'category'})
cats['date'] = cats['date'].cat.codes

In [None]:
# Kruskal-Wallis H-test of weekly_sales across the levels of each
# categorical variable. The test expects one sample per group, so the sales
# are split by category level. Passing the category codes and the sales as
# two samples would only compare the codes' distribution with the sales'.
for column in cats.columns:
    samples = [group.values for _, group in nums['weekly_sales'].groupby(cats[column])]
    print(column)
    print(stats.kruskal(*samples))
    print('\n')

According to the Kruskal-Wallis test results, the distribution of weekly sales is not the same across the levels of the categorical variables.

<br>

<h3>- Numerical & Numerical</h3>

Correlation analysis based on the Spearman method will be used, since the numerical features are not normally distributed.

In [None]:
plt.figure(figsize=(12, 12))
sns.heatmap(round(abs(nums.corr(method ='spearman')), 2), vmin=0, vmax=1, 
            center=0.5, annot=True, cmap=plt.cm.Reds, square=True);

In [None]:
# Compute the absolute Spearman correlation matrix once; it was previously
# recomputed three times on the full data set.
nums_corr = round(abs(nums.corr(method='spearman')), 2)
# Keep only the strongly correlated pairs (0.7 < |rho| < 1.0) and drop the
# rows and columns that are left completely empty. A list-valued `axis` is
# not accepted by dropna in pandas >= 1.0, so each axis is dropped in turn.
nums_corr[(nums_corr > 0.7) & (nums_corr < 1.0)].dropna(how='all', axis=0).dropna(how='all', axis=1)

In [None]:
for i in nums.drop('weekly_sales', axis=1).columns:
    plt.figure(figsize=(12, 2))
    sns.scatterplot(x=i, y="weekly_sales", data=nums);

<br><br>

<h2>2.3. Multivariate Analysis</h2>

For Multivariate Analysis, ANCOVA will be used since data has both categorical and numerical features.

In [None]:
encoder = ce.BinaryEncoder(cols=['date', 'store_id', 'department_id', 'type'], drop_invariant=True)

cats = encoder.fit_transform(cats)

In [None]:
all_columns = " + ".join(pd.concat([cats, nums], axis=1).columns)[:-15]

In [None]:
# Build the formula "weekly_sales ~ x1 + x2 + ..." from every encoded
# categorical and numerical column except the response itself. Filtering by
# name replaces the fixed-length string slice, which only worked while the
# target was the last column and its name had exactly 12 characters.
formula = "weekly_sales ~ " + " + ".join(
    name for name in list(cats.columns) + list(nums.columns) if name != 'weekly_sales')

In [None]:
results = ols(formula, data=pd.concat([cats, nums], axis=1)).fit()

results.summary()

<br>

***

<br>

<h1>3. Methodology</h1><br>

<h2>3.1. Data Preprocessing</h2>

Firstly, variables will be split into two sets as "features" and "target" in order to implement preprocessing easily.

In [5]:
features = data.drop('weekly_sales', axis=1)
target = pd.DataFrame(data['weekly_sales'], columns=['weekly_sales'])

<br>

<h3>- Missing Value Treatment</h3>

In [None]:
features.isna().mean()

In [None]:
target.isna().mean()

In [6]:
features[['markdown_1','markdown_2','markdown_3','markdown_4', 'markdown_5']] = \
    features[['markdown_1','markdown_2','markdown_3','markdown_4', 'markdown_5']].fillna(0)

In [None]:
features.isna().sum()

<br>

<h3>- Outlier Treatment</h3>

Identifying the outliers in continuous variables based on IQR Score Method:

In [None]:
def find_outliers(data, column_list):
    """Print, per column, how many values lie outside the 3 * IQR fences.

    For every column name in `column_list` the first and third quartiles of
    `data[name]` are computed; a value counts as an outlier when it is below
    Q1 - 3 * IQR or above Q3 + 3 * IQR. One line "<name padded to 13> : <n>"
    is printed per column. Nothing is returned.
    """
    for name in column_list:
        series = data[name]
        lower_q, upper_q = series.quantile(0.25), series.quantile(0.75)
        spread = upper_q - lower_q
        low_fence = lower_q - 3 * spread
        high_fence = upper_q + 3 * spread
        flagged = series[(series < low_fence) | (series > high_fence)]
        # Pad the name so the counts line up in a column.
        print(name + ' ' * (13 - len(name)) + ': ' + str(len(flagged)))
        
find_outliers(features, features.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# Outliers among the strictly positive markdown values only (non-positive
# entries, including the zeros filled in for missing values, are masked to
# NaN). The `> 0` mask is applied to the markdown columns alone: at this
# point `features` still holds the string columns `date` and `type`, which
# cannot be meaningfully compared with 0.
markdown_cols = ['markdown_1', 'markdown_2', 'markdown_3', 'markdown_4', 'markdown_5']
markdowns = features[markdown_cols]
find_outliers(markdowns[markdowns > 0], markdown_cols)

In [None]:
find_outliers(target, ['weekly_sales'])

<br>

<h3>- Feature Engineering</h3>

In [7]:
# Derive the ISO week number (1-53) from the date and drop the raw date.
# `pd.to_datetime` replaces the unit-less 'datetime64' dtype string, and
# `.dt.isocalendar().week` replaces `.dt.week`; current pandas releases no
# longer accept the old spellings.
features['date'] = pd.to_datetime(features['date'])
# isocalendar() yields an unsigned nullable integer; cast it back to int64,
# the dtype `.dt.week` used to return.
features['week_number'] = features['date'].dt.isocalendar().week.astype('int64')
features = features.drop('date', axis=1)

<br>

<h3>- Label Encoding</h3>

In [8]:
le = LabelEncoder()
features['is_holiday'] = le.fit_transform(features['is_holiday'])
features['type'] = le.fit_transform(features['type'])

<br>

<h3>- Feature Selection</h3>

In [None]:
print(sorted(features[features['is_holiday'] == True]['week_number'].unique()))
print(sorted(features[features['is_holiday'] == False]['week_number'].unique()))

Since "week_number" already carries the information that the "is_holiday" variable provides, "is_holiday" will be dropped.

In [9]:
features = features.drop('is_holiday', axis=1)

<br>

<b>Based on Pairwise Correlation</b>

In [None]:
corr_data = round(abs(pd.concat([features, target], axis=1).corr(method ='spearman')), 2)

In [None]:
plt.figure(figsize=(12, 12))
sns.heatmap(corr_data, vmin=0, vmax=1, center=0.5, annot=True, cmap=plt.cm.Reds, square=True);

In [None]:
# Show only the strongly correlated pairs (0.7 < |rho| < 1.0), dropping rows
# and columns that end up completely empty. dropna no longer accepts a list
# of axes in pandas >= 1.0, so the two axes are handled one after the other.
corr_data[(corr_data > 0.7) & (corr_data < 1.0)].dropna(how='all', axis=0).dropna(how='all', axis=1)

In [None]:
pd.DataFrame({'Features':corr_data['weekly_sales'].sort_values(ascending=False).index, 
              'Corr. with Target':corr_data['weekly_sales'].sort_values(ascending=False).values}).drop(0, axis=0)

Since the "markdown_4" and "store_size" features have a higher correlation with the target, the other variables that are highly correlated with them will be dropped:

In [10]:
features = features.drop(['markdown_3', 'markdown_2', 'markdown_5', 'markdown_1', 'type'], axis=1)

<br>

<b>Based on Variance</b>

In [None]:
pd.DataFrame({'Features':features.var().index, 'Variance':features.var().values}).sort_values('Variance')

Since "fuel_price" has very low variation, it will be dropped.

In [11]:
features = features.drop('fuel_price', axis=1)

<br>

<b>Based on Feature Importance</b>

In [None]:
model = RandomForestRegressor(random_state=1)
model.fit(features, target)
feat_importances = pd.Series(model.feature_importances_, index=features.columns)
plt.figure(figsize=(12, 6))
feat_importances.sort_values().plot(kind='barh', grid=True)
plt.show()

In [None]:
pd.DataFrame({'Features':feat_importances.sort_values(ascending=False).index, 
              'Importances':feat_importances.sort_values(ascending=False).values})

According to feature importance output, 'markdown_4' will be dropped since it doesn't have a significant feature importance.

In [12]:
features = features.drop('markdown_4', axis=1)

<br>

<h3>- One Hot Encoding</h3>

Since the scope of this study contains only tree-based models, one hot encoding will not be applied.

<br>

<h3>- Feature Scaling</h3>

Since the scope of this study contains only tree-based models, feature scaling will not be applied.

<br>

<h3>- Train-Test Split</h3>

In [13]:
x_train_val, x_test, y_train_val, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=0)

In [None]:
print('x_train : {0:.{1}f}%'.format(x_train.shape[0] / features.shape[0] * 100, 0))
print('y_train : {0:.{1}f}%'.format(y_train.shape[0] / target.shape[0] * 100, 0))
print('\n')
print('x_val   : {0:.{1}f}%'.format(x_val.shape[0] / features.shape[0] * 100, 0))
print('y_val   : {0:.{1}f}%'.format(y_val.shape[0] / target.shape[0] * 100, 0))
print('\n')
print('x_test  : {0:.{1}f}%'.format(x_test.shape[0] / features.shape[0] * 100, 0))
print('y_test  : {0:.{1}f}%'.format(y_test.shape[0] / target.shape[0] * 100, 0))

<br><br>

<h2>3.2. Model Building</h2>

<b>A. Bootstrap Aggregation</b>  
--- Bagging (BaggingRegressor)  
--- Random Forest (RandomForestRegressor)  
--- Extremely Randomized Trees (ExtraTreesRegressor)

<b>B. Boosting</b>  
--- Adaptive Boosting (AdaBoostRegressor)  
--- Extreme Gradient Boosting (XGBoost)

<b>C. Stacked Generalization</b>  
--- Voting Regressor (VotingRegressor)  
<br>

In order to obtain a fair comparison, the maximum tree depth is initially set to the same value, 7, in all models.  
Different depth values will be tried in the hyperparameter tuning stage.

In [None]:
# Baseline ensembles to compare, keyed by the short label used in the result
# tables. All of them use 100 estimators, a maximum tree depth of 7 and
# random_state=1; the two boosting models additionally share
# learning_rate=0.1.
# NOTE(review): the `base_estimator` keyword of BaggingRegressor /
# AdaBoostRegressor is spelled `estimator` in newer scikit-learn releases;
# confirm against the installed version.
models = {'Bagging' : BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=7, random_state=1), 
                                       n_estimators=100, 
                                       n_jobs=-1, 
                                       random_state=1), 
          'Random F.' : RandomForestRegressor(n_estimators=100, 
                                              max_depth=7, 
                                              n_jobs=-1, 
                                              random_state=1),
          'Extra T.' : ExtraTreesRegressor(n_estimators=100, 
                                           max_depth=7, 
                                           n_jobs=-1, 
                                           random_state=1),
          'AdaBoost' : AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=7, random_state=1), 
                                         n_estimators=100, 
                                         learning_rate=0.1, 
                                         random_state=1),
          'XGBoost' : xgb.XGBRegressor(n_estimators=100, 
                                       learning_rate=0.1,
                                       max_depth=7, 
                                       n_jobs=-1, 
                                       random_state=1)}

In [None]:
# Fit every baseline model on the training split and score it on the
# validation split. The recorded time covers fitting, prediction and the
# metric computation of one model.
model_name = list(models.keys())
mae_scores, rmse_scores, r2_scores, times = [], [], [], []

for model in models.values():
    start = time.time()
    y_pred = model.fit(x_train, y_train).predict(x_val)
    mae_scores.append(mean_absolute_error(y_val, y_pred))
    rmse_scores.append(sqrt(mean_squared_error(y_val, y_pred)))
    r2_scores.append(r2_score(y_val, y_pred))
    end = time.time()
    times.append(end - start)

# One row per model: label, MAE, RMSE, R2 and runtime in seconds.
compare_list = list(zip(model_name, mae_scores, rmse_scores, r2_scores, times))
compare = pd.DataFrame(compare_list, columns=['Model', 'MAE', 'RMSE', 'R2', 'Time(sec)'])
compare

In the stacked generalization algorithm, XGBoost, AdaBoost and Random Forest will be used, since they have better scores than the other models.

In [None]:
# Combine the three best baseline models by averaging their predictions.
# The base learners are taken from `models` instead of being redefined by
# hand, so their hyperparameters cannot drift apart from the baselines.
# VotingRegressor fits clones of the estimators it is given, so the already
# fitted baseline instances are left untouched.
model = VotingRegressor(estimators=[('xg', models['XGBoost']),
                                    ('ad', models['AdaBoost']),
                                    ('rf', models['Random F.'])],
                        n_jobs=-1)

# Register the ensemble next to the baselines and score it the same way.
model_name.append('VotingReg')
models['VotingReg'] = model
start = time.time()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
mae_scores.append(mean_absolute_error(y_val, y_pred))
rmse_scores.append(sqrt(mean_squared_error(y_val, y_pred)))
r2_scores.append(r2_score(y_val, y_pred))
end = time.time()
times.append(end - start)

# Rebuild the comparison table, now including the voting ensemble.
compare_list = list(zip(model_name, mae_scores, rmse_scores, r2_scores, times))
compare = pd.DataFrame(compare_list, columns=['Model', 'MAE', 'RMSE', 'R2', 'Time(sec)'])
compare

In [None]:
compare.plot(kind='barh', 
             x='Model', 
             y=['MAE' , 'RMSE', 'R2'], 
             figsize=(14, 8), 
             logx=True,
             grid=True, 
             legend='reverse');

<br><br>

<h2>3.3. Model Evaluation</h2>

<h3>3.3.1. Performance Metrics</h3>

In [None]:
def take_second(x):
    """Return the element at index 1 of `x` (sort key for (name, score) pairs)."""
    second = x[1]
    return second

<br>

<b>Mean Absolute Error</b>

In [None]:
for i, j in sorted(zip(model_name, mae_scores), key=take_second, reverse=False):
    print(i + ' ' * (12 - len(i)) + ': {0:.{1}f}'.format(j, 2))

In [None]:
compare.sort_values('MAE', ascending=False).plot(kind='barh', 
                                                 x='Model', 
                                                 y='MAE', 
                                                 figsize=(12, 6), 
                                                 legend=False);

<br>

<b>Root Mean Squared Error</b>

In [None]:
for i, j in sorted(zip(model_name, rmse_scores), key=take_second, reverse=False):
    print(i + ' ' * (12 - len(i)) + ': {0:.{1}f}'.format(j, 2))

In [None]:
compare.sort_values('RMSE', ascending=False).plot(kind='barh', 
                                                  x='Model', 
                                                  y='RMSE', 
                                                  figsize=(12, 6),
                                                  legend=False);

<br>

<b>R-Squared</b>

In [None]:
for i, j in sorted(zip(model_name, r2_scores), key=take_second, reverse=True):
    print(i + ' ' * (12 - len(i)) + ': {0:.{1}f}'.format(j, 2))

In [None]:
compare.sort_values('R2').plot(kind='barh', 
                               x='Model', 
                               y='R2', 
                               figsize=(12, 6),
                               legend=False);

<br>

<b>Runtime</b>

In [None]:
for i, j in sorted(zip(model_name, times), key=take_second, reverse=False):
    print(i + ' ' * (12 - len(i)) + ': {0:.{1}f}'.format(j, 2))

In [None]:
compare.sort_values('Time(sec)', ascending=False).plot(kind='barh', 
                                                       x='Model', 
                                                       y='Time(sec)', 
                                                       figsize=(12, 6), 
                                                       legend=False);

<br><br>

<h3>3.3.2. Cross-Validation</h3>

In order to achieve an unbiased estimate of the model performance, 5-fold cross-validation will be used.

In [None]:
# 5-fold cross-validation of every model on the train+validation data.
# For each model the per-fold MAE, RMSE, R2 and runtime are averaged.
#
# The loop uses fold-local names only. It previously overwrote the global
# x_train / x_val / y_train / y_val split and the mae_scores / rmse_scores /
# r2_scores / times lists of the baseline comparison, so any cell executed
# afterwards silently worked with the last fold instead of the hold-out split.
all_mae_scores = []
all_rmse_scores = []
all_r2_scores = []
all_times = []

# One splitter is enough: KFold without shuffling yields the same folds for
# every model.
cv = KFold(n_splits=5)

for cv_model in models.values():
    fold_mae = []
    fold_rmse = []
    fold_r2 = []
    fold_times = []
    for train_index, test_index in cv.split(x_train_val.values):
        start = time.time()
        x_fold_train, x_fold_val = x_train_val.iloc[train_index], x_train_val.iloc[test_index]
        y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]
        cv_model.fit(x_fold_train, y_fold_train)
        fold_pred = cv_model.predict(x_fold_val)
        fold_mae.append(mean_absolute_error(y_fold_val, fold_pred))
        fold_rmse.append(sqrt(mean_squared_error(y_fold_val, fold_pred)))
        fold_r2.append(r2_score(y_fold_val, fold_pred))
        end = time.time()
        fold_times.append(end - start)
    all_mae_scores.append(sum(fold_mae) / len(fold_mae))
    all_rmse_scores.append(sum(fold_rmse) / len(fold_rmse))
    all_r2_scores.append(sum(fold_r2) / len(fold_r2))
    # Mean fold runtime, rounded to whole seconds.
    all_times.append(round(sum(fold_times) / len(fold_times)))

# Label the rows from `models` itself so names and scores share one order.
compare_list_cv = list(zip(list(models), all_mae_scores, all_rmse_scores, all_r2_scores, all_times))
compare_cv = pd.DataFrame(compare_list_cv, columns=['Model', 'MAE', 'RMSE', 'R2', 'Time(sec)'])
compare_cv.sort_values('MAE')

In [None]:
compare_cv.sort_values('MAE', ascending=False).plot(kind='barh', 
                                                    x='Model', 
                                                    y=['MAE' , 'RMSE', 'R2'], 
                                                    logx=True, 
                                                    legend='reverse',
                                                    figsize=(12, 6));

<br><br>

<h3>3.3.3. Hyperparameter Optimization</h3>

In [22]:
# Hyperparameter search space for XGBoost.
grid_param = {'n_estimators'     : list(range(100, 1001, 100)),
              'learning_rate'    : [0.01, 0.05, 0.1],
              'max_depth'        : list(range(3, 11)),
              'gamma'            : [0, 1, 5],
              'subsample'        : [0.8, 0.9, 1.0],
              'colsample_bytree' : [0.8, 0.9, 1.0]}

xgb_grid = xgb.XGBRegressor(n_jobs=-1,
                            random_state=1)

cv = KFold(n_splits=3)

# The full grid has 10 * 3 * 8 * 3 * 3 * 3 = 6480 candidates (19440 fits with
# 3 folds), which is why the exhaustive search had to be interrupted.
# A seeded random sample of the same space keeps the run tractable and
# reproducible; raise n_iter for a denser search.
grid = RandomizedSearchCV(estimator=xgb_grid,
                          param_distributions=grid_param,
                          n_iter=50,
                          scoring='neg_mean_absolute_error',
                          cv=cv,
                          n_jobs=-1,
                          random_state=1)

# The search cross-validates internally, so it is fitted on the combined
# train+validation data, like the other searches in this section.
grid.fit(x_train_val, y_train_val)

# The scorer is negated MAE; flip the sign to report the MAE itself.
print(grid.best_params_, -grid.best_score_)

KeyboardInterrupt: 

In [29]:
grid_param = {'n_estimators' : range(100, 1001, 100),
              'max_depth'    : range(3, 11)}

xgb_grid = xgb.XGBRegressor(learning_rate=0.3,
                            gamma=1,
                            subsample=0.8,
                            colsample_bytree=1,
                            n_jobs=-1, 
                            random_state=1)

cv = KFold(n_splits=5)

grid = GridSearchCV(estimator=xgb_grid,
                    param_grid=grid_param,
                    scoring='neg_mean_absolute_error',
                    cv=cv,
                    n_jobs=-1)

grid.fit(x_train_val, y_train_val)

print(grid.best_params_, -grid.best_score_)

{'max_depth': 10, 'n_estimators': 1000} 1452.0846638777512


In [21]:
start = time.time()
model = xgb.XGBRegressor(learning_rate=0.05, 
                         n_estimators=4800,
                         max_depth=10,
                         n_jobs=-1, 
                         random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Mean Absolute Error : 1321.26
R-Square            : 0.98
Runtime             : 795sec
----------------------------------------------------------------------------------------------------
Mean Absolute Error : 1316.63
R-Square            : 0.98
Runtime             : 833sec


In [16]:
start = time.time()
model = xgb.XGBRegressor(learning_rate=0.3, 
                         n_estimators=700,
                         max_depth=10,
                         n_jobs=-1, 
                         random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Mean Absolute Error : 1424.71
R-Square            : 0.98
Runtime             : 118sec
----------------------------------------------------------------------------------------------------
Mean Absolute Error : 1412.67
R-Square            : 0.98
Runtime             : 120sec


In [15]:
start = time.time()
model = xgb.XGBRegressor(learning_rate=0.3, 
                         n_estimators=700,
                         max_depth=10,
                         gamma=1,
                         subsample=0.8,
                         colsample_bytree=1,
                         n_jobs=-1, 
                         random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Mean Absolute Error : 1486.13
R-Square            : 0.98
Runtime             : 136sec
----------------------------------------------------------------------------------------------------
Mean Absolute Error : 1471.06
R-Square            : 0.98
Runtime             : 138sec


In [17]:
start = time.time()
model = xgb.XGBRegressor(learning_rate=0.1, 
                         n_estimators=2100,
                         max_depth=10,
                         n_jobs=-1, 
                         random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Mean Absolute Error : 1331.12
R-Square            : 0.98
Runtime             : 363sec
----------------------------------------------------------------------------------------------------
Mean Absolute Error : 1322.12
R-Square            : 0.98
Runtime             : 375sec


In [18]:
start = time.time()

model = xgb.XGBRegressor(learning_rate=0.05, 
                         n_estimators=4200,
                         max_depth=10,
                         n_jobs=-1, 
                         random_state=1)

model.fit(x_train, y_train)

y_pred = model.predict(x_val)
print('Validation')
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Test')
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Validation
Mean Absolute Error : 1326.50
R-Square            : 0.98
Runtime             : 697sec
----------------------------------------------------------------------------------------------------
Test
Mean Absolute Error : 1321.54
R-Square            : 0.98
Runtime             : 727sec


In [25]:
grid_param = {'n_estimators' : range(100, 1001, 100)}

xgb_grid = xgb.XGBRegressor(learning_rate=0.3, 
                            max_depth=10,
                            gamma=1,
                            subsample=0.8,
                            colsample_bytree=1,
                            n_jobs=-1, 
                            random_state=1)

cv = KFold(n_splits=3)

grid = GridSearchCV(estimator=xgb_grid,
                          param_grid=grid_param,
                          scoring='neg_mean_absolute_error',
                          cv=cv,
                          n_jobs=-1)

grid.fit(x_train_val, y_train_val)

print(grid.best_params_, -grid.best_score_)

{'n_estimators': 1000} 1519.4321286818501


In [26]:
start = time.time()
model = xgb.XGBRegressor(learning_rate=0.05, 
                         n_estimators=6000,
                         max_depth=10,
                         gamma=1,
                         subsample=0.8,
                         colsample_bytree=1,
                         n_jobs=-1, 
                         random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_val, y_pred), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_val, y_pred), 2))
end = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end - start, 0))

print('-' * 100)

y_pred_test = model.predict(x_test)
print('Mean Absolute Error : {0:.{1}f}'.format(mean_absolute_error(y_test, y_pred_test), 2))
print('R-Square            : {0:.{1}f}'.format(r2_score(y_test, y_pred_test), 2))
end2 = time.time()
print('Runtime             : {0:.{1}f}sec'.format(end2 - start, 0))

Mean Absolute Error : 1308.63
R-Square            : 0.98
Runtime             : 1070sec
----------------------------------------------------------------------------------------------------
Mean Absolute Error : 1297.22
R-Square            : 0.98
Runtime             : 1124sec


<br><br><br><br><br><br><br><br>