In [1]:
# load basics library
import datetime
import pandas as pd
import numpy as np
import calendar
from sklearn.cluster import MiniBatchKMeans
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.svm.libsvm import cross_validation
from sklearn.model_selection import *
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.utils.multiclass import type_of_target

'''use this model_selection rather than grid-server,
#  https://stackovlem o ferflow.com/questions/40257492/gridsearchcv-typeerror-stratifiedkfold-object-is-not-iterable?rq=1
#  can solve problem of “'StratifiedKFold' object is not iterable”
'''
from sklearn.model_selection import GridSearchCV


#import commond.ipynb from same folder
import import_ipynb
from common_func import *

importing Jupyter notebook from common_func.ipynb


In [2]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

## Scoring	Function	Comment

####　Classification	 	 
‘accuracy’	metrics.accuracy_score	 
‘balanced_accuracy’	metrics.balanced_accuracy_score	for binary targets
‘average_precision’	metrics.average_precision_score	 
‘brier_score_loss’	metrics.brier_score_loss	 
‘f1’	metrics.f1_score	for binary targets
‘f1_micro’	metrics.f1_score	micro-averaged
‘f1_macro’	metrics.f1_score	macro-aeraged
‘f1_weighted’	metrics.f1_score	weighted average
‘f1_samples’	metrics.f1_score	by multilabel sample
‘neg_log_loss’	metrics.log_loss	requires predict_proba support
‘precision’ etc.	metrics.precision_score	suffixes apply as with ‘f1’
‘recall’ etc.	metrics.recall_score	suffixes apply as with ‘f1’
‘roc_auc’	metrics.roc_auc_score	 

####　Clustering	 	 
‘adjusted_mutual_info_score’	metrics.adjusted_mutual_info_score	 
‘adjusted_rand_score’	metrics.adjusted_rand_score	 
‘completeness_score’	metrics.completeness_score	 
‘fowlkes_mallows_score’	metrics.fowlkes_mallows_score	 
‘homogeneity_score’	metrics.homogeneity_score	 
‘mutual_info_score’	metrics.mutual_info_score	 
‘normalized_mutual_info_score’	metrics.normalized_mutual_info_score	 
‘v_measure_score’	metrics.v_measure_score	 

####　Regression	 	 
‘explained_variance’	metrics.explained_variance_score	 
‘neg_mean_absolute_error’	metrics.mean_absolute_error	 
‘neg_mean_squared_error’	metrics.mean_squared_error	 
‘neg_mean_squared_log_error’	metrics.mean_squared_log_error	 
‘neg_median_absolute_error’	metrics.median_absolute_error	 
‘r2’	metrics.r2_score	 

In [3]:
X_train, y_train, X_test, y_test = load_dataSet()
X_train, y_train, X_test, y_test = X_train.values, y_train.values, X_test.values, y_test.values

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2017-01-07 12:47:25   2017-01-07 13:07:19                  N   
1         2  2017-01-21 12:21:04   2017-01-21 12:22:57                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0           1            40           141              1.0           2.91   
1           1            74            74              1.0           0.36   

   fare_amount ...   ehail_fee  improvement_surcharge  total_amount  \
0         13.0 ...         NaN                    0.3         16.56   
1          3.5 ...         NaN                    0.3          4.30   

   payment_type  trip_type  duration  day_of_week  weekend      speed  tip  
0             1        1.0    1194.0            5        1   8.773869    1  
1             2        1.0     113.0            5        1  11.469027    0  

[2 rows x 24 columns]


## since Size of dataset is very big, it's chosed 5000 data point as input.

In [4]:
#X_train = X_train[:5000]
#y_train = y_train[:5000]

length = X_train.shape[0]
# scale data since DL model is sentive to feature order of magnitude
scalarX, scalarY = MinMaxScaler(), MinMaxScaler()
scalarX.fit(X_train)
scalarY.fit(y_train.reshape(length,1))
X_train_scaled = scalarX.transform(X_train)
y_train_scaled = scalarY.transform(y_train.reshape(length,1))

length = X_test.shape[0]
# scale data since DL model is sentive to feature order of magnitude
scalarX, scalarY = MinMaxScaler(), MinMaxScaler()
scalarX.fit(X_test)
scalarY.fit(y_test.reshape(length,1))
X_test_scaled = scalarX.transform(X_test)
y_test_scaled = scalarY.transform(y_test.reshape(length,1))

In [5]:
X_train_scaled.shape, y_train_scaled.shape, X_test_scaled.shape, y_test_scaled.shape

((2086637, 15), (2086637, 1), (1019792, 15), (1019792, 1))

In [6]:
##　https://xgboost.readthedocs.io/en/latest/parameter.html?highlight=objective

In [None]:
start_time  = datetime.datetime.now()

# Various hyper-parameters to tune
xgb1 = XGBRegressor()
parameters = {'nthread':[6], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [0.01, 0.02, .05], #so called `eta` value
              'max_depth': [5, 6, 7, 9, 12],
              'min_child_weight': [4, 6, 8],
              'silent': [1],
              'subsample': [0.8, 0.9],
              'colsample_bytree': [0.7],
              'n_estimators': [500, 600],
              'tree_method': ['gpu_hist']}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 8,
                        verbose=True)

xgb_grid.fit(X_train_scaled, y_train_scaled)
end_time = datetime.datetime.now()
    
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
print('Training Done..., Time Cost: %d' % ((end_time - start_time).seconds))

In [47]:
## with GPU is enabled,


![GPU]("../picture/GPU.png")

/bin/sh: 1: Syntax error: word unexpected (expecting ")")


In [13]:
xlf = xgb.XGBRegressor(max_depth=12,
                       tree_method='gpu_hist',
                       objective='reg:linear',
                       learning_rate=0.05,
                       n_estimators=600,
                       silent=True,
                       gamma=0,
                       min_child_weight=8,
                       max_delta_step=0,
                       subsample=0.8,
                       colsample_bytree=0.7,
                       colsample_bylevel=1,
                       reg_alpha=1e0,
                       reg_lambda=0,
                       scale_pos_weight=1,
                       seed=9,
                       missing=None,
                       nthread=6)

In [8]:
X_train, y_train, X_test, y_test = load_dataSet()
X_train, y_train, X_test, y_test = X_train.values, y_train.values, X_test.values, y_test.values

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2017-01-07 12:47:25   2017-01-07 13:07:19                  N   
1         2  2017-01-21 12:21:04   2017-01-21 12:22:57                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0           1            40           141              1.0           2.91   
1           1            74            74              1.0           0.36   

   fare_amount ...   ehail_fee  improvement_surcharge  total_amount  \
0         13.0 ...         NaN                    0.3         16.56   
1          3.5 ...         NaN                    0.3          4.30   

   payment_type  trip_type  duration  day_of_week  weekend      speed  tip  
0             1        1.0    1194.0            5        1   8.773869    1  
1             2        1.0     113.0            5        1  11.469027    0  

[2 rows x 24 columns]


In [15]:
length = X_test.shape[0]
# scale data since DL model is sentive to feature order of magnitude
scalarX, scalarY = MinMaxScaler(), MinMaxScaler()
scalarX.fit(X_test)
scalarY.fit(y_test.reshape(length,1))
X_test_scaled = scalarX.transform(X_test)
y_test_scaled = scalarY.transform(y_test.reshape(length,1))

In [10]:
X_test_scaled.shape, y_test_scaled.shape


((1019792, 15), (1019792, 1))

In [11]:
eval_sets = [(X_train_scaled, y_train_scaled), (X_test_scaled, y_test_scaled)]
eval_metrics = ["mae","error","rmse"]

In [14]:
xlf.fit(X_train_scaled, 
        y_train_scaled, 
        eval_set=eval_sets, 
        verbose=True, 
        eval_metric=eval_metrics, 
        early_stopping_rounds=500)

[0]	validation_0-mae:0.471429	validation_0-error:0.003773	validation_0-rmse:0.471476	validation_1-mae:0.471389	validation_1-error:0.003813	validation_1-rmse:0.47144
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 500 rounds.
[1]	validation_0-mae:0.44787	validation_0-error:0.003773	validation_0-rmse:0.447915	validation_1-mae:0.447832	validation_1-error:0.003813	validation_1-rmse:0.447881
[2]	validation_0-mae:0.425485	validation_0-error:0.003773	validation_0-rmse:0.425527	validation_1-mae:0.425449	validation_1-error:0.003813	validation_1-rmse:0.425494
[3]	validation_0-mae:0.404218	validation_0-error:0.003773	validation_0-rmse:0.404259	validation_1-mae:0.404183	validation_1-error:0.003813	validation_1-rmse:0.404228
[4]	validation_0-mae:0.384014	validation_0-error:0.003773	validation_0-rmse:0.384054	validation_1-mae:0.383982	validation_1-error:0.003813	validation_1-rmse:0.384024
[5]	validati

[49]	validation_0-mae:0.038302	validation_0-error:0.003773	validation_0-rmse:0.038356	validation_1-mae:0.038289	validation_1-error:0.003813	validation_1-rmse:0.038358
[50]	validation_0-mae:0.036394	validation_0-error:0.003773	validation_0-rmse:0.03645	validation_1-mae:0.03638	validation_1-error:0.003813	validation_1-rmse:0.036453
[51]	validation_0-mae:0.034579	validation_0-error:0.003773	validation_0-rmse:0.034639	validation_1-mae:0.034565	validation_1-error:0.003813	validation_1-rmse:0.034641
[52]	validation_0-mae:0.032856	validation_0-error:0.003773	validation_0-rmse:0.032918	validation_1-mae:0.032842	validation_1-error:0.003813	validation_1-rmse:0.032921
[53]	validation_0-mae:0.031219	validation_0-error:0.003773	validation_0-rmse:0.031284	validation_1-mae:0.031204	validation_1-error:0.003813	validation_1-rmse:0.031288
[54]	validation_0-mae:0.029662	validation_0-error:0.003773	validation_0-rmse:0.02973	validation_1-mae:0.029647	validation_1-error:0.003813	validation_1-rmse:0.029735
[

[99]	validation_0-mae:0.003052	validation_0-error:0.003773	validation_0-rmse:0.003616	validation_1-mae:0.00304	validation_1-error:0.003813	validation_1-rmse:0.003785
[100]	validation_0-mae:0.002908	validation_0-error:0.003773	validation_0-rmse:0.003494	validation_1-mae:0.002896	validation_1-error:0.003813	validation_1-rmse:0.00367
[101]	validation_0-mae:0.002771	validation_0-error:0.003773	validation_0-rmse:0.00338	validation_1-mae:0.002759	validation_1-error:0.003813	validation_1-rmse:0.003561
[102]	validation_0-mae:0.002641	validation_0-error:0.003773	validation_0-rmse:0.003273	validation_1-mae:0.002629	validation_1-error:0.003813	validation_1-rmse:0.003461
[103]	validation_0-mae:0.002518	validation_0-error:0.003773	validation_0-rmse:0.003175	validation_1-mae:0.002507	validation_1-error:0.003813	validation_1-rmse:0.003369
[104]	validation_0-mae:0.002402	validation_0-error:0.003773	validation_0-rmse:0.003083	validation_1-mae:0.002391	validation_1-error:0.003813	validation_1-rmse:0.003

[148]	validation_0-mae:0.000539	validation_0-error:0.003773	validation_0-rmse:0.001989	validation_1-mae:0.000535	validation_1-error:0.003813	validation_1-rmse:0.002286
[149]	validation_0-mae:0.00053	validation_0-error:0.003773	validation_0-rmse:0.001987	validation_1-mae:0.000527	validation_1-error:0.003813	validation_1-rmse:0.002285
[150]	validation_0-mae:0.000522	validation_0-error:0.003773	validation_0-rmse:0.001986	validation_1-mae:0.000519	validation_1-error:0.003813	validation_1-rmse:0.002283
[151]	validation_0-mae:0.000514	validation_0-error:0.003773	validation_0-rmse:0.001984	validation_1-mae:0.000512	validation_1-error:0.003813	validation_1-rmse:0.002282
[152]	validation_0-mae:0.000506	validation_0-error:0.003773	validation_0-rmse:0.001982	validation_1-mae:0.000504	validation_1-error:0.003813	validation_1-rmse:0.00228
[153]	validation_0-mae:0.000499	validation_0-error:0.003773	validation_0-rmse:0.00198	validation_1-mae:0.000496	validation_1-error:0.003813	validation_1-rmse:0.00

[197]	validation_0-mae:0.00039	validation_0-error:0.003773	validation_0-rmse:0.001919	validation_1-mae:0.000393	validation_1-error:0.003813	validation_1-rmse:0.002208
[198]	validation_0-mae:0.000389	validation_0-error:0.003773	validation_0-rmse:0.001918	validation_1-mae:0.000392	validation_1-error:0.003813	validation_1-rmse:0.002207
[199]	validation_0-mae:0.000389	validation_0-error:0.003773	validation_0-rmse:0.001918	validation_1-mae:0.000392	validation_1-error:0.003813	validation_1-rmse:0.002206
[200]	validation_0-mae:0.000389	validation_0-error:0.003773	validation_0-rmse:0.001917	validation_1-mae:0.000392	validation_1-error:0.003813	validation_1-rmse:0.002206
[201]	validation_0-mae:0.000389	validation_0-error:0.003773	validation_0-rmse:0.001916	validation_1-mae:0.000392	validation_1-error:0.003813	validation_1-rmse:0.002205
[202]	validation_0-mae:0.000388	validation_0-error:0.003773	validation_0-rmse:0.001914	validation_1-mae:0.000391	validation_1-error:0.003813	validation_1-rmse:0.

[246]	validation_0-mae:0.00038	validation_0-error:0.003773	validation_0-rmse:0.001876	validation_1-mae:0.000384	validation_1-error:0.003813	validation_1-rmse:0.002157
[247]	validation_0-mae:0.00038	validation_0-error:0.003773	validation_0-rmse:0.001875	validation_1-mae:0.000384	validation_1-error:0.003813	validation_1-rmse:0.002155
[248]	validation_0-mae:0.000379	validation_0-error:0.003773	validation_0-rmse:0.001874	validation_1-mae:0.000384	validation_1-error:0.003813	validation_1-rmse:0.002153
[249]	validation_0-mae:0.000379	validation_0-error:0.003773	validation_0-rmse:0.001873	validation_1-mae:0.000383	validation_1-error:0.003813	validation_1-rmse:0.002152
[250]	validation_0-mae:0.000378	validation_0-error:0.003773	validation_0-rmse:0.001873	validation_1-mae:0.000383	validation_1-error:0.003813	validation_1-rmse:0.002152
[251]	validation_0-mae:0.000378	validation_0-error:0.003773	validation_0-rmse:0.001872	validation_1-mae:0.000383	validation_1-error:0.003813	validation_1-rmse:0.0

[295]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0.002118
[296]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0.002118
[297]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0.002118
[298]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0.002118
[299]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0.002117
[300]	validation_0-mae:0.000369	validation_0-error:0.003773	validation_0-rmse:0.001844	validation_1-mae:0.000374	validation_1-error:0.003813	validation_1-rmse:0

[344]	validation_0-mae:0.000358	validation_0-error:0.003772	validation_0-rmse:0.001811	validation_1-mae:0.000363	validation_1-error:0.003812	validation_1-rmse:0.002077
[345]	validation_0-mae:0.000357	validation_0-error:0.003772	validation_0-rmse:0.001811	validation_1-mae:0.000362	validation_1-error:0.003812	validation_1-rmse:0.002077
[346]	validation_0-mae:0.000357	validation_0-error:0.003772	validation_0-rmse:0.001811	validation_1-mae:0.000362	validation_1-error:0.003812	validation_1-rmse:0.002076
[347]	validation_0-mae:0.000357	validation_0-error:0.003772	validation_0-rmse:0.001811	validation_1-mae:0.000362	validation_1-error:0.003812	validation_1-rmse:0.002076
[348]	validation_0-mae:0.000357	validation_0-error:0.003772	validation_0-rmse:0.00181	validation_1-mae:0.000362	validation_1-error:0.003812	validation_1-rmse:0.002076
[349]	validation_0-mae:0.000357	validation_0-error:0.003772	validation_0-rmse:0.001809	validation_1-mae:0.000362	validation_1-error:0.003812	validation_1-rmse:0.

[393]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.001791	validation_1-mae:0.000357	validation_1-error:0.003812	validation_1-rmse:0.002051
[394]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.00179	validation_1-mae:0.000356	validation_1-error:0.003812	validation_1-rmse:0.002051
[395]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.00179	validation_1-mae:0.000356	validation_1-error:0.003812	validation_1-rmse:0.002049
[396]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.001789	validation_1-mae:0.000356	validation_1-error:0.003812	validation_1-rmse:0.002049
[397]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.001789	validation_1-mae:0.000356	validation_1-error:0.003812	validation_1-rmse:0.002049
[398]	validation_0-mae:0.000351	validation_0-error:0.003772	validation_0-rmse:0.001788	validation_1-mae:0.000356	validation_1-error:0.003812	validation_1-rmse:0.0

[442]	validation_0-mae:0.000343	validation_0-error:0.003771	validation_0-rmse:0.00177	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002027
[443]	validation_0-mae:0.000343	validation_0-error:0.003771	validation_0-rmse:0.00177	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002026
[444]	validation_0-mae:0.000342	validation_0-error:0.003771	validation_0-rmse:0.00177	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002026
[445]	validation_0-mae:0.000342	validation_0-error:0.003771	validation_0-rmse:0.00177	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002026
[446]	validation_0-mae:0.000342	validation_0-error:0.003771	validation_0-rmse:0.001769	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002026
[447]	validation_0-mae:0.000342	validation_0-error:0.003771	validation_0-rmse:0.001769	validation_1-mae:0.000348	validation_1-error:0.003811	validation_1-rmse:0.002

[491]	validation_0-mae:0.000338	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000343	validation_1-error:0.003811	validation_1-rmse:0.002008
[492]	validation_0-mae:0.000337	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000343	validation_1-error:0.003811	validation_1-rmse:0.002008
[493]	validation_0-mae:0.000337	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000343	validation_1-error:0.003811	validation_1-rmse:0.002008
[494]	validation_0-mae:0.000337	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000343	validation_1-error:0.003811	validation_1-rmse:0.002008
[495]	validation_0-mae:0.000337	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000343	validation_1-error:0.003811	validation_1-rmse:0.002007
[496]	validation_0-mae:0.000337	validation_0-error:0.003771	validation_0-rmse:0.001755	validation_1-mae:0.000342	validation_1-error:0.003811	validation_1-rmse:0

[540]	validation_0-mae:0.000333	validation_0-error:0.003771	validation_0-rmse:0.001739	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0.001988
[541]	validation_0-mae:0.000333	validation_0-error:0.003771	validation_0-rmse:0.001739	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0.001988
[542]	validation_0-mae:0.000333	validation_0-error:0.003771	validation_0-rmse:0.001739	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0.001988
[543]	validation_0-mae:0.000333	validation_0-error:0.003771	validation_0-rmse:0.001738	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0.001987
[544]	validation_0-mae:0.000333	validation_0-error:0.003771	validation_0-rmse:0.001738	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0.001986
[545]	validation_0-mae:0.000332	validation_0-error:0.003771	validation_0-rmse:0.001737	validation_1-mae:0.000338	validation_1-error:0.003811	validation_1-rmse:0

[589]	validation_0-mae:0.000328	validation_0-error:0.003771	validation_0-rmse:0.001728	validation_1-mae:0.000334	validation_1-error:0.003811	validation_1-rmse:0.001974
[590]	validation_0-mae:0.000328	validation_0-error:0.003771	validation_0-rmse:0.001728	validation_1-mae:0.000334	validation_1-error:0.003811	validation_1-rmse:0.001974
[591]	validation_0-mae:0.000328	validation_0-error:0.003771	validation_0-rmse:0.001727	validation_1-mae:0.000334	validation_1-error:0.003811	validation_1-rmse:0.001973
[592]	validation_0-mae:0.000328	validation_0-error:0.003771	validation_0-rmse:0.001727	validation_1-mae:0.000334	validation_1-error:0.003811	validation_1-rmse:0.001973
[593]	validation_0-mae:0.000327	validation_0-error:0.003771	validation_0-rmse:0.001727	validation_1-mae:0.000333	validation_1-error:0.003811	validation_1-rmse:0.001973
[594]	validation_0-mae:0.000327	validation_0-error:0.003771	validation_0-rmse:0.001726	validation_1-mae:0.000333	validation_1-error:0.003811	validation_1-rmse:0

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=12, min_child_weight=8, missing=None, n_estimators=600,
       n_jobs=1, nthread=6, objective='reg:linear', random_state=0,
       reg_alpha=1.0, reg_lambda=0, scale_pos_weight=1, seed=9,
       silent=True, subsample=0.8, tree_method='gpu_hist')

In [16]:
train_x, val_x, train_y, val_y = train_test_split(X_train_scaled, y_train_scaled, test_size = 0.30, random_state=1)

In [17]:
fit_params={'early_stopping_rounds': 30, 
            'eval_metric': 'rmse',
            'verbose': False,
            'eval_set': [[val_x, val_y]]}
xgb_cv = cross_val_score(xlf, train_x, train_y, 
                         cv = 5, 
                         scoring = 'neg_mean_absolute_error',
                         fit_params = fit_params)

In [18]:
print("Cross validation scores:", xgb_cv)
print("Mean={0:0.4f}, Var={1:0.4f}".format(np.mean(xgb_cv), np.var(xgb_cv)))

Cross validation scores: [-0.00037391 -0.00038812 -0.00038898 -0.00037234 -0.00037381]
Mean=-0.0004, Var=0.0000


In [19]:
xgb_acc = cross_val_score(xlf, train_x, train_y, 
                         cv = 5, 
                         scoring = 'r2',
                         fit_params = fit_params)

In [20]:
print("Cross validation scores of accuracy", xgb_acc)

Cross validation scores of accuracy [0.88126198 0.91217333 0.92780538 0.84769413 0.89836213]


## get the prediction and inverse transform back

In [25]:
test_dataset = pd.DataFrame()
test_dataset['Prediction'] = np.expm1(xlf.predict(X_test_scaled))
filename = 'tip_amount-prediction.csv'
pd.DataFrame({'tip_amount': test_dataset.Prediction}).to_csv(filename, index=False)

In [26]:
print(test_dataset['Prediction'].head())
print(test_dataset['Prediction'].count())

0    0.000109
1    0.003105
2    0.000005
3    0.005974
4   -0.000048
Name: Prediction, dtype: float32
1019792


In [29]:
test_dataset.shape

(1019792, 1)

In [31]:
y_prediction = scalarY.inverse_transform(test_dataset['Prediction'].values.reshape(test_dataset.shape[0],1))

In [34]:
print(y_prediction)


[[ 3.3142291e-02]
 [ 9.4074166e-01]
 [ 1.5802723e-03]
 ...
 [ 1.3854591e+00]
 [-4.4352625e-02]
 [ 1.6968850e+00]]


In [36]:
r2_score(y_test, y_prediction)

0.9367478125840831