In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv('../Data/half_processed_train.csv')
test = pd.read_csv('../Data/processes_X_test.csv')

In [3]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
xgbr = XGBRegressor()
rfr = RandomForestRegressor()

In [5]:
xgbr_params = {'n_estimators' : [10,20,30,40,50,60,70],
               'min_child_weight' : [1, 10, 20, 30], 
               'max_depth' : [2, 3, 4 ,5, 6, 7],
               'booster' : ['gbtree'],
               'eval_metric' : ['rmsle'],
               'eta' : [0.01, 0.05, 0.1, 0.2]               
}

In [6]:
xgbr_grid = GridSearchCV(param_grid=xgbr_params, estimator=xgbr, verbose=1)

In [7]:
X_train = train.drop(['xg_op_pred', 'rf_op_pred', 'time_spent'], axis=1)
y_train = train['time_spent']

In [8]:
from sklearn.model_selection import train_test_split
val_X_train, val_X_test, val_y_train, val_y_test = train_test_split(X_train, y_train, test_size=0.15)

In [9]:
cols = ['session_number', 'purchased', 'added_in_cart', 'checked_out', 'app']

In [10]:
xgbr_grid.fit(val_X_train[cols], val_y_train)

Fitting 5 folds for each of 672 candidates, totalling 3360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3360 out of 3360 | elapsed:  5.4min finished


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
      

In [11]:
xgbr_grid.best_params_

{'booster': 'gbtree',
 'eta': 0.1,
 'eval_metric': 'rmsle',
 'max_depth': 2,
 'min_child_weight': 30,
 'n_estimators': 50}

In [12]:
xgbr_val_y_pred = xgbr_grid.predict(val_X_test[cols])

In [13]:
print('XGBR: ', np.sqrt(mean_squared_log_error(abs(xgbr_val_y_pred), val_y_test)))
# print('RFR: ', np.sqrt(mean_squared_log_error(rfr_val_y_pred, val_y_test)))

XGBR:  2.038456803414801


In [53]:
xcols = ['session_number', 'purchased', 'added_in_cart', 'checked_out', 'app', 'iPhone']
xg_r = XGBRegressor(n_estimators=  32, 
                    learning_rate= .01,
                    max_depth=2, 
                    booster='gbtree', 
                    min_child_weight = 1,
#                     reg_alpha = 1000,
                    reg_lambda = 600)

xg_r.fit(val_X_train[xcols], val_y_train,eval_metric='rmsle', eval_set=[(val_X_test[xcols], val_y_test)], verbose=0)
print('XGBR: ', np.sqrt(mean_squared_log_error(abs(xg_r.predict(val_X_test[xcols])), val_y_test)))

# 1.5369333665150438

XGBR:  1.6680975021894295


In [30]:
def save_submission(estimator, name):
    pred = estimator.predict(test[xcols])
    sub = pd.read_csv('../Data/Sample Submission.csv')
    sub["time_spent"]=abs(pred)
    sub.to_csv('../Submissions/'+name+'.csv', index=False)
#     estimator.save_model('Models/'+name)

In [51]:
save_submission(xg_r, 'XGB_Manual_CV_1.627560542831945')

In [52]:
xg_r.save_model(fname='XGB_Manual_CV_1.627560542831945')