In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error
import pickle

import os
import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

In [2]:
datapath = 'data'

In [3]:
train_df = pd.read_csv(os.path.join(datapath,'train_feats.csv'))

In [4]:
train_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,new_most_frequent_state_others,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1
0,382.0,278.0,37.0,25.0,7.0,-10.0,1.817983,8.0,307.0,7.0,...,1,0,0,0,1,1,0,0,1,0
1,39.0,879.0,29.0,149.0,20.0,-4.0,0.379389,5.0,2.0,20.0,...,1,1,0,0,0,1,0,0,1,0
2,188.0,705.0,33.0,226.0,16.0,-4.0,0.15318,16.0,80.0,37.0,...,1,0,0,0,1,1,0,0,1,0
3,19.0,307.0,19.0,57.0,5.0,-9.0,-0.342725,1.0,307.0,19.0,...,1,0,0,1,0,0,1,0,0,1
4,26.0,80.0,27.0,69.0,9.0,-3.0,-0.431381,8.0,80.0,27.0,...,0,1,0,0,0,1,0,0,1,0


In [5]:
train_df.columns

Index(['hist_transactions_count', 'hist_most_frequent_merchant_cat',
       'hist_most_frequent_subsector', 'hist_most_frequent_city',
       'hist_most_frequent_state', 'hist_min_month_lag',
       'hist_max_purchase_amount', 'new_transactions_count',
       'new_most_frequent_merchant_cat', 'new_most_frequent_subsector',
       'new_most_frequent_city', 'new_most_frequent_state',
       'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month',
       'feature_1', 'feature_2', 'feature_3', 'target',
       'hist_most_frequent_merchant_cat_307.0',
       'hist_most_frequent_merchant_cat_705.0',
       'hist_most_frequent_merchant_cat_others',
       'hist_most_frequent_subsector_19.0',
       'hist_most_frequent_subsector_33.0',
       'hist_most_frequent_subsector_37.0',
       'hist_most_frequent_subsector_others', 'hist_most_frequent_city_69.0',
       'hist_most_frequent_city_others', 'hist_most_frequent_state_9.0',
       'hist_most_frequent_state_others',
       'new_

In [6]:
xgb_model = XGBRegressor()

test_params = { 'learning_rate': [0.01],
'n_estimators': [100,200,300],
'max_depth': [3,4,5],
'subsample': [0.8, 0.9, 1],
'colsample_bytree': [0.3, 0.5, 0.8, 0.9, 0.95],
'gamma': [0,1,5]
}

model = GridSearchCV(cv=5, estimator = xgb_model,param_grid = test_params) #
model.fit(train_df.drop('target',axis=1),train_df['target'])


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01], 'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5], 'subsample': [0.8, 0.9, 1], 'colsample_bytree': [0.3, 0.5, 0.8, 0.9, 0.95], 'gamma': [0, 1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [7]:
print( model.best_params_)

{'colsample_bytree': 0.95, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}


In [8]:
data_dmatrix = xgb.DMatrix(data=train_df.drop('target',axis=1),label=train_df['target'])
params = model.best_params_
params['objective'] = 'reg:linear'

In [9]:
params

{'colsample_bytree': 0.95,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 300,
 'objective': 'reg:linear',
 'subsample': 0.8}

In [82]:

#modify params

In [83]:
params['learning_rate'] = 0.01
params['n_estimators'] = 300
params['silent']=1
#params['colsample_bytree'] =  0.95
#params['gamma'] = 8
#params['max_depth'] = 6

In [110]:
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=800,early_stopping_rounds=100,
                  evals=[ (data_dmatrix,'train')],verbose_eval=200) 

[0]	train-rmse:3.62208
Will train until train-rmse hasn't improved in 100 rounds.
[200]	train-rmse:3.40958
[400]	train-rmse:3.36868
[600]	train-rmse:3.3423
[799]	train-rmse:3.31988


In [111]:
dev_df = pd.read_csv(os.path.join(datapath,'dev_feats.csv'))

In [112]:
dev_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,new_most_frequent_state_others,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1
0,215.0,705.0,33.0,233.0,9.0,-13.0,0.108101,3.0,80.0,17.0,...,0,0,1,0,0,1,0,0,0,1
1,13.0,661.0,8.0,69.0,9.0,-4.0,-0.285098,8.0,705.0,33.0,...,0,1,0,0,0,1,0,0,1,0
2,3.0,80.0,7.0,69.0,9.0,-3.0,2.630301,3.0,278.0,19.0,...,1,0,0,1,0,0,1,0,0,1
3,21.0,206.0,1.0,-1.0,-1.0,-4.0,-0.386422,1.0,884.0,27.0,...,1,0,0,0,1,0,1,0,1,0
4,203.0,278.0,37.0,25.0,7.0,-13.0,-0.439315,7.0,278.0,37.0,...,1,1,0,0,0,0,1,0,1,0


In [113]:
dev_dmatrix = xgb.DMatrix(data=dev_df.drop('target',axis=1))
y =dev_df['target']

In [114]:
 y_dev = xg_reg.predict(dev_dmatrix)

In [115]:
y_dev

array([-0.91751444, -1.2905118 , -0.36455244, ...,  0.20375445,
       -0.4160422 , -0.1665104 ], dtype=float32)

In [116]:
np.sqrt(mean_squared_error(y_dev,y))

3.4376224231588624

In [117]:
dev_df['xgb'] = y_dev

In [118]:
dev_df.to_csv(os.path.join(datapath,'dev.xgb'),index=False)

In [119]:
 y_train = xg_reg.predict(data_dmatrix)

In [120]:
np.sqrt(mean_squared_error(y_train,train_df['target']))

3.3198783423279083

In [121]:
train_df['xgb'] = y_train
train_df.to_csv(os.path.join(datapath,'train_xgb.csv'),index=False)

In [122]:
val_df = pd.read_csv(os.path.join(datapath,'val_feats.csv'))
val_dmatrix = xgb.DMatrix(data=val_df.drop('target',axis=1))
y_val = xg_reg.predict(val_dmatrix)
val_df['xgb'] = y_val
val_df.to_csv(os.path.join(datapath,'val_xgb.csv'),index=False)

In [123]:
val_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1,xgb
0,234.0,307.0,34.0,8.0,12.0,-13.0,-0.329727,2.0,80.0,37.0,...,0,1,0,0,0,0,1,0,1,-0.098036
1,17.0,454.0,39.0,-1.0,-1.0,-4.0,1.311724,4.0,34.0,20.0,...,0,0,1,0,1,0,0,0,1,-1.381747
2,12.0,427.0,27.0,25.0,7.0,-5.0,-0.445175,3.0,87.0,21.0,...,1,0,0,0,0,1,0,1,0,-1.356377
3,24.0,222.0,21.0,341.0,15.0,-9.0,-0.476461,6.0,705.0,33.0,...,0,1,0,0,0,0,1,0,1,-1.697394
4,29.0,307.0,19.0,331.0,16.0,-3.0,2.964641,4.0,278.0,29.0,...,0,0,0,1,0,1,0,1,0,0.424088


In [124]:
#pickle.dump( xg_reg, open("xgboost.pickle.dat", "wb"))

In [125]:
#xg_reg = pickle.load(open("xgboost.pickle.dat", "rb"))

In [126]:
test_df_feats = pd.read_csv(os.path.join(datapath,'test_feats.csv'))


In [127]:
set(train_df.columns) - set(test_df_feats.columns)

{'target', 'xgb'}

In [128]:
set(test_df_feats.columns) - set(train_df.columns) 

set()

In [129]:
test_dmatrix = xgb.DMatrix(data=test_df_feats)
y_test = xg_reg.predict(test_dmatrix)
#test_df_['xgb'] = y_test
#test_df.to_csv(os.path.join(datapath,'test_xgb.csv'),index=False)

predict submission

In [130]:
test_df = pd.read_csv(os.path.join(datapath,'test.csv'))

In [131]:
test_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [132]:
test_df['target'] = y_test

In [133]:
test_df.drop(['first_active_month','feature_1','feature_2','feature_3'],axis=1, inplace=True)

In [134]:
test_df.to_csv(os.path.join(datapath,'test_xgb.csv'),index=False)