In [266]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error
import pickle

import os
import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

In [2]:
datapath = 'data'

In [3]:
train_df = pd.read_csv(os.path.join(datapath,'train_feats.csv'))

In [4]:
train_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,first_active_month_others,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1
0,382.0,278,37,25,7,-10.0,1.817983,8.0,307,7,...,1,0,0,0,1,1,0,0,1,0
1,39.0,879,29,149,20,-4.0,0.379389,5.0,2,20,...,1,1,0,0,0,1,0,0,1,0
2,188.0,705,33,226,16,-4.0,0.15318,16.0,80,37,...,1,0,0,0,1,1,0,0,1,0
3,19.0,307,19,57,5,-9.0,-0.342725,1.0,307,19,...,1,0,0,1,0,0,1,0,0,1
4,26.0,80,27,69,9,-3.0,-0.431381,8.0,80,27,...,1,1,0,0,0,1,0,0,1,0


In [5]:
train_df.columns

Index(['hist_transactions_count', 'hist_most_frequent_merchant_cat',
       'hist_most_frequent_subsector', 'hist_most_frequent_city',
       'hist_most_frequent_state', 'hist_min_month_lag',
       'hist_max_purchase_amount', 'new_transactions_count',
       'new_most_frequent_merchant_cat', 'new_most_frequent_subsector',
       'new_most_frequent_city', 'new_most_frequent_state',
       'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month',
       'feature_1', 'feature_2', 'feature_3', 'target',
       'hist_transactions_count_others', 'hist_most_frequent_merchant_cat_307',
       'hist_most_frequent_merchant_cat_705',
       'hist_most_frequent_merchant_cat_others',
       'hist_most_frequent_subsector_19', 'hist_most_frequent_subsector_33',
       'hist_most_frequent_subsector_37',
       'hist_most_frequent_subsector_others', 'hist_most_frequent_city_69',
       'hist_most_frequent_city_others', 'hist_most_frequent_state_9',
       'hist_most_frequent_state_others',

In [6]:
xgb_model = XGBRegressor()

test_params = { 'learning_rate': [0.01],
'n_estimators': [100,200,300],
'max_depth': [3,4,5],
'subsample': [0.8, 0.9, 1],
'colsample_bytree': [0.3, 0.5, 0.8],
'gamma': [0,1,5]
}

model = GridSearchCV(cv=5, estimator = xgb_model,param_grid = test_params) #
model.fit(train_df.drop('target',axis=1),train_df['target'])


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01], 'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5], 'subsample': [0.8, 0.9, 1], 'colsample_bytree': [0.3, 0.5, 0.8], 'gamma': [0, 1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [7]:
print( model.best_params_)

{'colsample_bytree': 0.8, 'gamma': 5, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}


In [8]:
data_dmatrix = xgb.DMatrix(data=train_df.drop('target',axis=1),label=train_df['target'])
params = model.best_params_
params['objective'] = 'reg:linear'

In [9]:
params

{'colsample_bytree': 0.8,
 'gamma': 5,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 300,
 'objective': 'reg:linear',
 'subsample': 0.8}

In [10]:
#modify params

In [221]:
params['learning_rate'] = 0.01
params['n_estimators'] = 600
params['silent']=1
params['colsample_bytree'] =  0.95
params['gamma'] = 8
params['max_depth'] = 6

In [222]:
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=800,early_stopping_rounds=100,
                  evals=[ (data_dmatrix,'train')],verbose_eval=200) 

[0]	train-rmse:3.6216
Will train until train-rmse hasn't improved in 100 rounds.
[200]	train-rmse:3.37544
[400]	train-rmse:3.31804
[600]	train-rmse:3.27705
[799]	train-rmse:3.24111


In [223]:
dev_df = pd.read_csv(os.path.join(datapath,'dev_feats.csv'))

In [224]:
dev_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,first_active_month_others,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1
0,215.0,705,33,233,9,-13.0,0.108101,3.0,80,17,...,1,0,1,0,0,1,0,0,0,1
1,13.0,661,8,69,9,-4.0,-0.285098,8.0,705,33,...,1,1,0,0,0,1,0,0,1,0
2,3.0,80,7,69,9,-3.0,2.630301,3.0,278,19,...,1,0,0,1,0,0,1,0,0,1
3,21.0,206,1,-1,-1,-4.0,-0.386422,1.0,884,27,...,1,0,0,0,1,0,1,0,1,0
4,203.0,278,37,25,7,-13.0,-0.439315,7.0,278,37,...,1,1,0,0,0,0,1,0,1,0


In [225]:
dev_dmatrix = xgb.DMatrix(data=dev_df.drop('target',axis=1))
y =dev_df['target']

In [226]:
 y_dev = xg_reg.predict(dev_dmatrix)

In [227]:
y_dev

array([-0.86635685, -1.6811938 , -0.25975776, ...,  0.25465256,
       -0.35390782, -0.19132167], dtype=float32)

In [228]:
np.sqrt(mean_squared_error(y_dev,y))

3.434352415292742

In [229]:
dev_df['xgb'] = y_dev

In [236]:
dev_df.to_csv(os.path.join(datapath,'dev.xgb'),index=False)

In [231]:
 y_train = xg_reg.predict(data_dmatrix)

In [232]:
np.sqrt(mean_squared_error(y_train,train_df['target']))

3.2411048918184893

In [237]:
train_df['xgb'] = y_train
train_df.to_csv(os.path.join(datapath,'train_xgb.csv'),index=False)

In [238]:
val_df = pd.read_csv(os.path.join(datapath,'val_feats.csv'))
val_dmatrix = xgb.DMatrix(data=val_df.drop('target',axis=1))
y_val = xg_reg.predict(val_dmatrix)
val_df['xgb'] = y_val
val_df.to_csv(os.path.join(datapath,'val_xgb.csv'),index=False)

In [242]:
val_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,...,feature_1_2,feature_1_3,feature_1_5,feature_1_others,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1,xgb
0,234.0,307,34,8,12,-13.0,-0.329727,2.0,80,37,...,0,1,0,0,0,0,1,0,1,-0.043149
1,17.0,454,39,-1,-1,-4.0,1.311724,4.0,34,20,...,0,0,1,0,1,0,0,0,1,-1.379803
2,12.0,427,27,25,7,-5.0,-0.445175,3.0,87,21,...,1,0,0,0,0,1,0,1,0,-1.074649
3,24.0,222,21,341,15,-9.0,-0.476461,6.0,705,33,...,0,1,0,0,0,0,1,0,1,-1.515783
4,29.0,307,19,331,16,-3.0,2.964641,4.0,278,29,...,0,0,0,1,0,1,0,1,0,0.429525


In [243]:
val_df.columns

Index(['hist_transactions_count', 'hist_most_frequent_merchant_cat',
       'hist_most_frequent_subsector', 'hist_most_frequent_city',
       'hist_most_frequent_state', 'hist_min_month_lag',
       'hist_max_purchase_amount', 'new_transactions_count',
       'new_most_frequent_merchant_cat', 'new_most_frequent_subsector',
       'new_most_frequent_city', 'new_most_frequent_state',
       'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month',
       'feature_1', 'feature_2', 'feature_3', 'target',
       'hist_transactions_count_others', 'hist_most_frequent_merchant_cat_307',
       'hist_most_frequent_merchant_cat_705',
       'hist_most_frequent_merchant_cat_others',
       'hist_most_frequent_subsector_19', 'hist_most_frequent_subsector_33',
       'hist_most_frequent_subsector_37',
       'hist_most_frequent_subsector_others', 'hist_most_frequent_city_69',
       'hist_most_frequent_city_others', 'hist_most_frequent_state_9',
       'hist_most_frequent_state_others',

In [267]:
pickle.dump( xg_reg, open("xgboost.pickle.dat", "wb"))

In [None]:
#xg_reg = pickle.load(open("xgboost.pickle.dat", "rb"))

In [268]:
test_df = pd.read_csv(os.path.join(datapath,'test_feats.csv'))


In [269]:
test_df.head()

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,new_most_frequent_city,new_most_frequent_state,new_min_month_lag,new_max_purchase_amount,first_active_month,feature_1,feature_2,feature_3
0,123.0,307.0,19.0,314.0,9.0,-5.0,0.768095,26.0,278.0,37.0,314.0,9.0,1.0,-0.078318,16.0,4,1,0
1,216.0,278.0,37.0,179.0,-1.0,-9.0,4.554145,31.0,278.0,37.0,69.0,9.0,1.0,3.129932,24.0,5,2,1
2,73.0,705.0,33.0,103.0,18.0,-13.0,-0.336684,3.0,605.0,2.0,69.0,9.0,1.0,-0.611669,41.0,5,1,1
3,30.0,560.0,34.0,233.0,9.0,-11.0,2.577843,1.0,432.0,27.0,23.0,9.0,2.0,-0.641872,25.0,2,2,0
4,70.0,705.0,33.0,344.0,18.0,-9.0,-0.097011,3.0,705.0,33.0,344.0,18.0,1.0,-0.667883,31.0,5,2,1


In [271]:
set(val_df.columns) - set(test_df.columns)

{'feature_1_2',
 'feature_1_3',
 'feature_1_5',
 'feature_1_others',
 'feature_2_1',
 'feature_2_2',
 'feature_2_3',
 'feature_3_0',
 'feature_3_1',
 'first_active_month_others',
 'hist_max_purchase_amount_others',
 'hist_min_month_lag_-13.0',
 'hist_min_month_lag_others',
 'hist_most_frequent_city_69',
 'hist_most_frequent_city_others',
 'hist_most_frequent_merchant_cat_307',
 'hist_most_frequent_merchant_cat_705',
 'hist_most_frequent_merchant_cat_others',
 'hist_most_frequent_state_9',
 'hist_most_frequent_state_others',
 'hist_most_frequent_subsector_19',
 'hist_most_frequent_subsector_33',
 'hist_most_frequent_subsector_37',
 'hist_most_frequent_subsector_others',
 'hist_transactions_count_others',
 'new_max_purchase_amount_others',
 'new_min_month_lag_1.0',
 'new_min_month_lag_2.0',
 'new_most_frequent_city_69',
 'new_most_frequent_city_others',
 'new_most_frequent_merchant_cat_278',
 'new_most_frequent_merchant_cat_307',
 'new_most_frequent_merchant_cat_80',
 'new_most_frequent_

In [272]:
set(test_df.columns) - set(val_df.columns) 

set()

In [270]:
test_dmatrix = xgb.DMatrix(data=test_df)
y_test = xg_reg.predict(test_dmatrix)
test_df['xgb'] = y_test
#val_df.to_csv(os.path.join(datapath,'val_xgb.csv'),index=False)

ValueError: feature_names mismatch: ['hist_transactions_count', 'hist_most_frequent_merchant_cat', 'hist_most_frequent_subsector', 'hist_most_frequent_city', 'hist_most_frequent_state', 'hist_min_month_lag', 'hist_max_purchase_amount', 'new_transactions_count', 'new_most_frequent_merchant_cat', 'new_most_frequent_subsector', 'new_most_frequent_city', 'new_most_frequent_state', 'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month', 'feature_1', 'feature_2', 'feature_3', 'hist_transactions_count_others', 'hist_most_frequent_merchant_cat_307', 'hist_most_frequent_merchant_cat_705', 'hist_most_frequent_merchant_cat_others', 'hist_most_frequent_subsector_19', 'hist_most_frequent_subsector_33', 'hist_most_frequent_subsector_37', 'hist_most_frequent_subsector_others', 'hist_most_frequent_city_69', 'hist_most_frequent_city_others', 'hist_most_frequent_state_9', 'hist_most_frequent_state_others', 'hist_min_month_lag_-13.0', 'hist_min_month_lag_others', 'hist_max_purchase_amount_others', 'new_transactions_count_1.0', 'new_transactions_count_2.0', 'new_transactions_count_3.0', 'new_transactions_count_others', 'new_most_frequent_merchant_cat_80', 'new_most_frequent_merchant_cat_278', 'new_most_frequent_merchant_cat_307', 'new_most_frequent_merchant_cat_others', 'new_most_frequent_subsector_19', 'new_most_frequent_subsector_27', 'new_most_frequent_subsector_37', 'new_most_frequent_subsector_others', 'new_most_frequent_city_69', 'new_most_frequent_city_others', 'new_most_frequent_state_9', 'new_most_frequent_state_others', 'new_min_month_lag_1.0', 'new_min_month_lag_2.0', 'new_max_purchase_amount_others', 'first_active_month_others', 'feature_1_2', 'feature_1_3', 'feature_1_5', 'feature_1_others', 'feature_2_1', 'feature_2_2', 'feature_2_3', 'feature_3_0', 'feature_3_1'] ['hist_transactions_count', 'hist_most_frequent_merchant_cat', 'hist_most_frequent_subsector', 'hist_most_frequent_city', 'hist_most_frequent_state', 'hist_min_month_lag', 'hist_max_purchase_amount', 'new_transactions_count', 'new_most_frequent_merchant_cat', 'new_most_frequent_subsector', 'new_most_frequent_city', 'new_most_frequent_state', 'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month', 'feature_1', 'feature_2', 'feature_3']
expected feature_2_3, feature_1_2, new_most_frequent_merchant_cat_307, hist_most_frequent_merchant_cat_others, hist_most_frequent_subsector_others, hist_most_frequent_state_9, hist_transactions_count_others, hist_most_frequent_subsector_33, new_most_frequent_merchant_cat_278, new_most_frequent_state_others, new_transactions_count_2.0, hist_most_frequent_merchant_cat_705, hist_most_frequent_subsector_37, new_transactions_count_1.0, new_most_frequent_subsector_others, new_most_frequent_merchant_cat_others, new_most_frequent_subsector_27, feature_1_5, hist_most_frequent_merchant_cat_307, new_min_month_lag_2.0, hist_most_frequent_city_69, feature_3_0, new_most_frequent_state_9, feature_3_1, hist_min_month_lag_others, new_transactions_count_others, new_transactions_count_3.0, new_most_frequent_city_others, new_most_frequent_merchant_cat_80, new_most_frequent_city_69, hist_min_month_lag_-13.0, hist_max_purchase_amount_others, new_most_frequent_subsector_37, new_min_month_lag_1.0, new_max_purchase_amount_others, feature_1_others, feature_2_1, hist_most_frequent_state_others, new_most_frequent_subsector_19, hist_most_frequent_subsector_19, hist_most_frequent_city_others, first_active_month_others, feature_1_3, feature_2_2 in input data

In [264]:
# transform the first_activ_month feature in : number of months until 01.01.2019
import datetime as dt

reference_date = dt.datetime.strptime('Jan 1 2019', '%b %d %Y').date()
def months_between_dates(date1):
    date2 = dt.datetime.strptime(date1, '%Y-%m-%d')
    return reference_date.month - date2.month + 12*(reference_date.year - date2.year)

In [265]:
test_df['first_active_month'] = test_df['first_active_month'].map(months_between_dates)

TypeError: strptime() argument 1 must be str, not float