In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error

import os
import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Relative directory that holds the input CSV files.
datapath = "data"

In [3]:
# Load the training feature table from <datapath>/train_feats.csv.
train_df = pd.read_csv(
    os.path.join(datapath, 'train_feats.csv'),
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
train_df.head(n=5)

Unnamed: 0,card_id,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,new_most_frequent_city,new_most_frequent_state,new_min_month_lag,new_max_purchase_amount,first_active_month,feature_1,feature_2,feature_3,target
0,C_ID_b57bd93808,15.0,705.0,33.0,322.0,11.0,-8.0,3.400409,3.0,879.0,29.0,-1.0,-1.0,1.0,-0.173481,2017-06-01,2,3,0,-1.433714
1,C_ID_fd1d32a99e,12.0,307.0,1.0,69.0,9.0,-4.0,6.060808,8.0,307.0,19.0,19.0,9.0,1.0,11.259275,2014-10-01,5,1,1,-4.474933
2,C_ID_cf7861b198,77.0,705.0,33.0,302.0,7.0,-13.0,5.963316,4.0,307.0,19.0,38.0,7.0,1.0,-0.660565,2015-12-01,5,1,1,2.156978
3,C_ID_80488d9731,131.0,307.0,19.0,173.0,9.0,-13.0,0.754692,4.0,278.0,37.0,140.0,9.0,1.0,11.559805,2016-11-01,5,1,1,-3.134993
4,C_ID_fd67be93ff,131.0,705.0,29.0,213.0,9.0,-5.0,2.645327,12.0,367.0,16.0,213.0,9.0,1.0,-0.326316,2017-09-01,5,1,1,-2.339241


In [5]:
# List the column labels available in the training frame.
train_df.columns

Index(['card_id', 'hist_transactions_count', 'hist_most_frequent_merchant_cat',
       'hist_most_frequent_subsector', 'hist_most_frequent_city',
       'hist_most_frequent_state', 'hist_min_month_lag',
       'hist_max_purchase_amount', 'new_transactions_count',
       'new_most_frequent_merchant_cat', 'new_most_frequent_subsector',
       'new_most_frequent_city', 'new_most_frequent_state',
       'new_min_month_lag', 'new_max_purchase_amount', 'first_active_month',
       'feature_1', 'feature_2', 'feature_3', 'target'],
      dtype='object')

In [6]:
# Collect every column whose name contains the substring 'most'
# (the "*_most_frequent_*" columns).
transform = [
    name
    for name in train_df.columns
    if 'most' in name
]

In [7]:
# Number of missing values in each column.
train_df.isna().sum()

card_id                                0
hist_transactions_count            14071
hist_most_frequent_merchant_cat    14071
hist_most_frequent_subsector       14071
hist_most_frequent_city            14071
hist_most_frequent_state           14071
hist_min_month_lag                 14071
hist_max_purchase_amount           14071
new_transactions_count             14071
new_most_frequent_merchant_cat     14071
new_most_frequent_subsector        14071
new_most_frequent_city             14071
new_most_frequent_state            14071
new_min_month_lag                  14071
new_max_purchase_amount            14071
first_active_month                     0
feature_1                              0
feature_2                              0
feature_3                              0
target                                 0
dtype: int64

In [8]:
# Row count before incomplete rows are removed.
train_df.shape[0]

129226

In [9]:
# Discard, in place, every row that has at least one missing value.
train_df.dropna(axis=0, how='any', inplace=True)

In [10]:
# Row count after incomplete rows were removed.
train_df.shape[0]

115155

In [11]:
# Cast the "most frequent ..." code columns to integers.
# `np.int` was a deprecated alias of the built-in `int` and has been removed
# from NumPy, so the cast is done with a single vectorised `astype` call
# instead of a per-element Python lambda. The explicit 'int64' matches the
# dtype pandas inferred from the Python ints the old code produced.
train_df = train_df.astype({col: 'int64' for col in transform})

In [12]:
def elapsedMonths(datestr, today=None):
    """Return the approximate number of months between ``datestr`` and ``today``.

    Parameters
    ----------
    datestr : str
        Date formatted as ``YYYY-MM-DD``.
    today : datetime.date or datetime.datetime, optional
        Reference date to measure up to. Defaults to the current local date
        (the original behaviour); pass a fixed date to get reproducible
        values across runs.

    Returns
    -------
    float
        Day difference divided by 30, i.e. a month is approximated as
        30 days. Negative when ``datestr`` lies after ``today``.

    Raises
    ------
    ValueError
        If ``datestr`` does not match the ``%Y-%m-%d`` format.
    """
    if today is None:
        today = datetime.date.today()
    elif isinstance(today, datetime.datetime):
        # A datetime cannot be subtracted from a plain date; keep the date part.
        today = today.date()
    start = datetime.datetime.strptime(datestr, "%Y-%m-%d").date()
    return (today - start).days / 30

In [13]:
# Replace each activation-date string with its elapsed-months value.
# After this runs the column holds numbers, so re-running the cell would
# hand non-string values to elapsedMonths.
train_df['first_active_month'] = train_df['first_active_month'].apply(elapsedMonths)

In [14]:
# Promote 'card_id' from a regular column to the row index (in place).
train_df.set_index(keys='card_id', inplace=True)

In [15]:
# Columns to one-hot encode: the "most frequent ..." columns plus the three
# feature_* columns.
categorical = [*transform, 'feature_1', 'feature_2', 'feature_3']

In [16]:
# One-hot encode every categorical column. Each source column name is used as
# the prefix, so a dummy column is named "<source column>_<value>".
train_df = pd.get_dummies(
    train_df,
    prefix=categorical,
    columns=categorical,
)

In [None]:
# Preview the first five rows of the encoded frame.
train_df.head(n=5)

Unnamed: 0_level_0,hist_transactions_count,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_min_month_lag,new_max_purchase_amount,first_active_month,target,hist_most_frequent_merchant_cat_-1,hist_most_frequent_merchant_cat_2,...,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_b57bd93808,15.0,-8.0,3.400409,3.0,1.0,-0.173481,19.466667,-1.433714,0,0,...,0,1,0,0,0,0,0,1,1,0
C_ID_fd1d32a99e,12.0,-4.0,6.060808,8.0,1.0,11.259275,51.933333,-4.474933,0,0,...,0,0,0,0,1,1,0,0,0,1
C_ID_cf7861b198,77.0,-13.0,5.963316,4.0,1.0,-0.660565,37.733333,2.156978,0,0,...,0,0,0,0,1,1,0,0,0,1
C_ID_80488d9731,131.0,-13.0,0.754692,4.0,1.0,11.559805,26.533333,-3.134993,0,0,...,0,0,0,0,1,1,0,0,0,1
C_ID_fd67be93ff,131.0,-5.0,2.645327,12.0,1.0,-0.326316,16.4,-2.339241,0,0,...,0,0,0,0,1,1,0,0,0,1


In [None]:
# Base estimator whose hyper-parameters are tuned below.
xgb_model = XGBRegressor()

# Exhaustive grid: 1 * 3 * 3 * 3 * 3 * 3 = 243 parameter combinations.
test_params = dict(
    learning_rate=[0.01],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.3, 0.5, 0.8],
    gamma=[0, 1, 5],
)

# 5-fold cross-validated search over the grid; every column except 'target'
# is a feature and 'target' is the regression label.
model = GridSearchCV(
    estimator=xgb_model,
    param_grid=test_params,
    cv=5,
)
model.fit(train_df.drop(columns='target'), train_df['target'])


In [None]:
# Show the parameter combination selected by the grid search.
print(model.best_params_)

In [None]:
# Wrap the features (all columns except 'target') and the label in XGBoost's
# native DMatrix container.
data_dmatrix = xgb.DMatrix(
    data=train_df.drop('target', axis=1),
    label=train_df['target'],
)

# Work on a copy: assigning into model.best_params_ directly would mutate the
# dict stored on the fitted search object.
params = dict(model.best_params_)
# 'reg:linear' is the deprecated alias of the squared-error objective.
params['objective'] = 'reg:squarederror'
# Note: 'n_estimators' is the scikit-learn wrapper's name for the number of
# boosting rounds; xgb.train takes that via num_boost_round, not via params.

In [None]:
# Display the parameter dict that is passed to xgb.train below.
params

In [None]:
# Optional: manually adjust entries of `params` here before the final training run.

In [None]:
# Train the final booster with the native API.
# The number of rounds is taken from the tuned 'n_estimators' entry instead of
# a hard-coded 300, so the value chosen by the grid search is actually used;
# 300 remains the fallback when the key is absent.
# NOTE(review): the only evaluation set is the training data itself, so early
# stopping monitors training error rather than held-out error. Consider
# supplying a separate validation DMatrix in `evals`.
xg_reg = xgb.train(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=params.get('n_estimators', 300),
    early_stopping_rounds=100,
    evals=[(data_dmatrix, 'train')],
    verbose_eval=100,
)