In [1]:
import os,sys,inspect
# Put the notebook's parent directory at the front of sys.path so that modules
# located there can be imported (presumably where `utils` / `helper` live --
# confirm against the repository layout).
# The working directory is used rather than
# inspect.getfile(inspect.currentframe()): that call returns whatever synthetic
# file name the kernel assigned to the cell, which on recent ipykernel versions
# is a temporary file unrelated to the notebook's location.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Only insert when it is not already first, so re-running the cell keeps
# sys.path free of duplicate entries while preserving lookup priority.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

In [2]:
import pandas as pd
import numpy as np

In [3]:
import xgboost as xgb

In [4]:
from utils import UtilsKy

In [5]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''Root mean squared log error metric.

    Args:
        predt: Array of predictions to score. It is not modified.
        dtrain: Object whose ``get_label()`` returns the true labels.

    Returns:
        A ``(name, value)`` tuple: the metric name ``'PyRMSLE'`` and
        ``sqrt(sum((log1p(y) - log1p(predt)) ** 2) / len(y))`` as a float.
    '''
    y = dtrain.get_label()
    # Clamp on a private copy so the caller's prediction array stays intact.
    predt = predt.copy()
    # log1p is finite only for inputs strictly greater than -1. Using `<=`
    # also covers a prediction of exactly -1, which would otherwise produce
    # -inf and turn the whole metric into inf.
    predt[predt <= -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)

    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))

In [7]:
from typing import Tuple
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Gradient of the squared log error with respect to the prediction.

    Evaluates ``(log1p(predt) - log1p(y)) / (predt + 1)`` element-wise, where
    ``y`` is the array returned by ``dtrain.get_label()``.
    '''
    labels = dtrain.get_label()
    log_residual = np.log1p(predt) - np.log1p(labels)
    return log_residual / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Second derivative of the squared log error with respect to the prediction.

    Evaluates ``(-log1p(predt) + log1p(y) + 1) / (predt + 1) ** 2``
    element-wise, where ``y`` is the array returned by ``dtrain.get_label()``.
    '''
    labels = dtrain.get_label()
    numerator = -np.log1p(predt) + np.log1p(labels) + 1
    denominator = np.power(predt + 1, 2)
    return numerator / denominator

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared log error objective, a simplified form of RMSLE usable as an
    objective function.

    Predictions below -1 are overwritten in place with ``-1 + 1e-6`` before
    the derivatives are evaluated. Returns the ``(gradient, hessian)`` pair.
    '''
    below_domain = predt < -1
    predt[below_domain] = -1 + 1e-6
    return gradient(predt, dtrain), hessian(predt, dtrain)

In [8]:
# kyw3 dataset: working directory path and the teach/test CSV extracts.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
# Both files are read with every column as str and decoded as cp1251.
db_teach, db_test = (
    pd.read_csv(csv_path, dtype=str, encoding='cp1251')
    for csv_path in (UtilsKy.DB_TEACH_KYW3, UtilsKy.DB_TEST_KYW3)
)

In [9]:
# Column names handed to DataHelper as the factor (feature) columns.
COL_FACTORS = [
    'bin',
    'amount',
    'bank_currency',
    'hour',
    'day_of_week',
    'longitude',
    'latitude',
    'phone_2_norm',
    'is_gender_undefined',
    'is_city_resolved',
]

In [10]:
# For Xgboost
from helper import DataHelper
datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()
mean_values = datahelper.get_mean_value()
# Replacement values passed to replaced_na_values: the two coordinate columns
# take their entries from mean_values, plus a 'default' entry of -999
# (presumably the fallback for every other column -- confirm in DataHelper).
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)
train, test = datahelper.get_train_test()

train na columns : Index(['latitude', 'longitude'], dtype='object')
test na columns : Index(['latitude', 'longitude'], dtype='object')
-999
-999
-999
-999
-999
-999
-999
36.90237577890762
-92.53325861542274
-999


In [11]:
# Convert the feature frames to plain arrays. Unlike `.values`, np.asarray
# also accepts an ndarray, so this cell can be re-run after `train` / `test`
# have already been replaced without raising AttributeError.
train = np.asarray(train)
test = np.asarray(test)
# Target column of the teach set, used as the training label.
label = db_teach['status']

In [20]:
# The training matrix carries the labels; the test matrix holds features only.
dtrain = xgb.DMatrix(data=train, label=label)
dtest = xgb.DMatrix(data=test)

In [21]:
# Booster parameters passed to xgb.train.
param = dict(
    max_depth=2,
    eta=1,
    objective='binary:logistic',
    nthread=4,
    eval_metric='auc',
)

In [22]:
# `feval` and `verbose_eval` only act on the datasets listed in `evals`;
# without a watchlist nothing is evaluated or logged during training. The
# training set is registered so the metrics are reported every 3 rounds.
# No early stopping is configured, so the fitted booster is unaffected.
bst = xgb.train(
    param,
    dtrain=dtrain,
    num_boost_round=10,
    evals=[(dtrain, 'train')],
    verbose_eval=3,
    feval=rmsle,
)

In [23]:
# Score the test matrix with the trained booster; the cell displays the result.
bst.predict(data=dtest)

array([0.01707395, 0.02037124, 0.00598064, ..., 0.00997019, 0.00997019,
       0.02628254], dtype=float32)