In [1]:
from collections import Counter
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 20)

In [2]:
import xgboost as xgb

In [3]:
import os,sys,inspect
# Put the notebook's parent directory on sys.path so the project-local modules
# imported in the following cells can be resolved.
# The working directory is used instead of inspecting the current frame: inside a
# kernel the frame's "file" is a synthetic per-cell name (a temp path on recent
# ipykernel versions), so its dirname is not reliably the notebook directory.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard keeps the cell idempotent: re-running it does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [4]:
from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction

In [5]:
# for autoreload modules
%load_ext autoreload
%autoreload 2

In [6]:
# kyw3
#path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
# KYW3 merchant: the teach and test sets are cp1251-encoded CSVs. Every column is
# read as a string, so numeric casts are left to the code that consumes them.
db_teach, db_test = (
    pd.read_csv(csv_path, dtype=str, encoding='cp1251')
    for csv_path in (UtilsKy.DB_TEACH_KYW3, UtilsKy.DB_TEST_KYW3)
)
# The white list is read with the default encoding, also as strings.
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [7]:
# ky9
# path_data = '/mnt/files/workdata/work/merchants/merchant_32_ky9_2020-05-12_white_visa/04_experiments/'
# db_teach = pd.read_csv(UtilsKy.DB_TEACH_KY9, dtype=str)
# db_test = pd.read_csv(UtilsKy.DB_TEST_KY9, dtype=str)
# white = pd.read_csv(UtilsKy.WHITE_KY9 , dtype=str)

# for prod
# db_teach = pd.read_csv(UtilsKy.DB_TEACH_KY9_FOR_PROD, dtype=str)
# db_test = pd.read_csv(UtilsKy.DB_TEST_KY9_FOR_PROD , dtype=str)
# white = pd.read_csv(UtilsKy.WHITE_KY9 , dtype=str)

# db_teach = pd.read_csv(path_data + 'ex_05_ky9_xgb_jupiter_2020_07_08/db_teach_ky9_is_frequency_ip.csv', dtype=str)
# db_test = pd.read_csv(path_data + 'ex_05_ky9_xgb_jupiter_2020_07_08/db_test_ky9_is_frequency_ip.csv', dtype=str)

In [8]:
db_teach.columns

Index(['amount', 'amount_deviation', 'bank_currency', 'bin', 'city',
       'count_months_to_end_card', 'day_of_week', 'gender2', 'hour', 'id',
       'is_city_resolved', 'is_gender_undefined', 'latitude', 'longitude',
       'order_id', 'phone_2_norm', 'status'],
      dtype='object')

In [9]:
Statistic.get_table_value_counts(db_teach, 'status')

0    427164
1      6261
Name: status, dtype: int64

In [10]:
Statistic.get_table_value_counts(db_test, 'status')

0    58107
1      755
Name: status, dtype: int64

In [11]:
# Columns used as model inputs, kept in alphabetical order.
COL_FACTORS = sorted([
    'bin',
    'amount',
    'bank_currency',
    'hour',
    'day_of_week',
    'longitude',
    'latitude',
    'phone_2_norm',
    'is_gender_undefined',
    'is_city_resolved',
])

In [12]:
# For Xgboost
from helper import DataHelper
# Prepare the train/test feature tables from the selected factor columns.
datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()

# NA replacement map: the two coordinate columns get the value reported by
# get_mean_value(); 'default' is presumably the fallback applied to every other
# column (TODO confirm in DataHelper.replaced_na_values).
mean_values = datahelper.get_mean_value()
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)
train, test = datahelper.get_train_test()

train na columns : Index(['latitude', 'longitude'], dtype='object')
test na columns : Index(['latitude', 'longitude'], dtype='object')
36.90237577890762
-999
-999
-999
-999
-999
-999
-999
-92.53325861542274
-999


##### Concatenate the teach set and the test set for validation
##### Use GridSearchCV to find the best params
###### For every set of parameters we run two experiments (PredefinedSplit):
###### 1. Train on the sample from the teach set and validate on the sample from the test set.
###### 2. Train on the sample from the test set and validate on the sample from the teach set.
##### For each experiment we calculate a custom metric.
##### We take the mean of the custom metric over the two experiments.
##### The best parameters are the ones with the best metric result.

In [13]:
# Stack teach rows first, then test rows, under a fresh 0..n-1 index.
# Features and labels are stacked in the same order; this assumes `train`/`test`
# keep the row order of db_teach/db_test (TODO confirm in DataHelper).
total_train = pd.concat((train, test), ignore_index=True).copy()
total_label = pd.concat(
    (db_teach.status, db_test.status),
    axis=0,
    ignore_index=True,
).copy()

In [14]:
# Per-row sample weights: rows labelled '0' get weight 1, every other row gets w.
# The labels are compared as strings ('0'), i.e. before any integer cast.
w = 69
total_weight = np.where(total_label.eq('0'), 1, w)

In [15]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.PredefinedSplit.html#sklearn.model_selection.PredefinedSplit
from sklearn.model_selection import PredefinedSplit

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import make_scorer

In [17]:
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

def custom_amount_score_p(y_true, y_predicted):
    '''Score test-set predictions through ``analyzer_prediction.get_amount_3ds``.

    Side effect: when the fold has the same length as ``db_test``, the
    predictions are stored in ``db_test["probability"]`` before scoring.

    Parameters
    ----------
    y_true : sized
        True labels of the fold. Only the length is used, to recognise whether
        the fold is the test set.
    y_predicted : array-like
        Predicted values for the fold, one per row.

    Returns
    -------
    number
        ``0`` when ``len(y_true) != len(db_test)``. Otherwise twice the sum
        (rounded to 2 decimals) of the first element returned by
        ``get_amount_3ds`` for each sample percent.
    '''
    # Guard: any fold that is not the test set is not scored.
    if len(y_true) != len(db_test):
        print('custom_amount_score_p')
        return 0

    db_test["probability"] = y_predicted

    report_parts = []
    percents_cumsum = 0
    for sample_percent in (1, 2, 3, 4, 5, 6, 7, 10, 20):
        percent_bad_sample = analyzer_prediction.get_amount_3ds(sample_percent)[0]
        percents_cumsum += percent_bad_sample
        report_parts.append('p_{}={},'.format(sample_percent, percent_bad_sample))

    percents_cumsum = round(percents_cumsum, 2)
    report_parts.append(' total score={}'.format(percents_cumsum))
    print(''.join(report_parts))

    # NOTE(review): the doubling presumably offsets the 0 returned for the other
    # fold, so the mean over two folds equals the test-set score — confirm.
    return 2 * percents_cumsum

In [18]:
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring

# For the GridSearchCV method
n_test = test.shape[0]
def custom_score_p(y_true, y_predicted):
    ''' Percent bad in sample.

    For each sample size p in (1, 2, 3, 4, 5, 6, 7, 10, 20) percent, the rows whose
    predicted value is at or above the (1 - p/100) quantile of ``y_predicted`` are
    selected. The share of all bad rows (``y_true`` summed) that falls into that
    selection is expressed as a percentage rounded to 2 decimals. The score is
    the sum of those percentages, rounded to 2 decimals.

    Parameters
    ----------
    y_true : numpy array or pandas Series of 0/1
        True labels of the fold (must support ``.sum()``).
    y_predicted : array-like
        Predicted values for the fold, one per row.

    Returns
    -------
    number
        The summed percentage score when the fold has ``n_test`` rows.
        ``0`` when the fold length differs from the module-level ``n_test``,
        or when the fold contains no bad rows (the share would be 0/0).
    '''
    print('custom_scoring n_test={} ' . format(len(y_true)))

    n_bad = y_true.sum()
    if n_bad == 0:
        # Without bad rows the captured share is undefined; avoid a nan score.
        print('custom_scoring_p  return 0')
        return 0

    score = 0
    message = ''
    for sample_percent in (1,2,3,4,5,6,7,10,20):
        # Threshold that selects the top `sample_percent` percent of predictions.
        q_percent = np.quantile(y_predicted, 1-sample_percent/100, axis=0)
        # Keep the true label only for selected rows; everything else counts as 0.
        y_true_p = np.where(y_predicted >= q_percent , y_true, 0)

        n_bad_sample = y_true_p.sum()
        p_bad_sample = n_bad_sample/n_bad
        percent_bad_sample = round(p_bad_sample*100,2)
        message += 'p_{}={},' . format(sample_percent, percent_bad_sample )

        score += percent_bad_sample

    score = round(score, 2)
    message += ' total score={}'.format(score)

    print(message)

    if len(y_true) != n_test:
        # The fold is not the test set: actually return 0, as the message says
        # (previously only an unused local was reset and the score was returned).
        print('custom_scoring_p  return 0')
        return 0

    return score

In [19]:
# needs_proba=True asks the scorer to pass predicted probabilities (instead of
# predicted class labels) to custom_amount_score_p; greater_is_better=True marks
# larger metric values as better.
# NOTE(review): newer scikit-learn releases deprecate needs_proba in favour of
# response_method — confirm against the installed version before upgrading.
score = make_scorer(custom_amount_score_p, greater_is_better=True, needs_proba=True)

In [20]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, disable_default_eval_metric=True)

In [21]:
# params = {'max_depth': range(2, 6), 'n_estimators': [500], 'learning_rate': np.arange(0.05, 0.5, 0.05), 
#          'subsample' : np.arange(0.5, 1, 0.05),  } 

In [22]:
params = {'max_depth': [2, 3, 5], 'learning_rate': [0.1, 0.2, 0.35], 'n_estimators': [60, 87, 110]}         

In [23]:
# Predefined split for the grid search over the concatenated teach + test rows
# (no random folds): teach rows are assigned to fold 1 and test rows to fold 0,
# so every candidate is validated once on the test rows and once on the teach rows.
n_train, n_test = train.shape[0], test.shape[0]
test_fold = n_train * [1] + n_test * [0]
ps = PredefinedSplit(test_fold=test_fold)

In [24]:
# HalvingGridSearchCV (alternative, kept for reference): by default it samples the
# dataset at three sizes: small, medium and big. It works with the 'custom_score_p' metric.
# clf = HalvingGridSearchCV(xgb_model, params, cv = ps, scoring=score)
# Exhaustive search over `params`, evaluated on the predefined split `ps` with the custom scorer.
clf = GridSearchCV(xgb_model, params, cv = ps, scoring=score)

In [25]:
# Extra keyword arguments forwarded to the estimator through clf.fit(..., **fit_params).
# 'sample_weight' carries the per-row weights built for the concatenated teach + test rows.
# NOTE(review): no eval_set is supplied anywhere in this notebook, so 'eval_metric'
# looks unused during fitting; recent xgboost releases also expect eval_metric on
# the estimator rather than in fit() — confirm before upgrading.
fit_params={ 'eval_metric': custom_amount_score_p, 'verbose': False, 'sample_weight': total_weight}
# fit_params={"early_stopping_rounds":50}

In [26]:
clf.fit(total_train, total_label.astype(int),  **fit_params) 

p_1=2.7,p_2=9.8,p_3=11.28,p_4=12.83,p_5=14.56,p_6=16.69,p_7=18.5,p_10=25.04,p_20=44.18, total score=155.58
custom_amount_score_p
p_1=3.49,p_2=9.93,p_3=11.1,p_4=11.97,p_5=14.74,p_6=17.22,p_7=19.34,p_10=26.92,p_20=43.1, total score=157.81
custom_amount_score_p
p_1=5.18,p_2=9.6,p_3=11.02,p_4=12.69,p_5=14.7,p_6=18.72,p_7=18.96,p_10=25.94,p_20=43.85, total score=160.66
custom_amount_score_p
p_1=6.35,p_2=10.11,p_3=12.0,p_4=16.82,p_5=16.95,p_6=19.86,p_7=21.92,p_10=28.79,p_20=41.41, total score=174.21
custom_amount_score_p
p_1=6.88,p_2=12.31,p_3=12.53,p_4=15.32,p_5=18.06,p_6=21.15,p_7=26.03,p_10=30.57,p_20=42.18, total score=185.03
custom_amount_score_p
p_1=6.88,p_2=12.31,p_3=14.44,p_4=16.09,p_5=20.72,p_6=23.16,p_7=26.42,p_10=32.17,p_20=42.13, total score=194.32
custom_amount_score_p
p_1=5.71,p_2=13.45,p_3=14.93,p_4=15.94,p_5=18.31,p_6=20.57,p_7=21.56,p_10=31.35,p_20=46.9, total score=188.72
custom_amount_score_p
p_1=6.75,p_2=11.21,p_3=11.54,p_4=14.14,p_5=17.91,p_6=20.08,p_7=24.08,p_10=33.2,p_

GridSearchCV(cv=PredefinedSplit(test_fold=array([1, 1, ..., 0, 0])),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     disable_default_eval_metric=True,
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subs

In [27]:
print(clf.best_score_)
print(clf.best_params_)

212.13
{'learning_rate': 0.35, 'max_depth': 3, 'n_estimators': 87}


In [28]:
# Switch to the underlying arrays for the final fit.
# `train`/`test` are rebound in place, so a plain `.values` access fails with
# AttributeError when this cell is re-run (ndarray has no `.values`). getattr
# returns the object unchanged in that case, which keeps the cell idempotent.
train = getattr(train, 'values', train)
test = getattr(test, 'values', test)
# Labels of the teach set (still strings here; they are cast to int before fitting).
label = db_teach.status

In [29]:
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

In [30]:
result_df_amount = None
weight = analyzer_prediction.get_xgb_weight()

In [31]:
# Metric used for the result tables: the second entry, 'amount'.
metric_scores = ['count', 'amount']
metric_score = metric_scores[1]

# Refit one model on the teach set with the best grid-search parameters.
config = clf.best_params_
label = label.astype(int)
model = xgb.XGBClassifier(
    use_label_encoder=False,
    disable_default_eval_metric=True,
    **config,
)
model.fit(train, label, sample_weight=weight)

# Column 1 of predict_proba is the probability of the positive class; it is stored
# on db_test, presumably for analyzer_prediction to read (TODO confirm).
test_pred = model.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

# First table row(s): prediction evaluated with the white list applied.
# NOTE: the 'wiht' spelling is a runtime label and is kept unchanged on purpose.
description = 'best_cv wiht wl-' + '-'.join(
    map(str, (config['max_depth'], config['n_estimators'], config['learning_rate'], metric_score))
)
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric=metric_score,
)

# Second table row(s): same prediction after swapping in an empty white list.
# This mutates analyzer_prediction, so later calls also run without the white list.
description = 'best_cv without wl-' + '-'.join(
    map(str, (config['max_depth'], config['n_estimators'], config['learning_rate'], metric_score))
)
analyzer_prediction.white_list = analyzer_prediction.get_empty_white_list()
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric=metric_score,
)

In [32]:
# Keep every second row of the results table (positions 0, 2, 4, ...).
n = result_df_amount.shape[0]
sub_rows = list(range(0, n, 2))
stat_best = result_df_amount.iloc[sub_rows, :].copy()

# Cast the p_* columns to float. Whole-column assignment replaces the columns, so
# the float dtype sticks; `.loc[:, cols] = ...` writes into the existing columns
# and, on pandas >= 2.0, can leave them with their previous (e.g. object) dtype.
col_names = [col for col in stat_best.columns if col.startswith('p_')]
if col_names:
    stat_best[col_names] = stat_best[col_names].astype(float)

# NOTE(review): 'rating' is not cast here; if it is stored as strings the sort is
# lexicographic rather than numeric — confirm its dtype.
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [33]:
stat_best.iloc[:,:11] 

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
0,best_cv wiht wl-3-87-0.35-amount,8.6,15.67,20.0,21.84,22.96,23.87,26.31,30.77,42.11,212.13
2,best_cv without wl-3-87-0.35-amount,6.35,12.99,17.42,20.09,21.66,23.53,25.35,28.57,40.5,196.46


# cross validation

In [None]:
# TODO: This code doesn't work yet.

In [271]:
# https://www.kaggle.com/cast42/xg-cv
train_matrix = xgb.DMatrix(train, label=db_teach.status)

In [272]:
# Stand-alone classifier used only as a source of booster parameters for xgb.cv
# and for the final fit in this experimental section.
# NOTE(review): this rebinds `clf`, which earlier held the GridSearchCV object whose
# best_params_ is read by the final-model cell; re-running that cell after this
# one would fail. A distinct name would avoid the clash.
# NOTE(review): `missing=9999999999` differs from the -999 sentinel used when NA
# values were replaced earlier — confirm which marker is intended.
clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 7,
                n_estimators=700,
                learning_rate=0.1, 
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 3,
                seed=1301)

In [273]:
xgb_param = clf.get_xgb_params()

In [294]:
# 3-fold stratified cross-validation on logloss, up to 1000 boosting rounds, with
# early stopping requested after 2 rounds without improvement. The custom feval
# metric is commented out, so only logloss is tracked.
cvresult = xgb.cv(xgb_param, train_matrix, num_boost_round=1000, nfold=3, metrics=['logloss'], #feval= custom_metric_p5,
     early_stopping_rounds=2, stratified=True, seed=1301)

In [295]:
cvresult

Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.603632,0.000030,0.603381,0.000038
1,0.529977,0.000022,0.530074,0.000010
2,0.468865,0.000081,0.468937,0.000024
3,0.417367,0.000077,0.417393,0.000028
4,0.373389,0.000042,0.373447,0.000062
...,...,...,...,...
995,0.032552,0.000311,0.048029,0.001153
996,0.032549,0.000311,0.048027,0.001154
997,0.032535,0.000315,0.048017,0.001153
998,0.032521,0.000310,0.048006,0.001160


In [298]:
clf.set_params(n_estimators=cvresult.shape[0])
print('Fit on the trainingsdata')
clf.fit(train, db_teach.status.astype(int))

Fit on the trainingsdata


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=3, missing=9999999999, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=1301, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=1301, subsample=1.0, tree_method='exact',
              validate_parameters=1, verbosity=None)