In [15]:
# Import the analysis stack first, then report every version in one place so this
# run can be compared with the versions asserted in the following cell.
import sys
import os
import inspect

import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import joblib

print('python version:', sys.version_info)
print('pandas version: ', pd.__version__)
print('numpy version: ', np.__version__)
print('sklearn version: ', sklearn.__version__)
print('xgboost version: ', xgb.__version__)
print('joblib version: ', joblib.__version__)

python version: sys.version_info(major=3, minor=6, micro=13, releaselevel='final', serial=0)
pandas version:  1.1.4
numpy version:  1.19.4
sklearn version:  0.21.0
xgboost version:  0.82
joblib version:  0.15.0


In [16]:
def _version_tuple(version):
    """Return the leading numeric components of a dotted version string as ints.

    Parsing stops at the first component that is not purely digits, so
    ``'0.21.0'`` gives ``(0, 21, 0)`` and ``'1.2.dev0'`` gives ``(1, 2)``.
    """
    numbers = []
    for piece in str(version).split('.'):
        if not piece.isdigit():
            break
        numbers.append(int(piece))
    return tuple(numbers)


# Fail fast when the environment differs from the one this notebook was built for.
# Versions are compared as integer tuples: comparing the raw strings is lexicographic,
# so e.g. '0.9' > '0.21' and '1.1.40' < '1.1.5' would give the wrong answer.
assert sys.version_info[:2] == (3, 6), sys.version_info
assert _version_tuple(sklearn.__version__)[:2] == (0, 21), sklearn.__version__
assert _version_tuple(pd.__version__)[:3] == (1, 1, 4), pd.__version__
assert _version_tuple(np.__version__) == (1, 19, 4), np.__version__
assert _version_tuple(xgb.__version__) == (0, 82), xgb.__version__
assert _version_tuple(joblib.__version__)[:2] == (0, 15), joblib.__version__

In [17]:
# Put the parent of the working directory at the front of sys.path so the project
# modules imported in the next cell can be found (presumably they live there).
# A notebook cell has no real source file, so inspect.getfile(inspect.currentframe())
# only led to the working directory through the way IPython names cells; ask for
# the working directory directly instead.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
if parentdir in sys.path:
    # Re-running this cell must not stack duplicate entries.
    sys.path.remove(parentdir)
sys.path.insert(0, parentdir)

# Display limits for DataFrames rendered in this notebook.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

In [18]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction
from statistic import Statistic
from helper import DataHelper

In [19]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules are
# picked up when a cell runs without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Directory the xgb_ky9 models are loaded from in the following cells.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
path_model = '/mnt/files/workdata/work/merchants/merchant_32_ky9_2020-05-12_white_visa/04_experiments/ex_06_for_prod/'

In [21]:
# joblib.load unpickles the file, so only load model files from a trusted location.
xgb_ky9_5_80_03_2020_07_08 = joblib.load(os.path.join(path_model, 'xgb_ky9_5_80_03_2020_07_08'))

In [22]:
# joblib.load unpickles the file, so only load model files from a trusted location.
xgb_ky9_5_90_035_2020_07_08 = joblib.load(os.path.join(path_model, 'xgb_ky9_5_90_035_2020_07_08'))

In [23]:
# Third model, stored in a separate directory (the on-disk file name uses dashes).
# joblib.load unpickles the file, so only load model files from a trusted location.
path_model2 = '/mnt/files/workdata/work/merchant-ky-2019-03-25/09_models/2019-10-30/'
xgb_v2_3_120_01_ky5_prod = joblib.load(os.path.join(path_model2, 'xgb_v2-3-120-01-ky5-prod'))

In [24]:
# Algorithm name and feature list stored on xgb_v2_3_120_01_ky5_prod, one per line.
print(xgb_v2_3_120_01_ky5_prod._algorithm_name, xgb_v2_3_120_01_ky5_prod._factor_list, sep='\n')

xgboost
['amount', 'bank_currency', 'bin', 'client_hour', 'day_of_week', 'hour', 'is_bank_country_equal_country', 'is_ip_country_equal_country', 'latitude', 'longitude', 'phone_2_norm']


In [25]:
# Algorithm name and feature list stored on xgb_ky9_5_80_03_2020_07_08, one per line.
print(xgb_ky9_5_80_03_2020_07_08._algorithm_name, xgb_ky9_5_80_03_2020_07_08._factor_list, sep='\n')

xgboost
['amount', 'bank_currency', 'bin', 'day_of_week', 'hour', 'latitude', 'longitude', 'phone_2_norm']


In [30]:
# Load the CSV at UtilsKy.KY_10_RESOLVED; dtype=str keeps every column as text.
resolved = pd.read_csv(UtilsKy.KY_10_RESOLVED, dtype=str)

In [26]:
# Load the white-list CSV at UtilsKy.WHITE_KY9_FOR_PROD; dtype=str keeps every column as text.
white = pd.read_csv(UtilsKy.WHITE_KY9_FOR_PROD , dtype=str)

In [27]:
# (rows, columns) of the white list.
white.shape

(1055452, 1)

In [28]:
# Select the model evaluated by the rest of the notebook:
#   1 -> xgb_ky9_5_80_03_2020_07_08
#   2 -> xgb_ky9_5_90_035_2020_07_08
#   any other value -> xgb_v2_3_120_01_ky5_prod
model_num = 1
if model_num not in (1, 2):
    model_xgb, model_name = xgb_v2_3_120_01_ky5_prod, 'xgb_v2_3_120_01_ky5_prod'
elif model_num == 2:
    model_xgb, model_name = xgb_ky9_5_90_035_2020_07_08, 'xgb_ky9_5_90_035_2020_07_08'
else:
    model_xgb, model_name = xgb_ky9_5_80_03_2020_07_08, 'xgb_ky9_5_80_03_2020_07_08'

In [31]:
# Split the resolved data 70/30 and print the status counts of each part.
db_teach, db_test = Statistic.split_train_test_with_diff_ids(resolved, train_size=0.7)  # test_has_unique_ids=True
print(db_teach.status.value_counts())
print(db_test.status.value_counts())

# Feature columns of the selected XGBoost model, sorted alphabetically.
COL_FACTORS = sorted(model_xgb._factor_list)

datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()

# latitude/longitude gaps are filled with the values returned by get_mean_value();
# the 'default' entry (-999) is presumably the fill for every other column.
mean_values = datahelper.get_mean_value()
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)
train, test = datahelper.get_train_test()

# Both parts in one frame, ordered by index, so rows can be selected by index label later.
train_test = pd.concat([train, test]).sort_index()

0    143294
1      1011
Name: status, dtype: int64
0    61413
1      388
Name: status, dtype: int64
Statistic na values in columns : 
train na columns : Index(['latitude', 'longitude'], dtype='object')
latitude_na: 736,longitude_na: 736
test na columns : Index(['latitude', 'longitude'], dtype='object')
latitude_na: 323,longitude_na: 323
Replaced na values:
bank_currency_na -> -999
longitude_na -> -77.91555959852059
amount_na -> -999
bin_na -> -999
hour_na -> -999
latitude_na -> 33.251163338185826
day_of_week_na -> -999
phone_2_norm_na -> -999


In [32]:
# Feature names as stored on the selected model.
# NOTE(review): this rebinds COL_FACTORS without the sorted() applied in the data-preparation cell — confirm that is intended.
COL_FACTORS = model_xgb._factor_list
# Random seeds 1..30, one train/test split per seed.
random_list = list(range(1, 31))

In [35]:
# Score the selected, already-fitted model on one random 70/30 split per seed in
# random_list and collect the per-split prediction tables in `results`.
# NOTE: the loop rebinds db_teach, db_test, train and test from the data-preparation cell.
results = []
model = model_xgb
description = model_name  # the same label is used for every split
for rand in random_list:
    train_index, test_index = Statistic.split_train_test_indexes(
        resolved, train_size=0.7, random_state=rand, test_has_unique_ids=False)

    # .copy() gives the loop its own test frame, so adding the probability column
    # below never writes into a selection taken from `resolved`.
    db_teach, db_test = resolved.loc[train_index, :], resolved.loc[test_index, :].copy()
    train, test = train_test.loc[train_index, :], train_test.loc[test_index, :]

    # The analyzer is built before the probability column exists, as in the original
    # flow; it presumably reads db_test["probability"] when the table is requested.
    analyzer_prediction = AnalyzerPrediction(db_teach, db_test, white)
    test_pred = model.predict_proba(test.values)
    db_test["probability"] = test_pred[:, 1]  # second column — presumably the positive class; TODO confirm
    result_df_amount = analyzer_prediction.get_table_prediction(
        description=description, result_df=None, metric="amount")
    results.append(result_df_amount)

In [36]:
# Stack the per-split tables into one frame with a fresh 0..n-1 index.
result_df = pd.concat(results, ignore_index=True)
n = result_df.shape[0]
sub_rows = list(range(n))[::2]
# Keep every second row (0, 2, 4, ...) — presumably the no-threshold row of each split; TODO confirm.
result_df_without_threshold = result_df.iloc[sub_rows, :].copy()

# Summary row: mean of every p_* column and of rating over the kept rows.
row = {'description': 'Total'}
for col in list(result_df_without_threshold):
    if col.startswith('p_') or col == 'rating':
        row[col] = result_df_without_threshold[col].astype(float).mean()

# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4 and removed in 2.0.
result_df_without_threshold = pd.concat(
    [result_df_without_threshold, pd.DataFrame([row])], ignore_index=True)

In [37]:
# main factors from resolved_ky10 - 30 folds
mask = result_df_without_threshold.description != 'Total'
g = result_df_without_threshold[mask].groupby(['description'])['rating'].mean()
best_param = pd.DataFrame(g, columns=['rating'])
best_param.sort_values(by='rating', ascending=False, inplace=True)
best_param

Unnamed: 0_level_0,rating
description,Unnamed: 1_level_1
xgb_ky9_5_80_03_2020_07_08,152.565


In [38]:
# Show the last 15 rows of the per-split table; the appended 'Total' row is the final one.
result_df_without_threshold.tail(15)

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl,amount_test_bad_in_wl,n_teach,n_teach_bad,n_test,n_test_bad,amount_test_bad,amount_test
16,xgb_ky9_5_80_03_2020_07_08,0.16,4.46,6.61,8.03,8.34,10.62,12.71,17.58,32.06,100.57,1055452.0,10424.0,16.0,1045857.17,2376.23,144327.0,1033.0,61779.0,366.0,69692.98,6885047.3
17,xgb_ky9_5_80_03_2020_07_08,6.02,12.84,15.41,18.25,18.76,19.71,19.78,23.72,33.11,167.6,1055452.0,10325.0,10.0,1011884.71,1252.53,144246.0,952.0,61860.0,447.0,85274.05,6940540.02
18,xgb_ky9_5_80_03_2020_07_08,4.95,12.78,17.14,21.03,25.13,26.84,28.44,30.26,44.3,210.87,1055452.0,10239.0,40.0,1007127.45,3618.09,144287.0,993.0,61819.0,406.0,87495.61,7014270.61
19,xgb_ky9_5_80_03_2020_07_08,3.21,9.33,10.76,17.1,18.54,20.14,21.65,24.8,38.05,163.58,1055452.0,10402.0,28.0,1033700.42,3582.53,144260.0,966.0,61846.0,433.0,84602.27,6942585.68
20,xgb_ky9_5_80_03_2020_07_08,4.07,11.06,13.53,17.44,17.81,19.1,19.36,23.99,36.56,162.92,1055452.0,10278.0,12.0,1056457.02,1253.57,144262.0,968.0,61844.0,431.0,82619.69,6950788.29
21,xgb_ky9_5_80_03_2020_07_08,7.2,15.78,21.88,25.6,26.63,28.76,28.89,31.71,41.18,227.63,1055452.0,10224.0,27.0,1009369.43,2731.12,144311.0,1017.0,61795.0,382.0,79367.95,6883951.27
22,xgb_ky9_5_80_03_2020_07_08,1.19,3.1,5.94,8.86,11.44,11.81,13.83,15.91,32.14,104.22,1055452.0,10324.0,31.0,1034945.93,3206.35,144251.0,957.0,61855.0,442.0,78802.93,6924114.49
23,xgb_ky9_5_80_03_2020_07_08,1.9,6.34,9.1,12.21,14.22,17.06,17.9,20.36,29.69,128.78,1055452.0,10291.0,38.0,1042221.34,4893.39,144265.0,971.0,61841.0,428.0,70843.33,6913595.58
24,xgb_ky9_5_80_03_2020_07_08,2.5,5.57,9.48,11.5,13.03,13.4,16.01,21.54,36.61,129.64,1055452.0,10365.0,25.0,1041547.17,2371.78,144283.0,989.0,61823.0,410.0,81101.33,6910941.38
25,xgb_ky9_5_80_03_2020_07_08,1.71,7.02,11.79,13.66,16.04,18.94,21.0,22.78,37.99,150.93,1055452.0,10281.0,21.0,1015886.95,2959.15,144324.0,1030.0,61782.0,369.0,72083.28,6919469.92
