In [1]:
# Standard library
import inspect
import os
import sys

# Third-party
import joblib
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb

# Report the interpreter and library versions this notebook runs against,
# in the same order and format as before.
print('python version:', sys.version_info)
for lib_name, lib in (('pandas', pd), ('numpy', np), ('sklearn', sklearn),
                      ('xgboost', xgb), ('joblib', joblib)):
    print(lib_name + ' version: ', lib.__version__)

python version: sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
pandas version:  1.0.1
numpy version:  1.19.2
sklearn version:  0.24.1
xgboost version:  1.2.0
joblib version:  1.0.0


In [2]:
def _version_tuple(version):
    """Return the leading numeric components of a dotted version string as a tuple of ints.

    Parsing stops at the first component that does not start with a digit, and
    any non-digit suffix of a component is ignored, e.g. '1.2.0rc1' -> (1, 2, 0)
    and '0.24.dev0' -> (0, 24).
    """
    parts = []
    for component in str(version).split('.'):
        digits = ''
        for char in component:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        parts.append(int(digits))
    return tuple(parts)


# Versions are compared as integer tuples, not strings: lexicographic string
# comparison mis-orders multi-digit components (e.g. "1.9.0" >= "1.19.2" and
# "1.10.0" < "1.2" are both True as strings).
assert (3, 7) <= sys.version_info[:2] < (3, 8)
assert (0, 24) <= _version_tuple(sklearn.__version__) < (0, 25)
assert (1, 0, 1) <= _version_tuple(pd.__version__) < (1, 2)
# Upper bound is 1.20: the former string bound "1.2" only admitted 1.19.x
# because of character-wise comparison.
assert (1, 19, 2) <= _version_tuple(np.__version__) < (1, 20)
assert (1, 2, 0) <= _version_tuple(xgb.__version__) < (1, 3, 0)
assert (1, 0, 0) <= _version_tuple(joblib.__version__) < (1, 3, 0)

In [3]:
# Make the project modules in the parent directory importable.
# The working directory is used directly: a notebook cell has no real source
# file, so the directory derived from inspect.getfile() on the cell frame
# depends on how the kernel names cell code.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep parentdir at the front of sys.path without stacking a duplicate entry
# each time this cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

# pandas display limits used by the tables shown in this notebook.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

In [4]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction

In [5]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# NOTE(review): this cell runs after the `utils` / `analyzer` imports in the
# previous cell; consider loading the extension before the first project import.
%load_ext autoreload
%autoreload 2

In [6]:
# kyw3: experiment directory and the three input tables.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'

# Teach and test sets are cp1251-encoded; every column is read as a string.
_cp1251_str = {'dtype': str, 'encoding': 'cp1251'}
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, **_cp1251_str)
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, **_cp1251_str)

# The `white` table is read with the default encoding, also as strings.
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [7]:
# For check order replace: put the non-numeric marker 'undef' into the
# coordinates of the first test row, which is later taken as the verification
# order.
# The write goes through .iloc rather than `Series.values[0] = ...`, which only
# reaches the frame when `.values` happens to be a view of the column data.
# get_loc raises KeyError if a column is missing.
for _coord in ('latitude', 'longitude'):
    db_test.iloc[0, db_test.columns.get_loc(_coord)] = 'undef'

In [8]:
# Model input columns, kept in alphabetical order.
COL_FACTORS = sorted((
    'bin',
    'amount',
    'bank_currency',
    'hour',
    'day_of_week',
    'longitude',
    'latitude',
    'phone_2_norm',
    'is_gender_undefined',
    'is_city_resolved',
))
COL_FACTORS

['amount',
 'bank_currency',
 'bin',
 'day_of_week',
 'hour',
 'is_city_resolved',
 'is_gender_undefined',
 'latitude',
 'longitude',
 'phone_2_norm']

In [9]:
# For Xgboost
from helper import DataHelper

# Build the train/test matrices from the teach/test tables for COL_FACTORS
# and report which columns contain missing values.
datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()

# Replacement values: whatever get_mean_value() returns for the two coordinate
# columns, plus -999 under the 'default' key.
# NOTE(review): presumably 'default' applies to every other column — confirm in DataHelper.
mean_values = datahelper.get_mean_value()
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)

train, test = datahelper.get_train_test()

train na columns : Index(['latitude', 'longitude'], dtype='object')
test na columns : Index(['latitude', 'longitude'], dtype='object')
-999
-999
-999
-999
-999
-999
-92.53325861542274
36.90237577890762
-999
-999


In [10]:
# Convert the feature tables to plain arrays for the model.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# on a second run `train` / `test` are already ndarrays, which have no `.values`.
train = np.asarray(train)
test = np.asarray(test)

# Target column of the teach set.
label = db_teach.status

In [11]:
# Analyzer built from the teach set, the test set and the `white` table.
analyzer_prediction = AnalyzerPrediction(
    db_teach,
    db_test,
    white,
)

In [12]:
# Per-sample training weights supplied by the analyzer.
weight = analyzer_prediction.get_xgb_weight()

# Results table; starts empty and is passed back into get_table_prediction on each run.
result_df_amount = None

## Create model 

In [13]:
# Hyper-parameters of the boosted-tree classifier.
max_depth, eta, nround = 3, 0.35, 80
config = dict(max_depth=max_depth, learning_rate=eta, n_estimators=nround)
model = xgb.XGBClassifier(**config)

# Fit on the teach matrix with the analyzer-provided sample weights.
model.fit(train, label, sample_weight=weight, eval_metric='auc')

# Column 1 of predict_proba is stored on db_test as the score.
# NOTE(review): presumably the analyzer reads db_test["probability"] — confirm in AnalyzerPrediction.
test_pred = model.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

# Run label "<max_depth>-<nround>-<eta>", e.g. "3-80-0.35".
description = f'{max_depth}-{nround}-{eta}'
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric="amount",
)

In [26]:
# Display the first 11 columns of the results table (all rows).
result_df_amount.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
0,3-80-0.35,7.65,14.72,19.92,20.12,23.02,23.79,26.93,30.95,42.68,159.06
1,threshold amount,0.829273,0.793692,0.775948,0.757474,0.741971,0.730823,0.719396,0.688381,0.615996,159.06


## Save model

In [27]:
# Bundle saved with the model: the fitted estimator together with the factor
# order and NA replacement values that were used to build its inputs.
conf_model = {
    'profile': model,
    'algorithm_name': 'xgboost',
    'factor_list': COL_FACTORS,
    'replaced_values': replaced_values,
}

In [28]:
from datetime import date

# Current date formatted as 'YYYY-MM-DD'; used in the saved model's file name.
today = date.today().strftime("%Y-%m-%d")
today

'2021-01-26'

In [31]:
import re

# Artifact name "xgb_<description>_<date>" with every '.' removed,
# e.g. description "3-80-0.35" becomes "3-80-035".
file_name = re.sub("[.]", "", f'xgb_{description}_{today}')

In [32]:
# Persist the model bundle. `file_name` has no directory component, so the
# file is written relative to the current working directory.
joblib.dump(conf_model, file_name)

['xgb_3-80-035_2021-01-26']

## Load model

In [34]:
# NOTE(review): hard-coded artifact name. It equals the name written by the save
# step only for the 3-80-0.35 run dated 2021-01-26; on any other day, or with
# other hyper-parameters, this points at a different (possibly missing or stale) file.
file_name = 'xgb_3-80-035_2021-01-26'

In [35]:
# Load the saved model bundle back from disk.
# NOTE(review): joblib.load is pickle-based — only load artifacts from a trusted source.
m_config = joblib.load(file_name)

In [36]:
# Fitted estimator from the loaded bundle. Indexing (rather than .get) raises
# KeyError right here if the bundle has no 'profile' entry, instead of leaving
# m1 as None and failing later on m1.predict_proba.
m1 = m_config['profile']

In [37]:
# Re-score the test matrix with the reloaded estimator and overwrite the
# stored score column on db_test.
test_pred = m1.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

# Same run label as the training step ("<max_depth>-<nround>-<eta>"); the
# existing results table is passed back in.
description = f'{max_depth}-{nround}-{eta}'
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric="amount",
)

In [38]:
# First five values of predict_proba column 1 from the reloaded model.
test_pred[:5, 1]

array([0.24286704, 0.13331991, 0.2087203 , 0.39174694, 0.5787057 ],
      dtype=float32)

In [40]:
# Entries available in the loaded bundle.
m_config.keys()

dict_keys(['profile', 'algorithm_name', 'factor_list', 'replaced_values'])

In [41]:
# NA replacement values stored alongside the model.
m_config.get('replaced_values')

{'latitude': 36.90237577890762,
 'longitude': -92.53325861542274,
 'default': -999}

In [42]:
# Factor (column) order stored alongside the model.
m_config['factor_list']

['amount',
 'bank_currency',
 'bin',
 'day_of_week',
 'hour',
 'is_city_resolved',
 'is_gender_undefined',
 'latitude',
 'longitude',
 'phone_2_norm']

## Create verification order

In [43]:
# Verification order: the first test row restricted to the model factors.
# This is the row whose latitude/longitude were set to 'undef' earlier.
check_order = db_test[COL_FACTORS].iloc[0]
check_order

amount                 158.85
bank_currency             840
bin                    510932
day_of_week                 2
hour                       00
is_city_resolved            1
is_gender_undefined         1
latitude                undef
longitude               undef
phone_2_norm               20
Name: 0, dtype: object

In [44]:
# Verification order as a plain {factor: value} dict.
data = check_order.to_dict()
data

{'amount': 158.85,
 'bank_currency': '840',
 'bin': '510932',
 'day_of_week': '2',
 'hour': '00',
 'is_city_resolved': '1',
 'is_gender_undefined': '1',
 'latitude': 'undef',
 'longitude': 'undef',
 'phone_2_norm': '20'}

In [45]:
# Request payload: which saved profile to use, plus the order's factor values.
order = dict(
    config=dict(profile=file_name),
    data=data,
)
order

{'config': {'profile': 'xgb_3-80-035_2021-01-26'},
 'data': {'amount': 158.85,
  'bank_currency': '840',
  'bin': '510932',
  'day_of_week': '2',
  'hour': '00',
  'is_city_resolved': '1',
  'is_gender_undefined': '1',
  'latitude': 'undef',
  'longitude': 'undef',
  'phone_2_norm': '20'}}

In [46]:
import json

In [47]:
# Serialise the verification order to a JSON string.
json.dumps(order)

'{"config": {"profile": "xgb_3-80-035_2021-01-26"}, "data": {"amount": 158.85, "bank_currency": "840", "bin": "510932", "day_of_week": "2", "hour": "00", "is_city_resolved": "1", "is_gender_undefined": "1", "latitude": "undef", "longitude": "undef", "phone_2_norm": "20"}}'