In [1]:
# Environment report: import the core dependencies and print the version of
# each one so the saved notebook output records what this run was executed with.
import sys
print('python version:', sys.version_info)
import os,inspect

import pandas as pd
print('pandas version: ', pd.__version__)

import numpy as np
print('numpy version: ', np.__version__)

import sklearn
print('sklearn version: ', sklearn.__version__)

import xgboost as xgb
print('xgboost version: ', xgb.__version__)

import joblib
print('joblib version: ', joblib.__version__)

python version: sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
pandas version:  1.0.1
numpy version:  1.19.2
sklearn version:  0.24.1
xgboost version:  1.2.0
joblib version:  1.0.0


In [2]:
def _release(version, width=3):
    """Return the leading numeric components of a version string as a tuple.

    Each dot-separated chunk contributes its leading digits, so '1.0.1' gives
    (1, 0, 1) and a suffixed chunk such as '0rc1' contributes 0.  At most
    `width` components are kept.

    Comparing these tuples is numeric.  Comparing the raw strings is
    lexicographic, which ranks '1.10' below '1.2' and '1.19.10' below '1.19.2'.
    """
    release = []
    for chunk in str(version).split('.')[:width]:
        digits = ''
        for char in chunk:
            if not char.isdigit():
                break
            digits += char
        release.append(int(digits or 0))
    return tuple(release)


# Fail early when the environment differs from the release ranges below.
assert (3, 7) <= sys.version_info < (3, 8)
assert (0, 24) <= _release(sklearn.__version__) < (0, 25)
assert (1, 0, 1) <= _release(pd.__version__) < (1, 2)
# This ceiling used to be the string "1.2"; under string ordering that admitted
# 1.19.x and rejected 1.20.x, i.e. it acted as a 1.20 ceiling. Spelled out here.
assert (1, 19, 2) <= _release(np.__version__) < (1, 20)
assert (1, 2, 0) <= _release(xgb.__version__) < (1, 3, 0)
assert (1, 0, 0) <= _release(joblib.__version__) < (1, 3, 0)

In [3]:
# Put the directory one level above the current code location at the front of
# sys.path (presumably so project-local modules can be imported — verify).
this_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(this_file))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# pandas display limits applied when frames are rendered.
for option, limit in (
    ('display.max_colwidth', 100),
    ('display.max_rows', 20),
    ('display.max_columns', 500),
):
    pd.set_option(option, limit)

In [4]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction

In [5]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
# NOTE(review): the extension is loaded after `utils` and `analyzer` were
# imported in the previous cell — confirm edits to them are still picked up.
%load_ext autoreload
%autoreload 2

In [6]:
# kyw3
# Every CSV is read with dtype=str, so no column is type-inferred at load time.
# NOTE(review): path_data is not referenced by any of the visible cells.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'

cp1251_text = {'dtype': str, 'encoding': 'cp1251'}
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, **cp1251_text)
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, **cp1251_text)
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [7]:
# Names of the model input columns, kept in alphabetical order.
COL_FACTORS = sorted([
    'bin', 'amount', 'bank_currency',
    'hour', 'day_of_week',
    'longitude', 'latitude',
    'phone_2_norm', 'is_gender_undefined', 'is_city_resolved',
])
COL_FACTORS

['amount',
 'bank_currency',
 'bin',
 'day_of_week',
 'hour',
 'is_city_resolved',
 'is_gender_undefined',
 'latitude',
 'longitude',
 'phone_2_norm']

In [8]:
# Keep only the factor columns, in COL_FACTORS order, from each sample.
train, test = db_teach[COL_FACTORS], db_test[COL_FACTORS]

In [9]:
def _as_numbers(frame):
    """Run pd.to_numeric over every column; unparseable values become NaN."""
    return frame.apply(pd.to_numeric, errors="coerce")


train = _as_numbers(train)
test = _as_numbers(test)

## Show rows and columns with NaN values

In [10]:
# First training rows that contain at least one missing value.
mask = train.isna().any(axis="columns")
train.loc[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
218,3.8,76,535016,3,20,0,1,,,19
368,7.13,76,535016,3,22,0,1,,,19
393,11.88,76,535016,3,22,0,1,,,19
398,71.26,76,544731,3,22,0,1,,,19
407,3.56,76,535016,3,23,0,1,,,19


In [11]:
# First test rows that contain at least one missing value.
mask = test.isna().any(axis="columns")
test.loc[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
247,76.92,124,543440,2,2,0,0,,,13
703,3.77,76,534543,2,6,0,1,,,19
1050,3.77,76,534543,2,12,0,1,,,19
1088,3.77,76,534543,2,13,0,1,,,19
1390,9.42,76,515590,2,17,0,1,,,19


In [12]:
# Names of the test columns that contain at least one missing value.
test.columns[test.isna().any(axis="index")]

Index(['latitude', 'longitude'], dtype='object')

In [13]:
# Fill values for missing data: the training-sample mean for each coordinate,
# plus a -999 entry under the 'default' key.
lat_replace = train['latitude'].mean()
long_replace = train['longitude'].mean()

replaced_values = dict(latitude=lat_replace, longitude=long_replace, default=-999)

In [14]:
# Fill missing values column by column. A column uses its own entry from
# replaced_values when it has one, otherwise the 'default' entry. The same
# value is applied to both the train and the test frame.
default_fill = replaced_values.get('default')
for col in COL_FACTORS:
    replaced_val = replaced_values.get(col)
    # Test for None explicitly instead of `value or default`: a legitimate
    # fill value of 0 / 0.0 is falsy and would be swapped for the default.
    if replaced_val is None:
        replaced_val = default_fill
    print(replaced_val)
    train[col] = train[col].fillna(replaced_val)
    test[col] = test[col].fillna(replaced_val)

-999
-999
-999
-999
-999
-999
-999
36.90237577890762
-92.53325861542274
-999


In [15]:
# Re-check the training frame for rows that still contain a missing value.
mask = train.isna().any(axis="columns")
train.loc[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [16]:
# Re-check the test frame for rows that still contain a missing value.
mask = test.isna().any(axis="columns")
test.loc[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [17]:
# The target is the `status` column of db_teach; the feature frames are
# replaced by their underlying NumPy arrays.
label = db_teach['status']
train, test = train.values, test.values

In [18]:
# Project helper built from the teach sample, the test sample and the white list.
analyzer_prediction = AnalyzerPrediction(db_teach, db_test, white)

In [19]:
# Sample weights for fitting come from the analyzer; the results table starts
# out empty and is built up by later get_table_prediction calls.
weight = analyzer_prediction.get_xgb_weight()
result_df_amount = None

## Create model 

In [20]:
# Hyper-parameters of the boosted-tree classifier.
max_depth, eta, nround = 3, 0.35, 80
config = dict(max_depth=max_depth, learning_rate=eta, n_estimators=nround)

model = xgb.XGBClassifier(**config)
# NOTE(review): no eval_set is passed to fit, so eval_metric has no evaluation
# data to be computed on here — confirm whether it is needed.
model.fit(train, label, eval_metric='auc', sample_weight=weight)

# The second column of predict_proba is stored on db_test as "probability".
# Presumably get_table_prediction reads it through the db_test object the
# analyzer was constructed with — verify against AnalyzerPrediction.
test_pred = model.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

# Row label in the results table: "<max_depth>-<nround>-<eta>".
description = '-'.join(map(str, (max_depth, nround, eta)))
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric="amount",
)

In [21]:
# Show all rows, restricted to the first 11 columns of the results table.
result_df_amount.iloc[:, 0:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
0,3-80-0.35,7.65,14.72,19.92,20.12,23.02,23.79,26.93,30.95,42.68,159.06
1,threshold amount,0.829273,0.793692,0.775948,0.757474,0.741971,0.730823,0.719396,0.688381,0.615996,159.06


## Save model

In [22]:
# Bundle the fitted model with the factor list and fill values it was trained
# with, so the preprocessing inputs are saved alongside the model.
conf_model = {
    'profile': model,
    'algorithm_name': 'xgboost',
    'factor_list': COL_FACTORS,
    'replaced_values': replaced_values,
}

In [23]:
from datetime import date

# Current date as a YYYY-MM-DD string.
today = date.today().isoformat()
today

'2021-01-22'

In [24]:
import re

# File name "xgb_kyw3_<description>_<today>" with every "." removed.
name_parts = ('xgb', 'kyw3', description, today)
file_name = re.sub("[.]", "", '_'.join(map(str, name_parts)))

In [25]:
# Persist the config bundle; file_name has no directory part, so the file is
# written relative to the current working directory.
joblib.dump(value=conf_model, filename=file_name)

['xgb_kyw3_3-80-035_2021-01-22']

## Load model

In [26]:
# Read the bundle back from disk. joblib.load unpickles the file, so it must
# only be used on files from a trusted source.
m_config = joblib.load(filename=file_name)

In [27]:
# Index the bundle directly: a missing 'profile' key raises KeyError here,
# instead of yielding None and failing later as an AttributeError on
# m1.predict_proba.
m1 = m_config['profile']

In [28]:
# Score the test array with the reloaded model and overwrite the
# "probability" column on db_test with its second predict_proba column.
test_pred = m1.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

# NOTE(review): description is the same string used for the freshly trained
# model, and the existing table is passed back in, so rows added by this call
# carry the same label as the earlier ones.
description = '-'.join(map(str, (max_depth, nround, eta)))
result_df_amount = analyzer_prediction.get_table_prediction(
    description=description,
    result_df=result_df_amount,
    metric="amount",
)

In [29]:
# Display the full results table after the reloaded model has been scored.
result_df_amount

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl,amount_test_bad_in_wl,n_teach,n_teach_bad,n_test,n_test_bad,amount_test_bad,amount_test
0,3-80-0.35,7.65,14.72,19.92,20.12,23.02,23.79,26.93,30.95,42.68,159.06,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755,120400.38,4989361.0
1,threshold amount,0.829273,0.793692,0.775948,0.757474,0.741971,0.730823,0.719396,0.688381,0.615996,159.06,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755,120400.38,4989361.0
2,3-80-0.35,7.65,14.72,19.92,20.12,23.02,23.79,26.93,30.95,42.68,159.06,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755,120400.38,4989361.0
3,threshold amount,0.829273,0.793692,0.775948,0.757474,0.741971,0.730823,0.719396,0.688381,0.615996,159.06,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755,120400.38,4989361.0


In [30]:
# List the keys present in the reloaded config bundle.
m_config.keys()

dict_keys(['profile', 'algorithm_name', 'factor_list', 'replaced_values'])

In [31]:
# Show the fill values stored in the reloaded bundle under 'replaced_values'.
m_config.get('replaced_values')

{'latitude': 36.90237577890762,
 'longitude': -92.53325861542274,
 'default': -999}