In [1]:
import sys
print('python version:', sys.version_info)
import os,inspect

import pandas as pd
print('pandas version: ', pd.__version__)

import numpy as np
print('numpy version: ', np.__version__)

import sklearn
print('sklearn version: ', sklearn.__version__)

import catboost
print('catboost version: ', catboost.__version__)
from catboost import CatBoostClassifier

import joblib
print('joblib version: ', joblib.__version__)

python version: sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
pandas version:  1.0.1
numpy version:  1.19.2
sklearn version:  0.24.1
catboost version:  0.24.3
joblib version:  1.0.0


In [2]:
assert sys.version_info >= (3, 7) and sys.version_info < (3, 8)
assert sklearn.__version__ >= "0.24" and sklearn.__version__ < "0.25"
assert pd.__version__ >= "1.0.1" and pd.__version__ < "1.2"
assert np.__version__ >= "1.19.2" and np.__version__ < "1.2"
assert catboost.__version__ >= "0.24.0" and catboost.__version__ < "0.25"
assert joblib.__version__ >= "1.0.0" and joblib.__version__ < "1.3.0"

In [3]:
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

In [4]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction

In [5]:
# for autoreload modules
%load_ext autoreload
%autoreload 2

In [6]:
# kyw3
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, dtype=str, encoding='cp1251')
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, dtype=str, encoding='cp1251')
white = pd.read_csv(UtilsKy.WHITE_KYW3 , dtype=str)

In [7]:
COL_FACTORS = ['bin', 'amount', 'bank_currency', 'hour', 'day_of_week', 'longitude', 'latitude', 'phone_2_norm', 
               'is_gender_undefined', 'is_city_resolved']
COL_FACTORS = sorted(COL_FACTORS)
COL_FACTORS

['amount',
 'bank_currency',
 'bin',
 'day_of_week',
 'hour',
 'is_city_resolved',
 'is_gender_undefined',
 'latitude',
 'longitude',
 'phone_2_norm']

In [8]:
CAT_FEATURES =  ['hour', 'day_of_week', 'bank_currency']

In [9]:
def Diff(li1, li2):
     return (list(list(set(li1)-set(li2)) + list(set(li2)-set(li1))))

In [10]:
NUMERIC_FEATURES = Diff(COL_FACTORS, CAT_FEATURES)
NUMERIC_FEATURES

['latitude',
 'is_gender_undefined',
 'amount',
 'bin',
 'is_city_resolved',
 'longitude',
 'phone_2_norm']

In [51]:
train = db_teach[COL_FACTORS].copy()
test = db_test[COL_FACTORS].copy()

In [52]:
train.loc[:, NUMERIC_FEATURES] = train.loc[:,NUMERIC_FEATURES].apply(pd.to_numeric, errors="coerce")
test.loc[:, NUMERIC_FEATURES] = test.loc[:,NUMERIC_FEATURES].apply(pd.to_numeric, errors="coerce")

## Show column with NaN values

In [53]:
mask = train.isnull().any(axis=1)
train[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
218,3.8,76,535016,3,20,0,1,,,19
368,7.13,76,535016,3,22,0,1,,,19
393,11.88,76,535016,3,22,0,1,,,19
398,71.26,76,544731,3,22,0,1,,,19
407,3.56,76,535016,3,23,0,1,,,19


In [54]:
mask = test.isnull().any(axis=1)
test[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
247,76.92,124,543440,2,2,0,0,,,13
703,3.77,76,534543,2,6,0,1,,,19
1050,3.77,76,534543,2,12,0,1,,,19
1088,3.77,76,534543,2,13,0,1,,,19
1390,9.42,76,515590,2,17,0,1,,,19


In [55]:
test.columns[test.isnull().any(axis=0)]

Index(['latitude', 'longitude'], dtype='object')

In [56]:
lat_replace = train.latitude.mean()
long_replace = train.longitude.mean()

replaced_values = {'latitude': lat_replace, 'longitude': long_replace, 'default': -999}

In [57]:
for col in NUMERIC_FEATURES:
    replaced_val = replaced_values.get(col) or replaced_values.get('default')
    #print(replaced_val)
    train[col] = train[col].fillna(replaced_val)
    test[col] = test[col].fillna(replaced_val)

In [58]:
mask = train.isnull().any(axis=1)
train[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [59]:
mask = test.isnull().any(axis=1)
test[mask].head()

Unnamed: 0,amount,bank_currency,bin,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [60]:
seed = 45
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train, db_teach.status, train_size=0.7, random_state=seed)

In [61]:
teach_for_analyzer = X_train.copy()
teach_for_analyzer['status'] = y_train

In [62]:
analyzer_prediction =  AnalyzerPrediction(teach_for_analyzer, db_test, white)

In [63]:
result_df_amount = None
weight = analyzer_prediction.get_xgb_weight()

## Create model 

In [78]:
max_depth = 3
eta = 0.15
nround = 75
config = {'max_depth': max_depth, 'learning_rate': eta
          , 'n_estimators': nround
         }            
model = CatBoostClassifier(**config)  
model.fit(
    X_train, y_train
    ,cat_features=CAT_FEATURES
    ,eval_set=(X_validation, y_validation)
    ,verbose=False
)

test_pred = model.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

description = '-' . join([str(elem) for elem in (max_depth, nround, eta)])   
description +=  '_seed_{}' .format(seed)
result_df_amount = analyzer_prediction.get_table_prediction(description=description, result_df=result_df_amount, metric="amount")

In [73]:
result_df_amount.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
0,3-75-0.15_random_state_ww_45,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12
1,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12
2,3-75-0.15_seed45,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12
3,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12


## Save model

In [79]:
conf_model = {'profile': model, 'algorithm_name': 'catboost', 'factor_list': COL_FACTORS, 'replaced_values': replaced_values, 
             'numeric_factors': NUMERIC_FEATURES}

In [80]:
from datetime import date
today = date.today()
today = today.strftime("%Y-%m-%d")
today

'2021-01-22'

In [81]:
import re 
file_name =  '_' . join([str(elem) for elem in ('cat', 'kyw3',  description, today)]) 
file_name  = re.sub("[.]", "", file_name)

In [82]:
joblib.dump(conf_model, file_name)

['cat_kyw3_3-75-015_seed_45_2021-01-22']

## Load model

In [83]:
m_config = joblib.load(file_name)

In [84]:
m1 = m_config.get('profile')

In [85]:
test_pred = m1.predict_proba(test)
db_test["probability"] = test_pred[:, 1]

description = '-' . join([str(elem) for elem in (max_depth, nround, eta)])                    
result_df_amount = analyzer_prediction.get_table_prediction(description=description, result_df=result_df_amount, metric="amount")

In [86]:
result_df_amount

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl,amount_test_bad_in_wl,n_teach,n_teach_bad,n_test,n_test_bad,amount_test_bad,amount_test
0,3-75-0.15_random_state_ww_45,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
1,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
2,3-75-0.15_seed45,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
3,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
4,3-75-0.15_seed_45,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
5,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
6,3-75-0.15,7.45,12.77,15.13,17.97,20.07,21.37,24.17,35.74,53.99,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0
7,threshold amount,0.054274,0.045433,0.040989,0.037476,0.035343,0.033515,0.032292,0.029455,0.024407,163.12,1019125,22992,34,1640236.53,6191.08,303397,4303,58862,755,120400.38,4989361.0


In [87]:
m_config.keys()

dict_keys(['profile', 'algorithm_name', 'factor_list', 'replaced_values', 'numeric_factors'])

In [88]:
m_config.get('replaced_values')

{'latitude': 36.90237577890762,
 'longitude': -92.53325861542274,
 'default': -999}