In [1]:
# https://catboost.ai/
# https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/pydata_moscow_oct_13_2018.ipynb

# run in the command line
# jupyter nbextension enable --py widgetsnbextension
# model.fit( ...
#      plot=True
# )

In [1]:
import os,sys,inspect
# Put the parent of the directory this code is resolved to at the front of
# sys.path, so modules located one level up can be imported.
_frame_source = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(_frame_source))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [2]:
from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction
from factors import Factor

In [3]:
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
from catboost import CatBoostClassifier

In [4]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project's .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# kyw3 data set.
# NOTE(review): path_data is not referenced by any other cell visible here —
# confirm it is still needed.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
# Every column is read as text (dtype=str); the numeric columns are converted
# in a later cell.
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, dtype=str, encoding='cp1251')
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, dtype=str, encoding='cp1251')
# NOTE(review): unlike the two reads above, no encoding is passed here, so the
# pandas default applies — confirm this file is not cp1251 as well.
white = pd.read_csv(UtilsKy.WHITE_KYW3 , dtype=str)

In [6]:
# Columns selected from the teach/test tables ('status' is split off as the
# target further down).
COL_FACTORS = [
    'status', 'bin', 'amount', 'bank_currency', 'hour', 'day_of_week',
    'phone_2_norm', 'longitude', 'latitude',
    # flag columns
    'is_gender_undefined', 'is_city_resolved',
]

In [7]:
def Diff(li1, li2):
    """Return the symmetric difference of two iterables as a list.

    The result holds every item that occurs in exactly one of the inputs:
    first the items found only in ``li1`` (in their ``li1`` order), then the
    items found only in ``li2`` (in their ``li2`` order). Each item appears
    once, even if it is repeated in its input. Items must be hashable.
    """
    # Materialize once so one-shot iterables (generators) are handled too.
    first, second = list(li1), list(li2)
    in_first, in_second = set(first), set(second)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # no longer depends on set iteration order (which can differ between runs
    # for strings).
    only_first = [item for item in dict.fromkeys(first) if item not in in_second]
    only_second = [item for item in dict.fromkeys(second) if item not in in_first]
    return only_first + only_second

In [8]:
# Columns passed to CatBoost as categorical features.
cat_features = ['hour', 'day_of_week', 'bank_currency']
# NOTE(review): this is the same list object as COL_FACTORS (no copy), and it
# still contains the 'status' column.
feature_names = COL_FACTORS
# Every COL_FACTORS column that is not categorical.
# NOTE(review): this also contains 'status', which is later used as the target —
# confirm that the numeric coercion and the sentinel fill applied to these
# columns are intended for it.
numeric_features = Diff(COL_FACTORS, cat_features)

In [9]:
# Work on copies restricted to the selected columns, so the later conversions
# do not touch db_teach / db_test.
train_df, test_df = (table[COL_FACTORS].copy() for table in (db_teach, db_test))

In [10]:
# Convert the numeric columns from text to numbers; values that cannot be
# parsed become NaN (errors="coerce").
for _frame in (train_df, test_df):
    _frame[numeric_features] = _frame[numeric_features].apply(pd.to_numeric, errors="coerce")

In [11]:
# Sentinel written in place of missing values in the numeric columns.
replace_val = -9999
for _frame in (train_df, test_df):
    _frame[numeric_features] = _frame[numeric_features].fillna(replace_val)
replace_val

-9999

In [12]:
# The target is the 'status' column; the features are all remaining columns.
y = train_df['status']
X = train_df.drop(columns=['status'])

In [13]:
from sklearn.model_selection import train_test_split

# 70 % of the rows go to training, the rest to validation; the fixed
# random_state keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=1234,
)

In [15]:
# Optional alternative split (disabled by default). When enabled it overwrites
# the random train/validation split with the result of the project helper
# Statistic.train_test_split_with_diff_ids.
split_diff_ids = False
if split_diff_ids:
    # The helper is called with an 'id' column present; it is added only for the call.
    train_df['id'] = db_teach.id    
    X_train, X_validation, y_train, y_validation = Statistic.train_test_split_with_diff_ids(train_df, 
                                                                                            test_has_unique_ids=True)
    # Remove the temporary column again so 'id' is not left in train_df.
    train_df.drop(columns=['id'], inplace=True)

In [16]:
# Test-set features: the selected columns of the test table without 'status'.
X_test = test_df.drop(columns=['status'])

In [17]:
# Project helper used to evaluate predictions. It is constructed with db_test
# itself; the training loop later writes a 'probability' column into that
# frame — presumably that is how the analyzer sees each model's output
# (verify in AnalyzerPrediction).
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

In [22]:
# Accumulator for the results table: each grid-search iteration passes it to
# get_table_prediction and stores the returned value back into it.
result_df_amount = None
# NOTE(review): weight is not used by any cell visible here — confirm it is
# still needed.
weight = analyzer_prediction.get_xgb_weight()

In [19]:
# Attach an extra attribute to the model. The assignment is made on the class,
# so every CatBoostClassifier instance sees it. Outside a class body the
# leading double underscore is not name-mangled, so the attribute name is
# literally `__algorithm_name`.
CatBoostClassifier.__algorithm_name = 'catboost'

In [20]:
# Hyper-parameter grids iterated by the training loop.
#
# Grid previously used for Ky9 (kept for reference only; these values used to
# be assigned here and were immediately overwritten by the Kyw3 grid):
#   max_depths = [5]; nrounds = [70, 90]; etas = [0.35, 0.4]

# Kyw3 catboost-best-params
# 3-80-0.1 + valid set -  rating 160.31
# 3-90-0.1 + valid set -  rating 154.40

# Kyw3 catboost
max_depths = [3, 5]
nrounds = [80, 90]
etas = [0.2, 0.1, 0.35]

In [23]:
# Grid search: train one CatBoost model per (nround, eta, max_depth)
# combination, score the test set and append the result to result_df_amount.
for nround in nrounds:
    for eta in etas:
        for max_depth in max_depths:
            config = dict(max_depth=max_depth, learning_rate=eta, n_estimators=nround)
            model = CatBoostClassifier(**config)

            # Fit on the training split; the validation split is passed as eval_set.
            # (Pass plot=True as well to get the interactive training chart.)
            model.fit(
                X_train,
                y_train,
                cat_features=cat_features,
                eval_set=(X_validation, y_validation),
                verbose=False,
            )

            # The second column (index 1) of predict_proba is stored on db_test,
            # the frame analyzer_prediction was constructed with.
            test_pred = model.predict_proba(X_test)
            db_test["probability"] = test_pred[:, 1]

            grid_point = (max_depth, nround, eta)
            description = '-'.join(map(str, grid_point)) + ' + valid set'
            result_df_amount = analyzer_prediction.get_table_prediction(
                description=description,
                result_df=result_df_amount,
                metric="amount",
            )
            

In [24]:
# Keep every second row of the results table (positions 0, 2, 4, ...).
# NOTE(review): presumably get_table_prediction adds two rows per model and
# only the first of each pair is wanted here — confirm.
stat_best = result_df_amount.iloc[::2].copy()

# Convert the p_* columns to float. The columns are replaced with
# `stat_best[cols] = ...` instead of `stat_best.loc[:, cols] = ...`: recent
# pandas versions treat the .loc form as an in-place write that keeps the
# existing column dtype, so the conversion would not stick.
col_names = [col for col in stat_best.columns if col.startswith('p_')]
if col_names:
    stat_best[col_names] = stat_best[col_names].astype(float)

# Best configurations first.
# NOTE(review): 'rating' is not among the converted columns; if it is stored as
# text this sort is lexicographic — confirm its dtype.
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [25]:
# Show the first 11 columns of the ranked table (description, the p_* columns
# and rating in the recorded output).
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
6,5-80-0.1 + valid set,8.19,10.97,14.46,16.69,23.52,24.63,29.07,33.77,53.38,160.98
0,3-80-0.2 + valid set,5.69,10.62,16.13,18.53,18.92,22.15,24.05,33.95,53.91,157.75
4,3-80-0.1 + valid set,6.75,12.59,13.76,17.08,21.97,24.31,26.5,33.41,50.73,156.29
16,3-90-0.1 + valid set,6.85,12.43,13.78,16.62,20.97,25.07,26.12,33.26,52.23,156.14
18,5-90-0.1 + valid set,7.23,10.28,13.17,15.55,21.86,26.4,29.44,34.24,52.69,155.02
8,3-80-0.35 + valid set,6.16,13.28,15.54,17.62,21.09,23.31,26.79,30.83,49.88,154.4
12,3-90-0.2 + valid set,4.68,10.28,13.22,17.71,18.94,22.15,23.98,32.53,52.13,149.49
20,3-90-0.35 + valid set,5.84,11.48,15.47,16.83,20.72,24.08,26.91,29.69,49.18,149.21
2,5-80-0.2 + valid set,5.91,9.12,13.83,16.38,19.42,20.79,23.74,31.79,49.02,145.47
10,5-80-0.35 + valid set,6.11,10.04,14.48,18.29,20.87,22.08,23.14,26.19,44.59,140.57


In [54]:
# Example: saving and loading a model
import joblib 

In [55]:
# Destination file for the serialized model.
# NOTE(review): hard-coded absolute path — consider deriving it from a
# configurable base directory.
path_model = "/mnt/files/workdata/work/python-scripts/prediction_analyzer/cat_boost/model-catboost"

In [56]:
# Set the attribute on this single instance; the class-level value assigned
# earlier in the notebook is 'catboost'.
model.__algorithm_name = 'catboost_2'

In [57]:
# Serialize `model` (the last model left by the grid-search loop) to path_model.
joblib.dump(model, path_model)

['/mnt/files/workdata/work/python-scripts/prediction_analyzer/cat_boost/model-catboost']

In [58]:
# Read the model back from the same file.
# joblib.load unpickles the file, so only load files from a trusted source.
model_load = joblib.load(path_model)

In [59]:
# The recorded output is 'catboost' (the class-level value), not the
# 'catboost_2' set on the instance before dumping: the instance attribute did
# not survive the dump/load round trip.
model_load.__algorithm_name 

'catboost'

In [95]:
# Output kept from a different run (execution count is out of sequence);
# presumably retained for comparison with the current results.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
4,3-80-0.1 + valid set,6.03,12.5,14.97,20.55,24.82,25.96,29.69,32.54,48.9,160.31
12,3-90-0.1 + valid set,4.9,9.75,12.11,18.65,25.63,26.49,29.59,33.7,49.66,154.4
10,5-90-0.2 + valid set,5.42,9.89,15.5,19.06,21.56,23.46,26.01,31.85,44.31,147.59
2,5-80-0.2 + valid set,5.07,10.3,14.78,17.34,21.08,25.84,27.63,32.55,45.13,146.25
14,5-90-0.1 + valid set,5.98,9.08,10.4,15.39,20.85,23.55,27.34,35.16,47.31,144.17
6,5-80-0.1 + valid set,4.82,9.08,12.39,16.22,19.21,23.58,27.35,35.29,46.29,143.3
0,3-80-0.2 + valid set,5.0,8.5,13.44,16.1,18.64,21.77,25.13,31.3,49.62,142.6
8,3-90-0.2 + valid set,4.56,8.19,13.01,15.2,19.13,22.47,25.96,30.47,49.76,140.32


In [74]:
# Output kept from a different run (execution count is out of sequence). Some
# descriptions in it end in '- validation set', which the current training
# loop does not produce.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
10,3-80-0.35 + valid set,4.45,7.88,8.63,13.19,15.87,16.45,18.03,26.36,43.95,120.33
14,3-90-0.35 + valid set,4.45,7.7,8.46,13.19,15.87,16.89,18.03,26.1,44.01,119.78
12,3-90-0.2 + valid set,3.97,6.79,10.97,13.27,15.9,17.1,19.0,22.32,42.51,115.73
24,3-90-0.2 + valid set,3.97,6.79,10.97,13.27,15.9,17.1,19.0,22.32,42.51,115.73
8,3-80-0.2 + valid set,3.97,6.79,10.97,13.41,15.65,16.84,19.3,22.32,41.87,114.98
16,3-80-0.2 + valid set,3.97,6.79,10.97,13.41,15.65,16.84,19.3,22.32,41.87,114.98
18,5-80-0.2 + valid set,4.59,6.12,9.55,11.54,13.27,16.43,19.23,25.09,44.21,114.37
26,5-90-0.2 + valid set,4.06,5.59,8.97,10.66,12.43,14.29,18.84,28.48,44.13,114.32
4,5-90-0.35 - validation set,4.72,5.72,6.31,12.78,14.25,16.61,21.12,23.13,44.82,111.73
22,5-80-0.1 + valid set,3.49,6.04,9.18,10.0,16.28,18.02,21.06,25.3,41.39,111.68


In [55]:
# Output kept from a different run (execution count is out of sequence). Its
# descriptions ('encode hour', 'is gender undef, ...', '- validation set') are
# not produced by the current training loop.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
40,5-80-0.2 - validation set,4.26,9.09,13.6,17.87,19.17,20.16,22.18,27.12,42.77,133.88
6,3-90-0.35encode hour,5.67,9.25,11.76,13.42,18.16,21.09,24.33,30.48,45.09,133.83
14,"3-90-0.35is gender undef, is city resolver",5.67,9.25,11.76,13.42,18.16,21.09,24.33,30.48,45.09,133.83
4,3-90-0.2encode hour,6.11,9.0,10.06,12.67,15.66,18.32,24.13,33.95,46.24,133.69
12,"3-90-0.2is gender undef, is city resolver",6.11,9.0,10.06,12.67,15.66,18.32,24.13,33.95,46.24,133.69
30,3-90-0.35 - validation set,3.73,9.72,11.95,13.68,16.85,18.33,18.86,29.61,45.3,130.84
22,3-90-0.35 - validation set,3.73,9.72,11.95,13.68,16.85,18.33,18.86,29.61,45.3,130.84
38,3-90-0.35 - validation set,3.73,9.72,11.95,13.68,16.85,18.33,18.86,29.61,45.3,130.84
50,5-70-0.4 - validation set,6.14,10.5,11.95,13.37,16.34,17.49,18.76,28.45,42.83,129.58
10,"3-80-0.35is gender undef, is city resolver",3.91,7.38,10.44,13.75,17.28,21.54,24.61,30.15,46.67,129.58


In [45]:
# Output kept from a different run (execution count is out of sequence). Its
# descriptions ('encode hour', 'is gender undef, ...', '- validation set') are
# not produced by the current training loop.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
6,3-90-0.35encode hour,5.67,9.25,11.76,13.42,18.16,21.09,24.33,30.48,45.09,133.83
14,"3-90-0.35is gender undef, is city resolver",5.67,9.25,11.76,13.42,18.16,21.09,24.33,30.48,45.09,133.83
4,3-90-0.2encode hour,6.11,9.0,10.06,12.67,15.66,18.32,24.13,33.95,46.24,133.69
12,"3-90-0.2is gender undef, is city resolver",6.11,9.0,10.06,12.67,15.66,18.32,24.13,33.95,46.24,133.69
22,3-90-0.35 - validation set,3.73,9.72,11.95,13.68,16.85,18.33,18.86,29.61,45.3,130.84
2,3-80-0.35encode hour,3.91,7.38,10.44,13.75,17.28,21.54,24.61,30.15,46.67,129.58
10,"3-80-0.35is gender undef, is city resolver",3.91,7.38,10.44,13.75,17.28,21.54,24.61,30.15,46.67,129.58
20,3-90-0.2 - validation set,5.8,7.32,10.99,13.53,18.92,20.96,24.02,28.35,44.56,129.47
18,3-80-0.35 - validation set,3.48,9.09,12.03,13.87,16.42,16.77,19.64,28.1,45.29,128.28
0,3-80-0.2encode hour,6.28,8.47,11.23,12.07,12.49,17.0,24.04,31.59,45.21,127.34
