In [1]:
import os,sys,inspect
# Put the parent of the notebook's folder at the front of sys.path so that
# modules living one level up can be imported by the cells below.
#
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a kernel the current frame
# belongs to a synthetic per-cell "file" (a pseudo-name or a temp path,
# depending on the IPython/ipykernel version), so its dirname is not reliably
# the notebook's folder.
# NOTE(review): assumes the kernel was started in the notebook's directory
# (the Jupyter default) — confirm if the kernel is launched from elsewhere.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep parentdir first on the path, but do not stack a duplicate entry every
# time the cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

In [2]:
from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction
from factors import Factor

In [3]:
import pandas as pd
import os
import numpy as np
# Print numpy floats with 4 digits of precision for the rest of the session.
np.set_printoptions(precision=4)
from catboost import CatBoostClassifier

In [4]:
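# Auto-reload imported modules before each cell runs, so edits to the project's .py files take effect without restarting the kernel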
%load_ext autoreload
%autoreload 2

In [5]:
# kyw3: load the teach/test datasets and the white list.
# Every column is read as a string; the teach/test files are cp1251-encoded.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
_cp1251_csv_options = dict(dtype=str, encoding='cp1251')
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, **_cp1251_csv_options)
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, **_cp1251_csv_options)
# The white list is read with pandas' default encoding.
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [6]:
# Columns selected from the raw data for modelling. 'status' is the target:
# it is dropped from the feature matrices before training.
COL_FACTORS = [
    'status',
    'bin',
    'amount',
    'bank_currency',
    'hour',
    'day_of_week',
    'phone_2_norm',
    'longitude',
    'latitude',
    'is_gender_undefined',
    'is_city_resolved',
]

In [8]:
# Columns handed to CatBoost as categorical features (passed as
# cat_features= when the models are fitted).
CAT_FEATURES = [
    'hour',
    'day_of_week',
    'bank_currency',
]

In [9]:
# For Catboost: build the train/test frames through the project's DataHelper.
from helper import DataHelper

datahelper = DataHelper(db_teach, db_test, COL_FACTORS, CAT_FEATURES)
datahelper.create_train_test()
datahelper.show_columns_with_na()

# Missing coordinates are replaced by the corresponding mean value; the
# 'default' entry is presumably the fill value for every other column
# (TODO confirm against DataHelper.replaced_na_values).
mean_values = datahelper.get_mean_value()
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)

train, test = datahelper.get_train_test()

train na columns : Index(['latitude', 'longitude'], dtype='object')
test na columns : Index(['latitude', 'longitude'], dtype='object')
-999
36.90237577890762
-999
-92.53325861542274
-999
-999
-999
-999


In [7]:
def Diff(li1, li2):
    """Return the symmetric difference of two iterables as a list.

    The result holds the items that occur in exactly one of the inputs:
    first those found only in ``li1``, then those found only in ``li2``.
    Within each group the items keep their first-occurrence order and
    duplicates are removed, so the output is deterministic across runs
    (plain set arithmetic yields an arbitrary order for strings).

    Parameters
    ----------
    li1, li2 : iterable of hashable items

    Returns
    -------
    list
        Items unique to ``li1`` followed by items unique to ``li2``.
    """
    in_first, in_second = set(li1), set(li2)
    # dict.fromkeys de-duplicates while preserving insertion order.
    only_first = [item for item in dict.fromkeys(li1) if item not in in_second]
    only_second = [item for item in dict.fromkeys(li2) if item not in in_first]
    return only_first + only_second

In [8]:
# Split the model columns into categorical and numeric groups.
# cat_features is a copy of CAT_FEATURES so the two lists cannot drift apart.
cat_features = list(CAT_FEATURES)
feature_names = COL_FACTORS
# Numeric columns are the COL_FACTORS entries that are not categorical, kept in
# COL_FACTORS order. A plain filter is used rather than a symmetric difference
# so that a categorical name missing from COL_FACTORS can never end up here.
numeric_features = [col for col in COL_FACTORS if col not in cat_features]

In [9]:
# Independent copies of the modelling columns, so the manual preprocessing
# in the following cells does not touch db_teach / db_test.
train_df = db_teach.loc[:, COL_FACTORS].copy()
test_df = db_test.loc[:, COL_FACTORS].copy()

In [10]:
# Convert the numeric columns of both frames from strings to numbers;
# values that cannot be parsed become NaN (errors="coerce").
for _frame in (train_df, test_df):
    _frame[numeric_features] = _frame[numeric_features].apply(pd.to_numeric, errors="coerce")

In [11]:
# Sentinel written into every missing numeric value of both frames.
replace_val = -9999
for _frame in (train_df, test_df):
    _frame[numeric_features] = _frame[numeric_features].fillna(replace_val)
# Echo the sentinel as the cell output.
replace_val

-9999

In [11]:
# Separate the target column from the feature matrix of the DataHelper frame.
y = train['status']
X = train.drop(columns='status')

In [12]:
from sklearn.model_selection import train_test_split
# Hold out a validation part: 70% of the rows go to training, the remainder to
# validation. random_state pins the shuffle so the split is repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=12)

In [27]:
# Optional alternative split (switched off): when enabled it replaces the
# train/validation split above with the project's id-aware splitter.
# The 'id' column is attached to train_df only for the duration of the call.
# NOTE(review): this path splits train_df (the manually preprocessed frame),
# not the DataHelper frame used above — confirm that is intended.
split_diff_ids = False
if split_diff_ids:
    train_df['id'] = db_teach['id']
    X_train, X_validation, y_train, y_validation = Statistic.train_test_split_with_diff_ids(
        train_df,
        test_has_unique_ids=True,
    )
    del train_df['id']

In [13]:
# Feature matrix for the test set: the DataHelper test frame without the target column.
X_test = test.drop(columns=['status'])

In [14]:
# Project helper used in the training loop to build the results table.
# It is given db_test itself; the loop writes a "probability" column into db_test
# before each get_table_prediction call — presumably that is how predictions reach it (TODO confirm).
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

In [15]:
# Accumulator for the results table: starts empty and is passed to, and reassigned
# from, analyzer_prediction.get_table_prediction on every training-loop iteration.
result_df_amount = None
# NOTE(review): `weight` is not referenced in any of the visible cells — confirm it is still needed.
weight = analyzer_prediction.get_xgb_weight()

In [16]:
# Hyper-parameter grid iterated by the training loop.
#
# Earlier Ky9 grid, kept for reference only (it was overwritten by the Kyw3
# values before being used):
#   max_depths = [5], nrounds = [70, 90], etas = [0.35, 0.4]

# Kyw3 catboost-best-params
# 3-80-0.1 + valid set -  rating 160.31
# 3-90-0.1 + valid set -  rating 154.40

# Kyw3 catboost
max_depths = [3, 5]
nrounds = [80, 90]
etas = [0.2, 0.1, 0.35]

In [18]:
# Grid search: fit one CatBoost model per (n_estimators, learning_rate,
# max_depth) combination and append its evaluation to result_df_amount.
# The nesting order determines the row order of the results table.
for nround in nrounds:
    for eta in etas:
        for max_depth in max_depths:
            config = dict(max_depth=max_depth, learning_rate=eta, n_estimators=nround)
            model = CatBoostClassifier(**config)

            # The validation part is supplied as eval_set; training output is
            # silenced (add plot=True to fit() for interactive learning curves).
            model.fit(
                X_train,
                y_train,
                cat_features=CAT_FEATURES,
                eval_set=(X_validation, y_validation),
                verbose=False,
            )

            # Store column 1 of predict_proba (presumably the positive class)
            # on db_test, the frame analyzer_prediction was constructed with.
            test_pred = model.predict_proba(X_test)
            db_test["probability"] = test_pred[:, 1]

            # Label of the form "<max_depth>-<n_estimators>-<learning_rate> + valid set".
            description = f'{max_depth}-{nround}-{eta} + valid set'
            result_df_amount = analyzer_prediction.get_table_prediction(
                description=description,
                result_df=result_df_amount,
                metric="amount",
            )
            # For debugging: model.get_feature_importance(prettified=True)

In [19]:
# Build the ranking table from the accumulated results.
# Only every second row is kept — presumably get_table_prediction appends two
# rows per model and the even ones carry the summary (TODO confirm).
n = result_df_amount.shape[0]
sub_rows = list(range(0, n, 2))
stat_best = result_df_amount.iloc[sub_rows, :].copy()

# Cast the p_* columns to float. astype with a mapping is used instead of
# `stat_best.loc[:, cols] = ...astype(float)`: since pandas 2.0 a .loc
# assignment writes into the existing columns in place and keeps their old
# dtype, so the cast would be silently lost.
col_names = [col for col in stat_best.columns if col.startswith('p_')]
stat_best = stat_best.astype({col: float for col in col_names})

# Best-rated configurations first.
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [35]:
# Show the first 11 columns of the ranking table.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
16,3-90-0.1 + valid set,8.75,12.23,16.95,19.9,22.53,25.69,26.57,33.94,54.06,168.36
4,3-80-0.1 + valid set,8.75,13.11,15.77,19.51,23.1,25.3,26.8,34.16,53.74,168.14
6,5-80-0.1 + valid set,8.89,11.98,15.09,17.85,19.55,20.46,24.03,32.02,52.16,157.54
18,5-90-0.1 + valid set,8.56,12.11,14.72,17.85,19.44,21.16,23.43,32.12,51.64,156.44
10,5-80-0.35 + valid set,6.28,11.78,14.35,19.24,22.39,24.33,24.51,27.09,48.79,149.92
12,3-90-0.2 + valid set,6.24,10.76,14.16,14.77,17.62,20.81,21.67,31.0,51.95,146.5
0,3-80-0.2 + valid set,6.32,12.26,14.03,14.98,17.47,18.84,22.01,30.24,49.77,145.07
22,5-90-0.35 + valid set,8.79,9.96,13.0,17.54,19.63,23.69,25.02,26.4,47.81,143.13
2,5-80-0.2 + valid set,4.26,7.86,13.58,15.74,17.5,18.97,22.61,30.87,51.03,140.84
14,5-90-0.2 + valid set,5.12,9.8,14.38,15.83,17.5,19.19,22.54,28.67,48.02,139.32
