In [1]:
from collections import Counter
import pandas as pd
import numpy as np
# Display settings: never truncate cell contents and cap printed rows at 20.
# `None` is the supported "no limit" value for max_colwidth; the legacy `-1`
# was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)

In [2]:
import xgboost as xgb

In [3]:
import os,sys,inspect
# Make the parent of the notebook directory importable so the project modules
# (statistic, utils, analyzer, ...) can be imported below.
# A notebook cell has no real source file: inspect.getfile() on the current
# frame yields a pseudo-path (a temp directory on newer ipykernel), so the
# working directory is used instead.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# directory (the Jupyter default) - confirm for other launch setups.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep the project root first on sys.path without growing the list on re-runs.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

In [4]:
from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction
from decomposition_pca.encode_strategy import Encode

In [5]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
# Mode 2 asks autoreload to reload all modules before each cell execution.
%load_ext autoreload
%autoreload 2

In [6]:
# Kyw3 data set: teach/test samples (cp1251-encoded files) and the white list.
# Every column is read as str; the model inputs are converted to numbers later.
# Former local data path, kept for reference:
#   /mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/
_csv_cp1251 = dict(dtype=str, encoding='cp1251')
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, **_csv_cp1251)
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, **_csv_cp1251)
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [7]:
# List the columns available in the teach sample.
db_teach.columns

Index(['amount', 'amount_deviation', 'bank_currency', 'bin', 'city',
       'count_months_to_end_card', 'day_of_week', 'gender2', 'hour', 'id',
       'is_city_resolved', 'is_gender_undefined', 'latitude', 'longitude',
       'order_id', 'phone_2_norm', 'status'],
      dtype='object')

In [8]:
# Distribution of the `status` column in the teach sample.
# NOTE(review): helper semantics are inferred from its name and the displayed output.
Statistic.get_table_value_counts(db_teach, 'status')

0    427164
1    6261  
Name: status, dtype: int64

In [9]:
# Distribution of the `status` column in the test sample.
# NOTE(review): helper semantics are inferred from its name and the displayed output.
Statistic.get_table_value_counts(db_test, 'status')

0    58107
1    755  
Name: status, dtype: int64

In [10]:
# Model input columns: eight transaction attributes followed by the two
# `is_*` indicator columns. Order matters: it defines the feature order of
# the arrays passed to the model.
COL_FACTORS = [
    'bin', 'amount', 'bank_currency', 'hour', 'day_of_week',
    'longitude', 'latitude', 'phone_2_norm',
    'is_gender_undefined', 'is_city_resolved',
]

In [11]:
# Optionally replace the raw `bin` factor with an encoded counterpart.
# `experiment` is a label appended to result descriptions further down.
experiment = ''
is_factor_encode = True
if is_factor_encode:
    col_name = 'bin'
    col_enc_name = 'enc_' + col_name
    print("col_enc_name={}".format(col_enc_name))
    # Encode the column selected by col_name, so changing col_name is enough
    # to encode a different factor.
    # NOTE(review): judging by the method name this is an ordered target
    # encoding; confirm against decomposition_pca.encode_strategy.
    ret_val = Encode().ordered_target_st(db_teach, db_test, col_name)
    db_teach[col_enc_name] = ret_val.get('encode_teach')
    db_test[col_enc_name] = ret_val.get('test_encode_last')
    experiment = col_enc_name + '_' + 'encode_last'
    print("experiment={}".format(experiment))
    # Swap the raw column for the encoded one in the factor list. Both steps
    # are guarded so re-running the cell neither raises ValueError (raw column
    # already removed) nor appends the encoded column twice.
    if col_name in COL_FACTORS:
        COL_FACTORS.remove(col_name)
    if col_enc_name not in COL_FACTORS:
        COL_FACTORS.append(col_enc_name)

col_enc_name=enc_bin
experiment=enc_bin_encode_last


In [12]:
# Keep only the model input columns, in COL_FACTORS order, for both samples.
train, test = db_teach[COL_FACTORS], db_test[COL_FACTORS]

In [13]:
# Columns were read as strings; convert each one to a numeric dtype.
# Values that cannot be parsed become NaN (errors="coerce").
train, test = (
    frame.apply(pd.to_numeric, errors="coerce")
    for frame in (train, test)
)

In [14]:
# Sentinel substituted for missing / unparseable values in both samples.
replace_val = -9999
train, test = train.fillna(replace_val), test.fillna(replace_val)
replace_val

-9999

In [15]:
# Hand plain NumPy arrays to XGBoost. np.asarray is a no-op on an ndarray,
# so re-running this cell does not fail the way `.values` would.
train = np.asarray(train)
test = np.asarray(test)
# `status` was loaded as str (dtype=str in read_csv). Cast it to int so the
# classifier receives numeric 0/1 labels; recent XGBoost releases reject
# string class labels.
label = db_teach.status.astype(int)

In [16]:
# Analyzer bound to the teach sample, the full test sample and the white list.
# It supplies the sample weights and the result tables used below.
# NOTE(review): it receives db_test itself, so it presumably reads columns
# (e.g. "probability") added to db_test later - confirm in analyzer.py.
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

In [17]:
# Accumulator for the result tables; None means "no results yet" and is
# passed to get_table_prediction as result_df on the first call.
result_df_amount = None
# Weights from the analyzer, passed as sample_weight when fitting the model.
weight = analyzer_prediction.get_xgb_weight()

In [25]:
# Kyw3 hyper-parameter grid.
max_depths = [3]

# Wide grid used for the exploratory sweep.
nrounds, etas = [80, 90], [0.2, 0.3, 0.35]

# Narrowed to a single configuration; this deliberately overrides the wide
# grid above. Remove the line below to run the full sweep again.
nrounds, etas = [80], [0.3]


In [26]:
# Fit one XGBoost classifier per (n_estimators, learning_rate, max_depth)
# combination and append its result table to result_df_amount.
# The loop variables (nround, eta, max_depth) and `model` intentionally stay
# in scope: later cells read the values of the last fitted configuration.
for nround in nrounds:
    for eta in etas:
        for max_depth in max_depths:
            config = {
                'max_depth': max_depth,
                'learning_rate': eta,
                'n_estimators': nround,
                # Set on the estimator rather than in fit(): passing
                # eval_metric to fit() is deprecated and removed in recent
                # XGBoost releases.
                'eval_metric': 'auc',
            }
            model = xgb.XGBClassifier(**config)
            model.fit(train, label, sample_weight=weight)

            # Column 1 of predict_proba is the probability of the second class.
            test_pred = model.predict_proba(test)
            # NOTE(review): analyzer_prediction was built from db_test, so it
            # presumably reads this column - confirm in analyzer.py.
            db_test["probability"] = test_pred[:, 1]

            description = '-'.join(str(elem) for elem in (max_depth, nround, eta))
            result_df_amount = analyzer_prediction.get_table_prediction(
                description=description, result_df=result_df_amount, metric="amount")

In [20]:
# Keep every second row of the result table (rows 0, 2, 4, ...): the rows
# in between are the "threshold amount" rows seen in the displayed output.
stat_best = result_df_amount.iloc[::2].copy()

# Convert the p_* columns to float. astype() with a mapping returns a frame
# with the new dtypes; assigning through `.loc[:, cols] = ...` writes in place
# on current pandas and can leave the original dtype unchanged.
col_names = [col for col in stat_best.columns if col.startswith('p_')]
stat_best = stat_best.astype({col: float for col in col_names})

# Best configurations first.
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [27]:
# Pair each model input column with the importance reported by the most
# recently fitted model. The key spelling 'impotance' is kept as-is because
# other cells sort by that column name.
data = dict(feature=COL_FACTORS, impotance=model.feature_importances_)
df = pd.DataFrame(data)
# Sorted view: df.sort_values(by='impotance', ascending=False)

In [28]:
# Re-evaluate the last fitted configuration on the full test sample and on
# three random subsamples of it, to see how stable the metrics are.
compare_result_on_diff_subsample = True
if compare_result_on_diff_subsample:
    # Start a fresh result table for this comparison.
    result_df_amount = None
    # max_depth / nround / eta are the values left over from the training loop.
    description = '-'.join(str(elem) for elem in (max_depth, nround, eta, experiment))

    # NOTE(review): assumes add_random_list_column labels rows with fold ids
    # that include 1, 2 and 3 - confirm in statistic.py.
    db_test['random_folds'] = Statistic.add_random_list_column(db_test)
    db_tests = [db_test] + [
        db_test[db_test.random_folds == fold].copy() for fold in range(1, 4)
    ]

    for db_test_ in db_tests:
        # Use a dedicated analyzer per sample. Rebinding the notebook-wide
        # `analyzer_prediction` here would leave it pointing at the last
        # subsample, so re-running the training cell would silently evaluate
        # on that subsample only.
        subsample_analyzer = AnalyzerPrediction(db_teach, db_test_, white)
        result_df_amount = subsample_analyzer.get_table_prediction(
            description=description, result_df=result_df_amount, metric="amount")

In [24]:
# Feature importances, highest first.
df.sort_values(by='impotance', ascending=False)

Unnamed: 0,feature,impotance
0,bin,0.314239
6,latitude,0.183306
7,phone_2_norm,0.160393
5,longitude,0.126023
1,amount,0.091653
2,bank_currency,0.037643
3,hour,0.03437
4,day_of_week,0.021277
8,is_gender_undefined,0.018003
9,is_city_resolved,0.013093


In [25]:
# Kyw3: first 11 columns of the per-configuration results, sorted by rating.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
0,3-80-0.2,9.05,11.52,17.02,20.53,22.26,22.26,22.73,27.29,42.22,149.89
4,3-90-0.2,8.39,10.99,15.7,20.46,21.64,22.58,22.92,27.28,42.22,146.68
6,3-90-0.35,7.37,14.51,17.72,18.76,19.59,21.04,22.38,26.45,39.89,144.29
2,3-80-0.35,5.84,12.83,17.29,18.62,19.51,21.77,22.34,26.74,40.87,141.7


In [31]:
# Feature importances, highest first.
df.sort_values(by='impotance', ascending=False)

Unnamed: 0,feature,impotance
0,bin,0.309951
6,latitude,0.164763
7,phone_2_norm,0.161501
5,longitude,0.106036
1,amount,0.102773
2,bank_currency,0.042414
3,hour,0.042414
8,is_gender_undefined,0.030995
4,day_of_week,0.027732
9,is_city_resolved,0.011419


In [35]:
result_df_amount.iloc[:,:20] # Kyw3: results on the full test sample and its random subsamples (first 20 columns)

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl,amount_test_bad_in_wl,n_teach,n_teach_bad,n_test,n_test_bad
0,3-90-0.2-,8.39,10.99,15.7,20.46,21.64,22.58,22.92,27.28,42.22,146.68,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755
1,threshold amount,0.807446,0.774622,0.755202,0.740178,0.728026,0.717245,0.708831,0.685745,0.621256,146.68,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755
2,3-90-0.2-,9.44,9.96,12.96,14.27,19.31,22.46,22.46,25.34,38.81,130.09,1019125,7596,11,535330.08,2679.11,433425,6261,19546,245
3,threshold amount,0.808451,0.780928,0.758491,0.746396,0.734189,0.722675,0.712144,0.691609,0.62897,130.09,1019125,7596,11,535330.08,2679.11,433425,6261,19546,245
4,3-90-0.2-,10.37,13.76,19.11,19.72,20.45,21.11,21.35,27.31,45.31,156.03,1019125,7719,8,550371.1,1327.4,433425,6261,19698,262
5,threshold amount,0.803806,0.768511,0.749789,0.735944,0.725309,0.714398,0.705789,0.683541,0.61854,156.03,1019125,7719,8,550371.1,1327.4,433425,6261,19698,262
6,3-90-0.2-,4.84,8.82,16.92,22.91,23.05,24.46,25.6,30.18,40.31,147.03,1019125,7677,15,554535.35,2184.57,433425,6261,19618,248
7,threshold amount,0.807804,0.774622,0.75421,0.739522,0.726546,0.716642,0.704182,0.681457,0.617657,147.03,1019125,7677,15,554535.35,2184.57,433425,6261,19618,248


In [23]:
# Feature importances, highest first.
df.sort_values(by='impotance', ascending=False)

Unnamed: 0,feature,impotance
5,latitude,0.224026
4,longitude,0.186688
9,enc_bin,0.176948
6,phone_2_norm,0.175325
0,amount,0.121753
2,hour,0.042208
1,bank_currency,0.024351
7,is_gender_undefined,0.024351
3,day_of_week,0.019481
8,is_city_resolved,0.00487


In [24]:
# Kyw3: first 11 columns of the per-configuration results, sorted by rating.
stat_best.iloc[:,:11]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating
2,3-80-0.3,5.4,8.81,12.75,19.75,22.88,28.38,31.01,34.93,51.83,156.35
8,3-90-0.3,4.57,8.55,12.31,18.41,23.13,27.17,30.74,34.67,48.2,149.84
6,3-90-0.2,6.11,9.41,14.19,16.83,21.33,22.64,24.04,30.41,49.86,148.14
0,3-80-0.2,5.23,9.17,14.19,15.77,19.09,21.73,24.01,27.52,49.69,140.66
4,3-80-0.35,3.03,7.78,12.91,15.44,17.4,21.34,23.8,30.12,51.33,138.01
10,3-90-0.35,3.03,7.78,13.33,15.33,17.91,19.79,22.33,30.17,47.97,135.52


In [29]:
# Feature importances, highest first.
df.sort_values(by='impotance', ascending=False)

Unnamed: 0,feature,impotance
5,latitude,0.239927
9,enc_bin,0.184982
6,phone_2_norm,0.181319
4,longitude,0.142857
0,amount,0.135531
2,hour,0.03663
1,bank_currency,0.029304
3,day_of_week,0.02381
7,is_gender_undefined,0.018315
8,is_city_resolved,0.007326


In [30]:
result_df_amount.iloc[:,:20] # Kyw3: results on the full test sample and its random subsamples (first 20 columns)

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl,amount_test_bad_in_wl,n_teach,n_teach_bad,n_test,n_test_bad
0,3-80-0.3-enc_bin_encode_last,5.4,8.81,12.75,19.75,22.88,28.38,31.01,34.93,51.83,156.35,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755
1,threshold amount,0.78283,0.728424,0.694892,0.670368,0.652248,0.634721,0.617341,0.572282,0.458472,156.35,1019125,22992,34,1640236.53,6191.08,433425,6261,58862,755
2,3-80-0.3-enc_bin_encode_last,3.93,10.75,12.06,18.89,23.73,25.75,28.58,33.15,52.49,155.0,1019125,7596,11,535330.08,2679.11,433425,6261,19546,245
3,threshold amount,0.796954,0.735915,0.704681,0.680283,0.661418,0.644226,0.628796,0.584066,0.466075,155.0,1019125,7596,11,535330.08,2679.11,433425,6261,19546,245
4,3-80-0.3-enc_bin_encode_last,4.85,8.98,13.88,18.01,20.68,27.13,31.46,34.93,53.33,154.66,1019125,7719,8,550371.1,1327.4,433425,6261,19698,262
5,threshold amount,0.785083,0.731004,0.704094,0.671665,0.656404,0.642284,0.618501,0.573331,0.457865,154.66,1019125,7719,8,550371.1,1327.4,433425,6261,19698,262
6,3-80-0.3-enc_bin_encode_last,4.21,7.56,14.8,19.76,29.13,32.29,32.29,35.57,49.51,160.54,1019125,7677,15,554535.35,2184.57,433425,6261,19618,248
7,threshold amount,0.766166,0.710486,0.684408,0.65489,0.636084,0.619104,0.601683,0.559829,0.448496,160.54,1019125,7677,15,554535.35,2184.57,433425,6261,19618,248
