In [46]:
# %load ../import_data.py
# Notebook preamble: third-party imports, pandas display options, project
# modules, autoreload, and the three input CSVs used by the cells below.
from collections import Counter
import pandas as pd
import numpy as np
# Do not truncate long cell values; cap printed frames at 20 rows.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 20)

import xgboost as xgb

import os,sys,inspect
# Prepend the parent of the "current" directory to sys.path before importing
# the project modules below (presumably that is where statistic/utils/analyzer
# live — confirm).
# NOTE(review): inside a notebook the current frame has no real source file;
# presumably abspath() resolves the pseudo-filename against the working
# directory, so `currentdir` ends up being the notebook's cwd — confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction

# for autoreload modules
%load_ext autoreload
%autoreload 2

# kyw3
# NOTE(review): hardcoded absolute path; `path_data` is not referenced in the
# visible cells — confirm it is still needed.
path_data = '/mnt/files/workdata/work/merchants/merchant_33_kyw3_2020-06-05/04_experiments/ex_01_some_teach/'
# dtype=str loads every column as text; the teach/test files are decoded as
# cp1251, while the white-list file is read with the default encoding.
db_teach = pd.read_csv(UtilsKy.DB_TEACH_KYW3, dtype=str, encoding='cp1251')
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, dtype=str, encoding='cp1251')
white = pd.read_csv(UtilsKy.WHITE_KYW3 , dtype=str)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
# %load ../prepare_data.py
# Feature columns used by the models, kept in alphabetical order.
COL_FACTORS = sorted([
    'bin', 'amount', 'bank_currency', 'hour', 'day_of_week',
    'longitude', 'latitude', 'phone_2_norm',
    'is_gender_undefined', 'is_city_resolved',
])

# For Xgboost
from helper import DataHelper
datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()

# NA handling: the two coordinate columns are filled with their mean value,
# every other column falls back to the -999 sentinel stored under 'default'.
mean_values = datahelper.get_mean_value()
replaced_values = {
    'latitude': mean_values['latitude'],
    'longitude': mean_values['longitude'],
    'default': -999,
}
datahelper.replaced_na_values(replaced_values)
train, test = datahelper.get_train_test()

Statistic na values in columns : 
train na columns : Index(['latitude', 'longitude'], dtype='object')
latitude_na: 1537,longitude_na: 1537
test na columns : Index(['latitude', 'longitude'], dtype='object')
latitude_na: 63,longitude_na: 63
Replaced na values:
hour_na -> -999
bank_currency_na -> -999
is_gender_undefined_na -> -999
phone_2_norm_na -> -999
bin_na -> -999
day_of_week_na -> -999
longitude_na -> -92.53325861542274
latitude_na -> 36.90237577890762
is_city_resolved_na -> -999
amount_na -> -999


In [48]:
from sklearn.ensemble import RandomForestClassifier

In [99]:
# A shuffle is needed because the fraud rows sit together in one part of db_test.
# The permutation comes from a dedicated fixed-seed generator, so a fresh
# "Restart & Run All" yields the same row order (and any result that depends on
# it) without touching the global `random` state.
import random
n = db_test.shape[0]
x = list(range(n))
random.Random(0).shuffle(x)

In [114]:
# Reorder both test frames with the same permutation `x` so that row i of the
# raw frame and row i of the prepared frame still describe the same record.
db_test_sh = db_test.iloc[x].copy()
test_sh = test.iloc[x].copy()

In [115]:
# Build the prediction analyzer from the teach frame and the shuffled test frame.
# NOTE(review): the trailing "# white" suggests the white list was passed here
# at some point — confirm that leaving it out is intended.
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test_sh ) # white
# Weights from the analyzer; used as `sample_weight` when fitting the
# classifiers in the per-factor loop.
weight = analyzer_prediction.get_xgb_weight()

In [132]:
# Training target: the `status` column of the teach frame.
label = db_teach['status']
# Clear any previously accumulated result table before a new run.
result_df_amount = None

In [133]:
# Create the empty table that the per-factor loop appends its rows to.
# NOTE(review): the list presumably holds the cut-off levels behind the
# p_1 ... p_70 columns shown in the result tables — confirm against
# AnalyzerPrediction.
num_p = AnalyzerPrediction.get_numbers_p_for_empty_prediction_df([1,2,3,4,5,6,7,10,20,40,50,70])
result_df_amount = AnalyzerPrediction.get_empty_prediction_df(num_p)

In [134]:
# Evaluate every factor on its own: fit a forest on that single column, score
# the shuffled test set, and append the resulting rows to `result_df_amount`.
for factor in COL_FACTORS:
    clf = RandomForestClassifier(random_state=0)
    clf.fit(train[[factor]], label, sample_weight=weight)

    # Column 1 of predict_proba is the probability of the class at index 1 of
    # clf.classes_.
    # NOTE(review): the analyzer is not handed the predictions directly;
    # presumably it reads db_test_sh["probability"] through the frame it was
    # constructed with — confirm.
    test_pred = clf.predict_proba(test_sh[[factor]])
    db_test_sh["probability"] = test_pred[:, 1]

    # Row label: "<factor> - <distinct probabilities>/<distinct raw values>".
    n_unique_probability = db_test_sh["probability"].nunique()
    n_unique_value = db_test_sh[factor].nunique()
    description = "{name} - {n_prob}/{n_val}".format(
        name=factor,
        n_prob=n_unique_probability,
        n_val=n_unique_value,
    )
    result_df_amount = analyzer_prediction.get_table_prediction(
        description=description,
        result_df=result_df_amount,
        metric="count",
    )

In [135]:
# Keep every second row of the accumulated table (positions 0, 2, 4, ...).
# NOTE(review): presumably get_table_prediction appends two rows per factor and
# only the first of each pair is wanted here — confirm.
n = result_df_amount.shape[0]
sub_rows = list(range(0, n, 2))
# Slice first, then copy, so `stat_best` is an independent frame.
stat_best = result_df_amount.iloc[sub_rows, :].copy()

# Cast the `p_*` columns to float. Assigning through
# `stat_best.loc[:, cols] = ...` writes into the existing columns and can leave
# their dtype unchanged (e.g. object); `astype` with a column -> dtype mapping
# returns a frame whose columns really carry the new dtype.
col_names = [col for col in stat_best.columns if col.startswith('p_')]
stat_best = stat_best.astype({col: float for col in col_names})
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [121]:
# First 15 columns of the ranked per-factor table.
# NOTE(review): this cell's execution count (121) is lower than the loop
# cell's (134), and the loop currently passes metric="count"; presumably this
# output came from an earlier run with metric="amount" — confirm or re-run.
stat_best.iloc[:,:15]  # amount metric

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,p_40,p_50,p_70,rating,n_white_list
0,amount - 461/1635,0.96,3.79,4.62,5.33,6.61,9.72,10.91,18.78,31.83,64.63,71.68,85.79,92.55,0
16,longitude - 951/6061,1.04,4.13,4.42,5.0,5.28,7.87,9.83,15.02,27.42,48.24,54.16,68.49,80.01,0
14,latitude - 949/5983,0.6,3.47,4.42,5.0,5.28,7.87,9.75,15.4,27.39,48.24,54.81,68.98,79.18,0
18,phone_2_norm - 80/93,1.06,1.89,4.78,6.93,8.76,9.22,9.26,13.56,22.52,44.78,53.95,71.67,77.98,0
8,hour - 24/24,2.05,3.5,4.15,5.26,6.23,7.44,8.06,12.97,26.81,41.57,50.72,68.73,76.47,0
12,is_gender_undefined - 2/2,1.09,2.13,3.39,4.97,5.68,7.04,9.1,12.96,24.56,50.35,57.94,71.43,70.92,0
10,is_city_resolved - 2/2,0.87,1.86,3.73,5.1,5.78,6.61,8.56,12.71,25.26,39.94,50.58,70.82,70.48,0
6,day_of_week - 7/7,1.72,2.04,3.26,4.37,5.13,5.74,7.16,11.19,22.54,45.13,56.36,73.98,63.15,0
4,bin - 322/1152,1.22,1.22,3.11,5.1,6.04,7.66,7.7,10.15,18.14,50.49,57.72,73.19,60.34,0
2,bank_currency - 18/36,0.56,0.95,1.64,3.42,4.23,4.89,4.98,7.65,17.95,36.32,44.43,66.18,46.27,0


In [113]:
# First 15 columns of the ranked per-factor table.
# NOTE(review): this cell's execution count (113) is lower than the loop
# cell's (134), so the output shown may be stale — re-run after the loop.
stat_best.iloc[:,:15]  # count metric

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,p_40,p_50,p_70,rating,n_white_list
0,amount - 461/1635,1.99,5.03,8.61,11.39,12.45,15.36,16.03,23.05,40.0,63.97,71.92,83.71,133.91,0
8,hour - 24/24,1.32,2.38,3.71,6.49,7.68,9.01,9.8,11.92,25.17,43.58,52.32,71.13,77.48,0
16,longitude - 951/6061,2.12,3.84,5.03,5.43,6.62,8.34,9.27,11.79,23.18,47.42,52.72,78.15,75.62,0
14,latitude - 949/5983,1.99,3.44,5.03,5.3,6.62,8.34,8.61,12.98,19.6,47.42,52.05,78.01,71.91,0
12,is_gender_undefined - 2/2,0.79,2.25,3.44,4.37,4.77,6.23,7.15,11.13,30.6,49.67,56.82,80.79,70.73,0
4,bin - 322/1152,1.19,1.19,1.32,4.11,4.9,8.87,11.26,12.58,23.97,60.66,67.81,81.06,69.39,0
6,day_of_week - 7/7,0.66,1.72,2.65,3.58,3.84,4.77,5.56,10.46,22.25,43.58,54.44,74.7,55.49,0
18,phone_2_norm - 80/93,0.26,1.46,2.38,3.71,4.64,5.43,5.83,7.42,17.62,43.31,53.11,75.5,48.75,0
10,is_city_resolved - 2/2,0.79,1.32,1.59,1.99,2.65,3.97,5.17,7.95,18.15,43.05,64.5,75.89,43.58,0
2,bank_currency - 18/36,0.4,0.79,1.06,1.59,2.12,2.65,3.31,5.3,14.7,51.26,56.03,74.57,31.92,0
