# lib

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,recall_score,precision_score
from sklearn.metrics import roc_curve
from sklearn.utils import shuffle
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV   #Perforing grid search
import matplotlib.pylab as plt
from collections import Counter
from matplotlib.pylab import rcParams
from sklearn.model_selection import StratifiedKFold
from xgboost import plot_importance
from sklearn.model_selection import validation_curve
from sklearn.metrics import fbeta_score, make_scorer
from xgboost import plot_tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from inspect import signature
import shap
import scikitplot as skplt
rcParams['figure.figsize'] = 12, 4
import math as m
import time
import seaborn as sns
import scipy.stats as scs
from sklearn.preprocessing import LabelEncoder

# fix random seed for reproducibility

In [2]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [4]:
params =  {'base_score': 0.17,
            'booster': 'gbtree',
            'colsample_bylevel': 1,
            'colsample_bynode': 1,
            'colsample_bytree': 1,
            'gamma': 0,
            'learning_rate': 0.1,
            'max_delta_step': 0,
            'max_depth': 8,
            'min_child_weight': 1,
            'missing': None,
            'n_estimators': 150,
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'random_state': 0,
            'reg_alpha': 0.5,
            'reg_lambda': 50,
            'scale_pos_weight': 5,
            'seed': 7,
            'silent': True,
            'subsample': 1,
            'verbosity': 1,}

In [6]:

def xgb_cv(X,y, params, n_estimators=150, folds=5, metric='error', verbose=30, cv=True, test_size=0.3):
    start_time = time.time()
    features = X.columns
    params['n_estimators'] = n_estimators
    estimators = []
    total_progress = []
    if cv:
        kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
        params['eval_metric'] = metric
        i = 0
        print(f'Num of splits: {folds}\nX_train size: {int(X.shape[0]*(folds-1)/folds)} ----- X_test size: {int(X.shape[0]/folds)}\n')
        for nfold, (train_index, test_index) in enumerate(kfold.split(X, y)):
            progress={}
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()
            
            xgtrain = xgb.DMatrix(X_train, label=y_train, feature_names = features)
            xgeval = xgb.DMatrix(X_test, label=y_test, feature_names = features)
            gbm = xgb.train(params, xgtrain, evals=[(xgtrain,'train'), (xgeval,'eval')],evals_result = progress, verbose_eval = verbose, num_boost_round = n_estimators)
            #model.fit(X_train, y_train, eval_metric=metric)
            estimators.append(gbm)
            total_progress.append(progress)
            print(f'{nfold+1} of {folds} is processed')
        
        #eval_results
        train_eval = np.array(total_progress[0]['train'][metric])
        test_eval = np.array(total_progress[0]['eval'][metric])
        for i in range(1,5):
            train_eval += np.array(total_progress[i]['train'][metric])
            test_eval += np.array(total_progress[i]['eval'][metric])
        train_eval = train_eval/folds
        test_eval = test_eval/folds
        total_progress = [train_eval, test_eval]
    
    else:
        gbm = XGBClassifier()
        gbm.set_params(**params)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
        print(f'X_train size: {X_train.shape[0]} ----- X_test size: {X_test.shape[0]}\n')
        gbm.fit(X_train, y_train,eval_metric=metric, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=verbose)
        estimators.append(gbm)
        total_progress = [gbm.evals_result_['validation_0'][metric], gbm.evals_result_['validation_1'][metric]]
    print("\n--- %s seconds ---" % (time.time() - start_time))
    return estimators, total_progress

In [18]:
def precision_calc(alg, X, y, recall=0.9, lim=0.0001, cv=False, to_print=False, auc=False): #returns (precision, threshold)
    lst = []
    for model in alg:
        y_predict_proba = model.predict(xgb.DMatrix(X)) if cv else model.predict_proba(X)[:,1]
        first, last, ix = 0.1, 0.5, 0.1
        rec_temp = recall_score(y,  np.where(y_predict_proba<ix,0,1))
        
        while abs(recall-rec_temp)>lim:
            ix = (first+last)/2
            rec_temp = recall_score(y,  np.where(y_predict_proba<ix,0,1))
            if recall>rec_temp:
                last = ix
            elif rec_temp>recall:
                first = ix
        
        lst.append([precision_score(y, np.where(y_predict_proba<ix,0,1)), ix])
    lst = np.array(lst).mean(0)
    if to_print:
        print(f'precision = {np.round(lst[0],5)} -- (recall = {recall}, th = {np.round(lst[1],5)})')
    return lst[0], lst[1], np.where(y_predict_proba<ix,0,1)
    
    #precision_calc(l2, df.drop(todrop,1), df['Target'], 0.9, 0.0001, cv=True)

# Read the data

In [21]:

"""with open('C:/Users/ivanov.al/Desktop/Projects/purchasing-behavior-model-master/out/data_train_test_val.csv', 'rb') as f:
    df = pd.read_csv(f, sep=',')
with open('../out/data_predict.csv','rb') as f:
    df_pred = pd.read_csv(f)"""
with open('C:/Users/ivanov.al/Desktop/Projects/purchasing-behavior-model-master/4SASHA/customers_variables/df_ready_shifted.csv', 'rb') as f:
          df_shifted = pd.read_csv(f, sep=',')
with open('C:/Users/ivanov.al/Desktop/Projects/purchasing-behavior-model-master/4SASHA/customers_variables/df_ready.csv', 'rb') as f:
          df = pd.read_csv(f, sep=',')
df = df[~df.GUID.str.contains('_t')]
#df_shifted = df_shifted[~df_shifted.GUID.str.contains('_t3')]

In [24]:
features = ['Monetary value mean', 'Monetary value sum','TOT_ALL','TOT_NOTE', 'previous t_1 Galaxy S',
       'previous t_2 Galaxy S', 'previous t_3 Galaxy S',
       'previous t_4 Galaxy S', 'previous t_5 Galaxy S', 'Tot_Galaxy',
       'TOT_OTHER', 'abandon_note_t_4', 'abandon_note_t_3', 'abandon_note_t_2',
       'abandon_note_t_1', 'abandon_note', 'back_to_NOTE_t_3',
       'back_to_NOTE_t_2', 'back_to_NOTE_t_1', 'back_to_NOTE_NUM',
       'abandon_note_NUM', 'back_to_NOTE','Recency DEVICE', 'no purchase device_2 t_1_2', 'Recency_from_now',
       'Recency DEVICE_T_MIN_T_1', 'Recency DEVICE_T_MIN_today',
       'Recency_DEVICE_avg', 'Recency NOTE', 'no purchase Note',
       'Currently using device','S Health', 'S Pay', 'Samsung Members','Segment code','Usage period', 'Year lapsed',
       'Gender', 'No longer contact', 'Samsung Deleted', 'long_commun', 'age','no purchase device_2 t_1_3', 'TOT_NOTE_gr_1', 'TOT_ALL_gr_1', 'TOT_OTHER_gr_1', 'Tot_Galaxy_gr1',           'TOT_NOTE_gr_2', 'TOT_ALL_gr_2', 'TOT_OTHER_gr_2', 'Tot_Galaxy_gr_2', 'TOT_NOTE_gr_3', 'TOT_ALL_gr_3', 'TOT_OTHER_gr_3', 'Tot_Galaxy_gr_3', 'age1', 'Recency NOTE1',                'Recency NOTE1_gr', 'Recency NOTE1_gr2','Target']
#features = list(set(features) - set(['back_to_NOTE', 'Samsung Deleted', 'no purchase device_2 t_1_2', 'no purchase Note', 'back_to_NOTE_t_3', 'back_to_NOTE_t_1', 
#                                     'previous t_4 Galaxy S', 'abandon_note_NUM', 'abandon_note_t_4']))

df_shifted[['S Health','S Pay','Samsung Members','Segment code']] = df_shifted[['S Health','S Pay','Samsung Members','Segment code']].replace('55000', np.nan)
df_shifted[['S Health','S Pay','Samsung Members','Segment code']] = df_shifted[['S Health','S Pay','Samsung Members','Segment code']].fillna('N')
df[['S Health','S Pay','Samsung Members','Segment code']] = df[['S Health','S Pay','Samsung Members','Segment code']].replace('55000', np.nan)
df[['S Health','S Pay','Samsung Members','Segment code']] = df[['S Health','S Pay','Samsung Members','Segment code']].fillna('N')


dict_object_cols1 = {}
for col in['S Health','S Pay','Samsung Members','Segment code']:
    le = LabelEncoder()
    le.fit(df_shifted[col])
    df_shifted[col] = le.transform(df_shifted[col])
    df[col] = le.transform(df[col])
    dict_object_cols1[col] = dict(zip(le.classes_, range(len(le.classes_))))
df_shifted = df_shifted[features]

df_guids = df['GUID'].values
df = df[features]

features.remove('Target')

In [26]:
#l2, evals2 = xgb_cv(df_shifted.drop('Target',1), df_shifted['Target'], params,n_estimators=50, cv = True, verbose=False)
score = precision_calc(l2, df.drop('Target',1), df['Target'], 0.9, 0.0001, cv=True, to_print=True)

precision = 0.20261 -- (recall = 0.9, th = 0.19969)


In [42]:
pd.DataFrame(np.stack([df_guids, score[2]]).T, columns=['GUID', 'Purchase'])

Unnamed: 0,GUID,Purchase
0,0064xfpxlz,1
1,006tazj5sh,1
2,0070lirtjq,1
3,009zjirxth,0
4,00a9vcanrf,1
...,...,...
94083,zzyzvpppsj,1
94084,zzz5fobgxh,1
94085,zzzd7mzxcj,1
94086,zzzfhwbzah,0
