In [75]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA,TruncatedSVD,FactorAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from utils.ScoreFunction import score_function, profit_scorer
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

from utils.paramsearch import paramsearch
from itertools import product,chain

import sys
import pickle
sys.path.append('..')

import pandas as pd

import numpy as np
random_seed = 42
np.random.seed(random_seed)

import warnings
warnings.filterwarnings("ignore")

In [76]:
def most_common(lst):
    return max(set(lst), key=lst.count)

## Импорт данных

In [77]:
data = pd.read_csv("../data/train.csv",sep='|')
test = pd.read_csv("../data/test.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

X['grandTotalCat'] = pd.cut(X['grandTotal'], 10,labels =[1,2,3,4,5,6,7,8,9,10])
X['totalScanTimeInSecondsCat'] = pd.cut(X['totalScanTimeInSeconds'], 2,labels =[1,2])
X['lineItemVoidsPerPositionCat'] = pd.cut(X['lineItemVoidsPerPosition'], 10,labels =[1,2,3,4,5,6,7,8,9,10])
X['avgTimePerScan'] = pd.cut(X['avgTimePerScan'], 4,labels =[1,2,3,4])
for column in X.columns:
    X[column] = X[column].astype('float64')

In [78]:
pca = PCA(n_components=8)
pca.fit(data.drop(["fraud"],axis=1).append(test, ignore_index = True))

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [79]:
pca_ = pca.transform(data.drop(["fraud"],axis=1))

In [80]:
X = np.array(X)
X = np.concatenate((X,pca_),axis=1)
y = np.array(y)
X = pd.DataFrame(X)
y = pd.DataFrame(y)

# XGBOOST

In [81]:
def crossvaltest_xg(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def xgboost_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params_xg)
    for prms in chain(ps.grid_search(['n_estimators','learning_rate']),
                      ps.grid_search(['max_depth','min_child_weight'])):
        res = crossvaltest_xg(prms,X, y,n_splits)
        ps.register_result(res,prms)
    return ps.bestparam(), ps.bestscore()

# CATBOOST

In [82]:
def crossvaltest_cat(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = CatBoostClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def cat_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth'])):
        res = crossvaltest_cat(prms,X, y,n_splits)
        ps.register_result(res,prms)
    return ps.bestparam(), ps.bestscore()

# ADA_BOOST

In [83]:
def crossvaltest_ada(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = AdaBoostClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def ada_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['n_estimators', 'learning_rate', 'algorithm'])):
        res = crossvaltest_ada(prms,X, y,n_splits)
        ps.register_result(res,prms)
    return ps.bestparam(), ps.bestscore()

# LightGBM

In [84]:
def crossvaltest_lgb(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = lgb.LGBMClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def lgb_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['num_iterations', 'learning_rate']),
                      ps.grid_search(['max_depth', 'num_leaves']),
                      ps.grid_search(['boosting'])):
        res = crossvaltest_lgb(prms,X, y,n_splits)
        ps.register_result(res,prms)
    return ps.bestparam(), ps.bestscore()

# LOGISTIC_REGRESSION

In [85]:
def crossvaltest_log_reg(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = LogisticRegression(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def log_reg_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['C'])):
        res = crossvaltest_log_reg(prms,X, y,n_splits)
        ps.register_result(res,prms)
    return ps.bestparam(), ps.bestscore()

# MODEL_TUNING

In [86]:
params_cat = {'depth':[1,3,5,7],
              'iterations':[100, 200,400,600,800,1000,2000],
              'learning_rate':[0.03,0.001,0.01,0.1], 
              'l2_leaf_reg':[1,5,10,100],
              'border_count':[2,5,10,20,50,100],
              'thread_count':4,
              'silent': True}
params_cat, best_cat = cat_param_tune(params_cat,X, y)

In [87]:
params_lgb = {'max_depth':[1,3,5,-1],
              'boosting': ['gbdt', 'dart', 'goss'],
              'num_leaves': [20, 31, 40, 50],
              'num_iterations':[100, 200,400,600,800,1000,1500],
              'learning_rate':[0.03,0.001,0.01,0.1]}
params_lgb, best_lgb = lgb_param_tune(params_lgb,X, y)

In [88]:
params_xg = {'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9],
             'n_estimators':[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100],
             'learning_rate':[0.03,0.001,0.01,0.1], 
             'min_child_weight': [1,2,3,4]}
params_xg_boost, best_xg = xgboost_param_tune(params_xg,X, y)

In [89]:
params_ada = {'learning_rate':[0.03,0.001,0.01,0.1],
              'n_estimators':[100, 300, 500, 700, 900, 1000],
              'algorithm': ['SAMME', 'SAMME.R']}
params_ada, best_ada = ada_param_tune(params_ada,X, y)

In [90]:
params_log_reg = {'C':[0.001,0.01,0.1,1,10,100,1000]}
params_log_reg, best_log_reg = log_reg_param_tune(params_log_reg,X, y)

In [91]:
print('CatBoost best score = {0}'.format(best_cat))
print('LightGbm best score = {0}'.format(best_lgb))
print('XgBoost best score = {0}'.format(best_xg))
print('AdaBoost best score = {0}'.format(best_ada))
print('LogisticRegression best score = {0}'.format(best_log_reg))

CatBoost best score = 39.00000000002088
LightGbm best score = 64.00000000003571
XgBoost best score = 51.99999999989377
AdaBoost best score = 65.99999999983409
LogisticRegression best score = 67.99999999985357


In [74]:
params_lgb

{'boosting': 'goss',
 'learning_rate': 0.1,
 'max_depth': 1,
 'num_iterations': 1500,
 'num_leaves': 20}

# MODEL_TESTING

In [92]:
skf = StratifiedKFold(n_splits=5)
accuracy, score, f1 = [], [], []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf = LogisticRegression(**params_log_reg)
    clf.fit(X_train, y_train)
    y_pred_log_reg = np.array(clf.predict(X_test))
    
    clf = XGBClassifier(**params_xg_boost)
    clf.fit(X_train, y_train)
    y_pred_xg = np.array(clf.predict(X_test))
    
    clf = AdaBoostClassifier(**params_ada)
    clf.fit(X_train, y_train)
    y_pred_ada = np.array(clf.predict(X_test))
    
    clf = CatBoostClassifier(**params_cat)
    clf.fit(X_train, y_train)
    y_pred_cat = np.array(clf.predict(X_test))
    
    clf = lgb.LGBMClassifier(**params_lgb)
    clf.fit(X_train, y_train)
    y_pred_lgb = np.array(clf.predict(X_test))
    
    
    y_pred = []
    for i in range(len(y_pred_cat)):
        temp_prediction = [float(y_pred_log_reg[i]), float(y_pred_xg[i]), float(y_pred_lgb[i]), float(y_pred_ada[i]), float(y_pred_cat[i])]
        y_pred.append(most_common(temp_prediction))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy.append(accuracy_score(y_test, y_pred))
    score.append(score_function(tp,fp,fn,tn))
    f1.append(f1_score(y_test, y_pred))

print('Mean Score = {0}'.format(np.mean(score)))
print('Mean Accuracy = {0}'.format(np.mean(accuracy)))
print('Mean F1_score = {0}'.format(np.mean(f1)))

Mean Score = 65.0
Mean Accuracy = 0.9920156028368794
Mean F1_score = 0.9233022533022532
