In [6]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from utils.ScoreFunction import score_function
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

from utils.paramsearch import paramsearch
from itertools import product,chain

import sys
sys.path.append('..')

import pandas as pd

import numpy as np
random_seed = 42
np.random.seed(random_seed)

import warnings
warnings.filterwarnings("ignore")

In [7]:
def most_common(lst):
    return max(set(lst), key=lst.count)

## Импорт данных

In [8]:
data = pd.read_csv("../data/train.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

#X['lineItemVoidsPerPosition'] = X['lineItemVoids']/X['totalScanned'] #
#X['lineItemVoidsPerTotal'] = X['lineItemVoids']/X['grandTotal'] #
#X['withoutRegistrationPerTotal'] = X['scansWithoutRegistration']/X['grandTotal'] #
#X['quantiModsPerTotal'] = X['quantityModifications']/X['grandTotal'] #
#X['lineItemVoidsPerTime'] = X['lineItemVoids']/X['totalScanTimeInSeconds'] #
#X['withoutRegistrationPerTime'] = X['scansWithoutRegistration']/X['totalScanTimeInSeconds'] #
#X['quantiModesPerTime'] = X['quantityModifications']/X['totalScanTimeInSeconds'] #

# XGBOOST

In [9]:
def crossvaltest_xg(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def xgboost_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params_xg)
    for prms in chain(ps.grid_search(['n_estimators','learning_rate']),
                      ps.grid_search(['max_depth','min_child_weight'])):
        res = crossvaltest_xg(prms,X, y,n_splits)
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
        print()
    return ps.bestparam(), ps.bestscore()

# CATBOOST

In [10]:
def crossvaltest_cat(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = CatBoostClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def cat_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth'])):
        res = crossvaltest_cat(prms,X, y,n_splits)
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
        print()
    return ps.bestparam(), ps.bestscore()

# ADA_BOOST

In [11]:
def crossvaltest_ada(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = AdaBoostClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def ada_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['n_estimators', 'learning_rate', 'algorithm'])):
        res = crossvaltest_ada(prms,X, y,n_splits)
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
        print()
    return ps.bestparam(), ps.bestscore()

# BaggingClassifier

In [12]:
def crossvaltest_bag(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = BaggingClassifier(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def bag_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['n_estimators', 'max_samples'])):
        res = crossvaltest_bag(prms,X, y,n_splits)
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
        print()
    return ps.bestparam(), ps.bestscore()

# LOGISTIC_REGRESSION

In [13]:
def crossvaltest_log_reg(params, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = LogisticRegression(**params)
        clf.fit(X_train, y_train)
        
        y_pred = np.array(clf.predict(X_test))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
        
    return np.mean(score)

def log_reg_param_tune(params, X, y ,n_splits=5):
    ps = paramsearch(params)
    for prms in chain(ps.grid_search(['C'])):
        res = crossvaltest_log_reg(prms,X, y,n_splits)
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
        print()
    return ps.bestparam(), ps.bestscore()

# DNN

In [14]:
def DNN(params):
    skf = StratifiedKFold(n_splits=5)
    accuracy, score, f1 = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train_dnn = sc.fit_transform(X_train)
        X_test_dnn = sc.transform(X_test)
        clf = Sequential()
        clf.add(Dense(128, activation='relu', kernel_initializer='random_normal', input_dim=14))
        clf.add(Dense(128, activation='relu', kernel_initializer='random_normal'))
        clf.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
        clf.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
        clf.fit(X_train_dnn, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose = False)

        y_pred = np.array(clf.predict(X_test_dnn))
        y_pred[y_pred>0.5] = 1
        y_pred[y_pred<0.5] = 0
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        accuracy.append(accuracy_score(y_test, y_pred))
        score.append(score_function(tp,fp,fn,tn))
        f1.append(f1_score(y_test, y_pred))
    return np.mean(score)

# MODEL_TUNING

In [None]:
params_xg = {'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9],
             'n_estimators':[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100],
             'learning_rate':[0.03,0.001,0.01,0.1], 
             'min_child_weight': [1,2,3,4]}

params_cat = {'depth':[1,3,5,7],
              'iterations':[100, 200,400,600,800,1000,2000],
              'learning_rate':[0.03,0.001,0.01,0.1], 
              'l2_leaf_reg':[1,5,10,100],
              'border_count':[2,5,10,20,50,100],
              'thread_count':4,
              'silent': True}

params_ada = {'learning_rate':[0.03,0.001,0.01,0.1],
              'n_estimators':[100, 300, 500, 700, 900, 1000],
              'algorithm': ['SAMME', 'SAMME.R']}

params_log_reg = {'C':[0.001,0.01,0.1,1,10,100,1000]}

params_dnn = {'epochs': 500, 'batch_size': 32}

params_log_reg, best_log_reg = log_reg_param_tune(params_log_reg,X, y)
params_xg_boost, best_xg = xgboost_param_tune(params_xg,X, y)
params_ada, best_ada = ada_param_tune(params_ada,X, y)
params_cat, best_cat = cat_param_tune(params_cat,X, y)
best_dnn = DNN(params_dnn)

-64.0 {'C': 0.01} best: -64.00000000000446 {'C': 0.01}

36.0 {'C': 0.1} best: 36.000000000136325 {'C': 0.1}

46.0 {'C': 1} best: 46.000000000050775 {'C': 1}

44.0 {'C': 10} best: 46.000000000050775 {'C': 1}

45.0 {'C': 100} best: 46.000000000050775 {'C': 1}

43.0 {'C': 1000} best: 46.000000000050775 {'C': 1}

-104.0 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.001, 'min_child_weight': 1} best: -104.00000000007043 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.001, 'min_child_weight': 1}

-104.0 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.01, 'min_child_weight': 1} best: -104.00000000001448 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.01, 'min_child_weight': 1}

18.0 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.1, 'min_child_weight': 1} best: 17.999999999928836 {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.1, 'min_child_weight': 1}

-16.0 {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.03, 'min_child_weight':

# MODEL_TESTING

In [213]:
skf = StratifiedKFold(n_splits=5)
accuracy, score, f1 = [], [], []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf = LogisticRegression(**params_log_reg)
    clf.fit(X_train, y_train)
    y_pred_log_reg = np.array(clf.predict(X_test))
    
    clf = XGBClassifier(**params_xg_boost)
    clf.fit(X_train, y_train)
    y_pred_xg = np.array(clf.predict(X_test))
    
    clf = AdaBoostClassifier(**params_ada)
    clf.fit(X_train, y_train)
    y_pred_ada = np.array(clf.predict(X_test))
    
    clf = CatBoostClassifier(**params_cat)
    clf.fit(X_train, y_train)
    y_pred_cat = np.array(clf.predict(X_test))
    
    X_train_dnn = sc.fit_transform(X_train)
    X_test_dnn = sc.transform(X_test)
    clf = Sequential()
    clf.add(Dense(128, activation='relu', kernel_initializer='random_normal', input_dim=14))
    clf.add(Dense(128, activation='relu', kernel_initializer='random_normal'))
    clf.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
    clf.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
    clf.fit(X_train_dnn, y_train, epochs=params_dnn['epochs'], batch_size=params_dnn['batch_size'], verbose = False)
    y_pred_dnn = np.array(clf.predict(X_test_dnn))
    y_pred_dnn[y_pred_dnn>0.5] = 1
    y_pred_dnn[y_pred_dnn<0.5] = 0
    
    y_pred = []
    for i in range(len(y_pred_cat)):
        temp_prediction = [float(y_pred_log_reg[i]), float(y_pred_xg[i]), float(y_pred_ada[i]), float(y_pred_cat[i]), float(y_pred_dnn[i])]
        y_pred.append(most_common(temp_prediction))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy.append(accuracy_score(y_test, y_pred))
    score.append(score_function(tp,fp,fn,tn))
    f1.append(f1_score(y_test, y_pred))

print('Mean Score = {0}'.format(np.mean(score)))
print('Mean Accuracy = {0}'.format(np.mean(accuracy)))
print('Mean F1_score = {0}'.format(np.mean(f1)))

Mean Score = 59.0
Mean Accuracy = 0.990418439716312
Mean F1_score = 0.906794425087108
