# trustLevel_1_2

**Idea:** Train separate models based on different trustLevel scores

# Prerequisites

### Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB as NB
from xgboost import XGBClassifier

In [2]:
# custom imports
from funcs import plot_cv_confidence_vs_profit, score_dmc_profit,dmc_profit,cv_preds_and_confusion_matrix,cv_profits_for_models, profit_scoring
from customClassifiers import CustomModelWithThreshold, TrustHard, PerceptronLearner
from pipes import CustomAttributeAdder,Scaling,RandomAttributeAdder,Transformer,ClfSwitcher

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import VotingClassifier

In [3]:
# use sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import itertools

### Evaluation Function 

In [4]:
# Nico's script
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

cv = StratifiedKFold(n_splits=10, random_state=42)
def profit_scorer(y, y_pred):
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))
profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

### Load the Data

In [5]:
# load the data
df_train = pd.read_csv('train.csv' ,delimiter="|")
df_test = pd.read_csv('test.csv', delimiter="|")
X_train, y_train = df_train.drop(columns='fraud'), df_train['fraud']
#y_test = test.pop('fraud')

In [6]:
print(X_train.shape)
X_train.head(3)

(1879, 9)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769


# trustLevel==1

### Split the Data

In [59]:
# create X_train trustLevel==1
df_train = pd.read_csv('train.csv' ,delimiter="|")

is_trust1 = df_train['trustLevel']==1
df_train_trust1 = df_train[is_trust1]

X_train_trust1, y_train_trust1 = df_train_trust1.drop(columns='fraud'), df_train_trust1['fraud']

print(X_train_trust1.shape)
print(y_train_trust1.shape)
X_train_trust1.head(3)

(332, 9)
(332,)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077
15,1,870,32.45,3,1,5,0.006897,0.037299,0.5
24,1,71,78.91,1,4,4,0.014085,1.111408,1.0


In [60]:
# show distribution
from collections import Counter

print("training data size trustLevel==1: {}".format(len(y_train_trust1)))
print(sorted(Counter(y_train_trust1).items()))
print("Max. profit: {}".format(89*5))

training data size trustLevel==1: 332
[(0, 243), (1, 89)]
Max. profit: 445


**Note:** Maximal profit equals **445**

### Preprocess the Data

In [9]:
# select one or more out of feature list below that will be added in the featureGenerationPipeline
feature_list = ['scannedLineItemsTotal',
                'valuePerLineItem',
                'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                #'totalScanTimeInSeconds/trustLevel',
                #'trustLevel_Log', 
               ]

**Note:** Only two preprocessing steps at the moment are adding newly designed features (see above) and scaling

In [10]:
featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),                
    #("RandomAttributeAdder", RandomAttributeAdder())         #  This class is still void
    ])


preprocessing_pipeline = Pipeline([
    #("transformer", Transformer()),                           # This class is still void
    ("scaler", Scaling(strategy='Standard')),
])

In [11]:
# combine two pipeline into a single data_preparation_pipeline
data_preparation_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline)
])

X_train_trust1_prepared = data_preparation_pipeline.fit_transform(X_train_trust1)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [12]:
# delete trustLevel feature!
X_train_trust1_prepared = np.delete(X_train_trust1_prepared, 0, 1)

In [13]:
X_train_trust1_prepared.shape

(332, 12)

### Model Selection

In [26]:
# Install the library for Bayesian optimization from here: https://github.com/fmfn/BayesianOptimization
from bayes_opt import BayesianOptimization

### Logistic Regression

In [15]:
def evaluateLogistic(C):
    
    model = LogisticRegression(C=C, solver='liblinear', random_state=42)
    
    return np.mean(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                                scoring=profit_scoring,
                                  #scoring="f1"
                                 )['test_score'])

In [16]:
params_logistic = {
    'C': (1,35),
}

In [None]:
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
optimization_logistic.maximize(n_iter=50, init_points=50)

In [None]:
optimization_logistic.max

In [17]:
sum(cross_validate(LogisticRegression(C=2.32,
                                      solver='liblinear',
                                      random_state=42),
                   X_train_trust1_prepared,
                   y=y_train_trust1,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

335

In [18]:
335/445

0.7528089887640449

**Observation:** For **trustLevel==1** Logistic Regression achieves profit of **335** or **0.75** percent of maximal profit

### SGD

In [None]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    
    # 3 options, l1 by default
    penalty_str = 'l1'
    if int(penalty) == 0:
        penalty_str = 'l2'
    elif int(penalty) == 1:
        penalty_str = 'elasticnet'
    
    # 3 options, modified_huber by default
    loss_str = 'modified_huber'
    if int(loss) == 0:
        loss_str = 'hinge'
    elif int(loss) == 1:
        loss_str = 'log'
        
    
    model = SGDClassifier(alpha=alpha, l1_ratio=l1_ratio, tol=tol,
                          penalty=penalty_str, loss=loss_str, random_state=231)
    
    # This integrates sampling into the training. Trains on oversampled data but evaluates on unsampled data
    #return cross_val_imbalanced(model, X_train_prepared, y_train, RandomOverSampler(random_state=42))
    
    # this trains the classifier on the unbalanced folds
    return sum(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              #scoring="f1",
                              scoring=profit_scoring,
                             )['test_score'])

In [None]:
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [None]:
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
optimization_sgd.maximize(n_iter=200, init_points=20)

In [None]:
optimization_sgd.max

In [19]:
sum(cross_validate(SGDClassifier(alpha=0.448373,
                                 l1_ratio=0.0859590,
                                 tol=0.0403,
                                 penalty='elasticnet',
                                 loss='modified_huber',
                                 random_state=231),
                   X_train_trust1_prepared,
                   y=y_train_trust1,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

320

### BorderLineSMOTE + SGD

In [20]:
# import sampling classes
from imblearn.pipeline import make_pipeline

# oversampling
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE

# combination of over and undersampling
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [21]:
def evaluateSgd(alpha,
                l1_ratio,
                tol,
                n_neighbors,
                #k_neighbors,
                #m_neighbors
               ):
    
    sampler = RandomOverSampler(random_state=231,
                     #n_neighbors=int(n_neighbors),
                     #sampling_strategy='all',
                     #k_neighbors=int(k_neighbors),
                     #m_neighbors=int(m_neighbors)
                    )
    
    model = SGDClassifier(alpha=alpha,
                          l1_ratio=l1_ratio,
                          tol=tol,
                          penalty='elasticnet',
                          loss='modified_huber',
                          random_state=231)
    
    sampler_model_pipeline = make_pipeline(sampler,
                                          model)
    
    # this trains the classifier on the unbalanced folds
    return sum(cross_validate(sampler_model_pipeline, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              #scoring="f1",
                              scoring=profit_scoring,
                             )['test_score'])

In [None]:
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'n_neighbors': (2, 20),
    #'k_neighbors': (2, 20),
    #'m_neighbors': (2, 20)
}

In [None]:
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
optimization_sgd.maximize(n_iter=200, init_points=50)

In [None]:
optimization_sgd.max

### AdaBoost + Logistic Regression 

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
def evaluateAda(C,
                n_estimators,
                learning_rate,
               ):
    
    
    model = AdaBoostClassifier(base_estimator=LogisticRegression(C=C,
                                                                 solver='liblinear',
                                                                 random_state=42),
                               n_estimators=int(n_estimators),
                               learning_rate=learning_rate,
                               random_state=231)
    
    # this trains the classifier on the unbalanced folds
    return sum(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              #scoring="f1",
                              scoring=profit_scoring,
                             )['test_score'])

In [None]:
params_ada = {
    'C': (1, 100),
    'n_estimators': (50, 500),
    'learning_rate': (0.01, 1),
}

In [None]:
optimization_ada = BayesianOptimization(evaluateAda, params_ada, random_state=231)
optimization_ada.maximize(n_iter=200, init_points=20)

In [None]:
optimization_ada.max

### XGB - booster:'gblinear'

In [None]:
from xgboost import XGBClassifier

In [None]:
def evaluateXGB(reg_alpha,
                reg_lambda,
                #subsample, colsample_bytree,
                #gamma,
                #max_depth,
                #min_child_weight,
                n_estimators,
               ):
    
    model = XGBClassifier(reg_lambda=reg_lambda,
                          reg_alpha=reg_alpha,
                          updater='coord_descent',
                          learning_rate =0.01,
                          n_estimators=int(n_estimators),
                          
                          #max_depth=int(max_depth),
                          #min_child_weight=int(min_child_weight),
                          #gamma=gamma,
                          #subsample=subsample,
                          #colsample_bytree=colsample_bytree,
                          #reg_alpha=reg_alpha,
                          #reg_lambda=reg_lambda,
                          
                          objective= 'binary:logistic',
                          booster='gblinear',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231)
    
    return sum(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              scoring=profit_scoring,
                              #scoring="f1",
                              #scoring="precision"
                              n_jobs=-1)['test_score'])

In [None]:
params_XGB = {
    'n_estimators': (50,1000),
    #'max_depth':(3,10.5),
    #'min_child_weight':(1,6.5),
    #'gamma': (0,0.6),
    #'subsample':(0.6,1),
    #'colsample_bytree':(0.6,1),
    'reg_alpha':(0, 100),
    'reg_lambda':(0, 100)
}

In [None]:
optimization_XGB = BayesianOptimization(evaluateXGB, params_XGB, random_state=42)
optimization_XGB.maximize(n_iter=200, init_points=50)

In [None]:
optimization_XGB.max

### XGB - booster:'gbtree'

In [None]:
from xgboost import XGBClassifier

In [None]:
def evaluateXGB(reg_alpha,
                #reg_lambda,
                subsample, colsample_bytree,
                gamma,
                max_depth,
                min_child_weight,
                n_estimators,
               ):
    
    model = XGBClassifier(learning_rate =0.01,
                          n_estimators=int(n_estimators),
                          max_depth=int(max_depth),
                          min_child_weight=int(min_child_weight),
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          #reg_lambda=reg_lambda,
                          objective= 'binary:logistic',
                          booster='gbtree',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231)
    
    return sum(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              scoring=profit_scoring,
                              #scoring="f1",
                              #scoring="precision"
                              n_jobs=-1)['test_score'])

In [None]:
params_XGB = {
    'n_estimators': (1,1000),
    'max_depth':(3,10.5),
    'min_child_weight':(1,6.5),
    'gamma': (0,0.6),
    'subsample':(0.6,1),
    'colsample_bytree':(0.6,1),
    'reg_alpha':(0.0005, 100),
    #'reg_lambda':(0.3, 0.7)
}

In [None]:
optimization_XGB = BayesianOptimization(evaluateXGB, params_XGB, random_state=42)
optimization_XGB.maximize(n_iter=200, init_points=50)

In [None]:
optimization_XGB.max

# trustLevel==2

### Split the Data

In [22]:
# create X_train trustLevel==2
df_train = pd.read_csv('train.csv' ,delimiter="|")

is_trust2 = df_train['trustLevel']==2
df_train_trust2 = df_train[is_trust2]

X_train_trust2, y_train_trust2 = df_train_trust2.drop(columns='fraud'), df_train_trust2['fraud']

print(X_train_trust2.shape)
print(y_train_trust2.shape)
X_train_trust2.head(3)

(347, 9)
(347,)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
7,2,1545,22.8,0,8,4,0.006472,0.014757,0.0
9,2,725,41.08,10,2,4,0.037241,0.056662,0.37037
23,2,125,25.5,5,6,2,0.192,0.204,0.208333


In [23]:
# show distribution
from collections import Counter

print("training data size trustLevel==1: {}".format(len(y_train_trust2)))
print(sorted(Counter(y_train_trust2).items()))
print("Max. profit: {}".format(15*5))

training data size trustLevel==1: 347
[(0, 332), (1, 15)]
Max. profit: 75


**Note:** Maximal profit equals **75**

### Preprocess the Data

In [24]:
# select one or more out of feature list below that will be added in the featureGenerationPipeline
feature_list = ['scannedLineItemsTotal',
                'valuePerLineItem',
                'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                #'totalScanTimeInSeconds/trustLevel',
                #'trustLevel_Log', 
               ]

**Note:** Only two preprocessing steps at the moment are adding newly designed features (see above) and scaling

In [25]:
featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),                
    #("RandomAttributeAdder", RandomAttributeAdder())         #  This class is still void
    ])


preprocessing_pipeline = Pipeline([
    #("transformer", Transformer()),                           # This class is still void
    ("scaler", Scaling(strategy='Standard')),
])

In [26]:
# combine two pipeline into a single data_preparation_pipeline
data_preparation_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline)
])

X_train_trust2_prepared = data_preparation_pipeline.fit_transform(X_train_trust2)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [27]:
# delete trust level
X_train_trust2_prepared = np.delete(X_train_trust2_prepared, 0, 1)

In [28]:
X_train_trust2_prepared.shape

(347, 12)

### Model Selection

### Logistic Regression

In [None]:
def evaluateLogistic(C):
    
    model = LogisticRegression(C=C, solver='liblinear', random_state=42)
    
    return np.mean(cross_validate(model, X_train_trust2_prepared, y=y_train_trust2, cv=cv,
                              #scoring=profit_scoring,
                             scoring="precision",
                             )['test_score'])

In [None]:
params_logistic = {
    'C': (1,1000),
}

In [None]:
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
optimization_logistic.maximize(n_iter=50, init_points=50)

In [None]:
optimization_logistic.max

In [29]:
sum(cross_validate(LogisticRegression(C=140.91,
                                      solver='liblinear',
                                      random_state=42),
                   X_train_trust2_prepared,
                   y=y_train_trust2,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

-55

**Observation:** For **trustLevel==2** Logistic Regression achieves profit of **-55**

### SGD

In [None]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    
    # 3 options, l1 by default
    penalty_str = 'l1'
    if int(penalty) == 0:
        penalty_str = 'l2'
    elif int(penalty) == 1:
        penalty_str = 'elasticnet'
    
    # 3 options, modified_huber by default
    loss_str = 'modified_huber'
    if int(loss) == 0:
        loss_str = 'hinge'
    elif int(loss) == 1:
        loss_str = 'log'
        
    
    model = SGDClassifier(alpha=alpha, l1_ratio=l1_ratio, tol=tol,
                          penalty=penalty_str, loss=loss_str, random_state=231)
    
    # This integrates sampling into the training. Trains on oversampled data but evaluates on unsampled data
    #return cross_val_imbalanced(model, X_train_prepared, y_train, RandomOverSampler(random_state=42))
    
    # this trains the classifier on the unbalanced folds
    return sum(cross_validate(model, X_train_trust2_prepared, y=y_train_trust2, cv=cv,
                              scoring=profit_scoring)['test_score'])

In [None]:
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [None]:
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
optimization_sgd.maximize(n_iter=200, init_points=20)

In [None]:
optimization_sgd.max

In [30]:
sum(cross_validate(SGDClassifier(alpha=0.002964,
                                 l1_ratio=0.702337,
                                 tol=0.022649,
                                 penalty='elasticnet',
                                 loss='modified_huber',
                                 random_state=231),
                   X_train_trust2_prepared,
                   y=y_train_trust2,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

10

### XGB

In [31]:
from xgboost import XGBClassifier

In [None]:
def evaluateXGB(reg_alpha,
                #reg_lambda,
                subsample, colsample_bytree,
                gamma,
                max_depth,
                #min_child_weight,
                n_estimators,
               ):
    
    model = XGBClassifier(learning_rate =0.01,
                          n_estimators=int(n_estimators),
                          max_depth=int(max_depth),
                          min_child_weight=1,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          #reg_lambda=reg_lambda,
                          objective= 'binary:logistic',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231)
    
    return sum(cross_validate(model, X_train_trust2_prepared, y=y_train_trust2, cv=cv,
                              scoring=profit_scoring,
                              #scoring="f1",
                              #scoring="precision"
                              n_jobs=-1)['test_score'])

In [None]:
params_XGB = {
    'n_estimators': (500,2500),
    'max_depth':(7.5,9.5),
    #'min_child_weight':(1,7),
    'gamma': (0,0.1),
    'subsample':(0.85,1),
    'colsample_bytree':(0.85,1),
    'reg_alpha':(0.001, 1),
    #'reg_lambda':(0.3, 0.7)
}

In [None]:
optimization_XGB = BayesianOptimization(evaluateXGB, params_XGB, random_state=42)
optimization_XGB.maximize(n_iter=200, init_points=50)

In [None]:
optimization_XGB.max

In [32]:
sum(cross_validate(XGBClassifier(learning_rate =0.1,
                          n_estimators=331,
                          max_depth=8,
                          min_child_weight=1,
                          gamma=0.032443,
                          subsample=0.905887,
                          colsample_bytree=0.93342,
                          reg_alpha=0.782748,
                          #reg_lambda=0.5123,
                          objective= 'binary:logistic',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231),
                   X_train_trust2_prepared,
                   y=y_train_trust2,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

25

# trustLevel1 & 2

### Split the Data

In [33]:
# create X_train trustLevel==1
df_train = pd.read_csv('train.csv' ,delimiter="|")

is_trust12 = df_train['trustLevel']<=2
df_train_trust12 = df_train[is_trust12]

X_train_trust12, y_train_trust12 = df_train_trust12.drop(columns='fraud'), df_train_trust12['fraud']

print(X_train_trust12.shape)
print(y_train_trust12.shape)
X_train_trust12.head(3)

(679, 9)
(679,)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077
7,2,1545,22.8,0,8,4,0.006472,0.014757,0.0
9,2,725,41.08,10,2,4,0.037241,0.056662,0.37037


In [34]:
# show distribution
from collections import Counter

print("training data size trustLevel==1&2 combined: {}".format(len(y_train_trust12)))
print(sorted(Counter(y_train_trust12).items()))
print("Max. Profit: {}".format(104*5))

training data size trustLevel==1&2 combined: 679
[(0, 575), (1, 104)]
Max. Profit: 520


**Note:** Maximal profit equals **520**

In [35]:
# keep trustLevel as a feature but recode it to be binary [0,1]
X_train_trust12['trustLevel'].replace(to_replace=1, value=0, inplace=True)
X_train_trust12['trustLevel'].replace(to_replace=2, value=1, inplace=True)

In [36]:
X_train_trust12.head(3)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
5,0,770,11.09,11,5,2,0.033766,0.014403,0.423077
7,1,1545,22.8,0,8,4,0.006472,0.014757,0.0
9,1,725,41.08,10,2,4,0.037241,0.056662,0.37037


### Preprocess the Data

In [37]:
# select one or more out of feature list below that will be added in the featureGenerationPipeline
feature_list = ['scannedLineItemsTotal',
                'valuePerLineItem',
                'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                #'totalScanTimeInSeconds/trustLevel',
                #'trustLevel_Log', 
               ]

**Note:** Only two preprocessing steps at the moment are adding newly designed features (see above) and scaling

In [38]:
featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),                
    #("RandomAttributeAdder", RandomAttributeAdder())         #  This class is still void
    ])

**Note:** This is a slight change to the preprocesses pipeline to treat trustLevel as a categorical feature and do not apply scaling to it (I'm pretty sure there is a smarter way of achieving this)

In [39]:
# define a class: that will take pandas dataframe select columns and convert to numpy array
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [40]:
X_train_trust12_num = X_train_trust12.drop(columns='trustLevel')
X_train_trust12_num.columns

Index(['totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition'],
      dtype='object')

In [41]:
# use DataFrameSelector
num_attributes = list(X_train_trust12_num)
num_attributes.append('scannedLineItemsTotal')
num_attributes.append('valuePerLineItem')
num_attributes.append('quantityModificationsPerLineItem')
num_attributes.append('lineItemVoids*scansWithoutRegistration')
cat_attributes = ["trustLevel"]

In [42]:
num_attributes

['totalScanTimeInSeconds',
 'grandTotal',
 'lineItemVoids',
 'scansWithoutRegistration',
 'quantityModifications',
 'scannedLineItemsPerSecond',
 'valuePerSecond',
 'lineItemVoidsPerPosition',
 'scannedLineItemsTotal',
 'valuePerLineItem',
 'quantityModificationsPerLineItem',
 'lineItemVoids*scansWithoutRegistration']

In [43]:
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attributes)),
    ("scaler", Scaling(strategy='Standard')),
    ])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attributes)),
    ])


In [44]:
# combining numeric and categorical preprocessing with FeatureUnion
from sklearn.pipeline import FeatureUnion

preprocessing_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [45]:
# combine two pipeline into a single data_preparation_pipeline
data_preparation_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline)
])

X_train_trust12_prepared = data_preparation_pipeline.fit_transform(X_train_trust12)

In [46]:
X_train_trust12_prepared.shape

(679, 13)

In [47]:
X_train_trust12_prepared

array([[-0.26493495, -1.27906871,  1.59381746, ..., -0.42462227,
         1.09821241,  0.        ],
       [ 1.18451551, -0.8789435 , -1.61919808, ...,  0.05885284,
        -1.02185699,  1.        ],
       [-0.34909659, -0.25432447,  1.30172513, ..., -0.31803604,
        -0.25092266,  1.        ],
       ...,
       [-1.10468108,  0.93990064,  0.71754049, ..., -0.40960751,
         1.13675913,  0.        ],
       [-0.96254142, -0.22664715, -0.15873648, ..., -0.53973539,
        -0.05818908,  0.        ],
       [-0.42390693,  0.4837374 , -1.32710576, ..., -0.41502951,
        -0.79057669,  1.        ]])

## XGB

In [30]:
from xgboost import XGBClassifier

In [31]:
def evaluateXGB(reg_alpha,
                #reg_lambda,
                subsample, colsample_bytree,
                gamma,
                max_depth,
                min_child_weight,
                n_estimators,
               ):
    
    model = XGBClassifier(learning_rate =0.1,
                          n_estimators=int(n_estimators),
                          max_depth=int(max_depth),
                          min_child_weight=(min_child_weight),
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha,
                          #reg_lambda=reg_lambda,
                          objective= 'binary:logistic',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231)
    
    return sum(cross_validate(model, X_train_trust12_prepared, y=y_train_trust12, cv=cv,
                              scoring=profit_scoring,
                              #scoring="f1",
                              #scoring="precision"
                              n_jobs=-1)['test_score'])

In [32]:
params_XGB = {
    'n_estimators': (50,500),
    'max_depth':(3,10.5),
    'min_child_weight':(1,7),
    'gamma': (0,1),
    'subsample':(0.6,1),
    'colsample_bytree':(0.6,1),
    'reg_alpha':(0.001, 100),
    #'reg_lambda':(0.3, 0.7)
}

In [33]:
optimization_XGB = BayesianOptimization(evaluateXGB, params_XGB, random_state=42)
optimization_XGB.maximize(n_iter=200, init_points=50)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | n_esti... | reg_alpha | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-55.0    [0m | [0m 0.7498  [0m | [0m 0.9507  [0m | [0m 8.49    [0m | [0m 4.592   [0m | [0m 120.2   [0m | [0m 15.6    [0m | [0m 0.6232  [0m |
| [0m 2       [0m | [0m-520.0   [0m | [0m 0.9465  [0m | [0m 0.6011  [0m | [0m 8.311   [0m | [0m 1.124   [0m | [0m 486.5   [0m | [0m 83.24   [0m | [0m 0.6849  [0m |
| [0m 3       [0m | [0m-430.0   [0m | [0m 0.6727  [0m | [0m 0.1834  [0m | [0m 5.282   [0m | [0m 4.149   [0m | [0m 244.4   [0m | [0m 29.12   [0m | [0m 0.8447  [0m |
| [95m 4       [0m | [95m-45.0    [0m | [95m 0.6558  [0m | [95m 0.2921  [0m | [95m 5.748   [0m | [95m 3.736   [0m | [95m 403.3   [0m | [95m 19.97   [0m | [95m 0.8057  [0m |
| [0m 5       [0m | [0m-520.0   [0m | 

| [0m 45      [0m | [0m-520.0   [0m | [0m 0.9181  [0m | [0m 0.2708  [0m | [0m 6.292   [0m | [0m 1.471   [0m | [0m 61.41   [0m | [0m 96.26   [0m | [0m 0.9344  [0m |
| [0m 46      [0m | [0m-520.0   [0m | [0m 0.8784  [0m | [0m 0.409   [0m | [0m 4.3     [0m | [0m 1.939   [0m | [0m 162.6   [0m | [0m 54.92   [0m | [0m 0.8858  [0m |
| [0m 47      [0m | [0m-520.0   [0m | [0m 0.8641  [0m | [0m 0.2799  [0m | [0m 10.16   [0m | [0m 5.427   [0m | [0m 299.5   [0m | [0m 61.17   [0m | [0m 0.7678  [0m |
| [0m 48      [0m | [0m 40.0    [0m | [0m 0.6991  [0m | [0m 0.356   [0m | [0m 8.684   [0m | [0m 1.086   [0m | [0m 102.2   [0m | [0m 4.601   [0m | [0m 0.6163  [0m |
| [0m 49      [0m | [0m-520.0   [0m | [0m 0.9422  [0m | [0m 0.7037  [0m | [0m 6.556   [0m | [0m 1.587   [0m | [0m 271.2   [0m | [0m 47.35   [0m | [0m 0.6693  [0m |
| [0m 50      [0m | [0m-520.0   [0m | [0m 0.7735  [0m | [0m 0.3985  [0m | [0m 7.6

| [95m 90      [0m | [95m 220.0   [0m | [95m 0.6059  [0m | [95m 0.3272  [0m | [95m 4.661   [0m | [95m 1.751   [0m | [95m 309.9   [0m | [95m 1.691   [0m | [95m 0.725   [0m |
| [0m 91      [0m | [0m 125.0   [0m | [0m 0.8519  [0m | [0m 0.868   [0m | [0m 5.43    [0m | [0m 2.008   [0m | [0m 311.1   [0m | [0m 2.506   [0m | [0m 0.8394  [0m |
| [0m 92      [0m | [0m 130.0   [0m | [0m 0.6112  [0m | [0m 0.1496  [0m | [0m 7.16    [0m | [0m 2.314   [0m | [0m 177.8   [0m | [0m 3.797   [0m | [0m 0.8792  [0m |
| [0m 93      [0m | [0m 90.0    [0m | [0m 0.6975  [0m | [0m 0.2712  [0m | [0m 6.944   [0m | [0m 1.835   [0m | [0m 410.4   [0m | [0m 7.285   [0m | [0m 0.9336  [0m |
| [0m 94      [0m | [0m 130.0   [0m | [0m 0.9319  [0m | [0m 0.08713 [0m | [0m 6.282   [0m | [0m 1.712   [0m | [0m 309.9   [0m | [0m 4.19    [0m | [0m 0.6807  [0m |
| [0m 95      [0m | [0m-30.0    [0m | [0m 0.8335  [0m | [0m 0.9356  [0m |

| [0m 135     [0m | [0m 145.0   [0m | [0m 0.7764  [0m | [0m 0.3223  [0m | [0m 6.079   [0m | [0m 1.907   [0m | [0m 310.5   [0m | [0m 3.051   [0m | [0m 0.7077  [0m |
| [0m 136     [0m | [0m 185.0   [0m | [0m 0.7603  [0m | [0m 0.5984  [0m | [0m 5.047   [0m | [0m 2.824   [0m | [0m 308.0   [0m | [0m 2.715   [0m | [0m 0.6808  [0m |
| [0m 137     [0m | [0m 160.0   [0m | [0m 0.987   [0m | [0m 0.167   [0m | [0m 6.088   [0m | [0m 2.699   [0m | [0m 308.4   [0m | [0m 3.897   [0m | [0m 0.6173  [0m |
| [0m 138     [0m | [0m 105.0   [0m | [0m 0.7919  [0m | [0m 0.7134  [0m | [0m 4.591   [0m | [0m 1.247   [0m | [0m 307.6   [0m | [0m 3.775   [0m | [0m 0.7924  [0m |
| [0m 139     [0m | [0m 100.0   [0m | [0m 0.8339  [0m | [0m 0.5315  [0m | [0m 6.04    [0m | [0m 1.239   [0m | [0m 311.2   [0m | [0m 3.935   [0m | [0m 0.7476  [0m |
| [0m 140     [0m | [0m 90.0    [0m | [0m 0.6314  [0m | [0m 0.2532  [0m | [0m 6.7

| [0m 181     [0m | [0m 195.0   [0m | [0m 0.981   [0m | [0m 0.6171  [0m | [0m 3.425   [0m | [0m 1.313   [0m | [0m 348.5   [0m | [0m 0.2262  [0m | [0m 0.6899  [0m |
| [0m 182     [0m | [0m 120.0   [0m | [0m 0.784   [0m | [0m 0.1646  [0m | [0m 4.352   [0m | [0m 2.026   [0m | [0m 308.2   [0m | [0m 3.033   [0m | [0m 0.853   [0m |
| [0m 183     [0m | [0m 160.0   [0m | [0m 0.9453  [0m | [0m 0.1447  [0m | [0m 5.113   [0m | [0m 2.174   [0m | [0m 308.3   [0m | [0m 2.996   [0m | [0m 0.7827  [0m |
| [0m 184     [0m | [0m 85.0    [0m | [0m 0.6544  [0m | [0m 0.4095  [0m | [0m 5.977   [0m | [0m 2.512   [0m | [0m 309.7   [0m | [0m 4.166   [0m | [0m 0.9882  [0m |
| [0m 185     [0m | [0m 45.0    [0m | [0m 0.7856  [0m | [0m 0.3149  [0m | [0m 8.884   [0m | [0m 2.497   [0m | [0m 330.5   [0m | [0m 9.278   [0m | [0m 0.89    [0m |
| [0m 186     [0m | [0m 120.0   [0m | [0m 0.9978  [0m | [0m 0.8345  [0m | [0m 4.6

| [0m 227     [0m | [0m-90.0    [0m | [0m 0.9913  [0m | [0m 0.4703  [0m | [0m 4.395   [0m | [0m 4.816   [0m | [0m 247.9   [0m | [0m 13.07   [0m | [0m 0.6961  [0m |
| [0m 228     [0m | [0m 155.0   [0m | [0m 0.8123  [0m | [0m 0.07254 [0m | [0m 5.56    [0m | [0m 1.427   [0m | [0m 310.1   [0m | [0m 2.978   [0m | [0m 0.684   [0m |
| [0m 229     [0m | [0m 110.0   [0m | [0m 0.7001  [0m | [0m 0.6939  [0m | [0m 4.884   [0m | [0m 1.526   [0m | [0m 311.5   [0m | [0m 2.748   [0m | [0m 0.7694  [0m |
| [0m 230     [0m | [0m 155.0   [0m | [0m 0.8152  [0m | [0m 0.4726  [0m | [0m 5.098   [0m | [0m 1.099   [0m | [0m 309.5   [0m | [0m 2.811   [0m | [0m 0.6634  [0m |
| [0m 231     [0m | [0m 155.0   [0m | [0m 0.7556  [0m | [0m 0.6531  [0m | [0m 5.612   [0m | [0m 2.004   [0m | [0m 311.8   [0m | [0m 2.199   [0m | [0m 0.8569  [0m |
| [0m 232     [0m | [0m 100.0   [0m | [0m 0.6231  [0m | [0m 0.2455  [0m | [0m 5.1

In [34]:
optimization_XGB.max

{'target': 220.0,
 'params': {'colsample_bytree': 0.605873136934962,
  'gamma': 0.327183911209176,
  'max_depth': 4.660877147619236,
  'min_child_weight': 1.7514049980791158,
  'n_estimators': 309.86651749531757,
  'reg_alpha': 1.691261657814916,
  'subsample': 0.7249958039005243}}

In [50]:
sum(cross_validate(XGBClassifier(learning_rate =0.1,
                          n_estimators=308,
                          max_depth=4,
                          min_child_weight=1,
                          gamma=0.327183911209176,
                          subsample=0.7249958039005243,
                          colsample_bytree=0.605873136934962,
                          reg_alpha=1.691261657814916,
                          #reg_lambda=0.5123,
                          objective= 'binary:logistic',
                          n_jobs=-1,
                          scale_pos_weight=1,
                          seed=231),
                   X_train_trust12_prepared,
                   y=y_train_trust12,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

155

### Logistic Regression

In [51]:
def evaluateLogistic(C):
    
    model = LogisticRegression(C=C, solver='liblinear', random_state=42)
    
    return sum(cross_validate(model, X_train_trust12_prepared, y=y_train_trust12, cv=cv,
                             scoring=profit_scoring,
                             #scoring="precision",
                             )['test_score'])

In [52]:
params_logistic = {
    'C': (1,1000),
}

In [53]:
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
optimization_logistic.maximize(n_iter=100, init_points=50)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 260.0   [0m | [0m 781.7   [0m |
| [95m 2       [0m | [95m 270.0   [0m | [95m 584.6   [0m |
| [0m 3       [0m | [0m 270.0   [0m | [0m 427.3   [0m |
| [0m 4       [0m | [0m 270.0   [0m | [0m 316.0   [0m |
| [0m 5       [0m | [0m 260.0   [0m | [0m 827.1   [0m |
| [0m 6       [0m | [0m 260.0   [0m | [0m 903.5   [0m |
| [95m 7       [0m | [95m 330.0   [0m | [95m 39.9    [0m |
| [0m 8       [0m | [0m 260.0   [0m | [0m 915.3   [0m |
| [0m 9       [0m | [0m 330.0   [0m | [0m 61.44   [0m |
| [0m 10      [0m | [0m 280.0   [0m | [0m 181.5   [0m |
| [0m 11      [0m | [0m 330.0   [0m | [0m 73.23   [0m |
| [0m 12      [0m | [0m 270.0   [0m | [0m 459.3   [0m |
| [0m 13      [0m | [0m 260.0   [0m | [0m 708.7   [0m |
| [0m 14      [0m | [0m 270.0   [0m | [0m 483.8   [0m |
| [0m 15      [0m | [0m 330.0   [0m | [0m 26.

| [0m 132     [0m | [0m 330.0   [0m | [0m 66.74   [0m |
| [0m 133     [0m | [0m 330.0   [0m | [0m 72.36   [0m |
| [0m 134     [0m | [0m 330.0   [0m | [0m 37.58   [0m |
| [0m 135     [0m | [0m 330.0   [0m | [0m 41.57   [0m |
| [0m 136     [0m | [0m 330.0   [0m | [0m 64.09   [0m |
| [0m 137     [0m | [0m 330.0   [0m | [0m 25.46   [0m |
| [0m 138     [0m | [0m 330.0   [0m | [0m 24.87   [0m |
| [0m 139     [0m | [0m 330.0   [0m | [0m 25.64   [0m |
| [0m 140     [0m | [0m 330.0   [0m | [0m 24.14   [0m |
| [0m 141     [0m | [0m 330.0   [0m | [0m 24.82   [0m |
| [0m 142     [0m | [0m 330.0   [0m | [0m 23.38   [0m |
| [0m 143     [0m | [0m 330.0   [0m | [0m 24.07   [0m |
| [0m 144     [0m | [0m 330.0   [0m | [0m 22.56   [0m |
| [0m 145     [0m | [0m 330.0   [0m | [0m 23.23   [0m |
| [0m 146     [0m | [0m 330.0   [0m | [0m 21.69   [0m |
| [0m 147     [0m | [0m 330.0   [0m | [0m 22.45   [0m |
| [0m 1

In [54]:
optimization_logistic.max

{'target': 330.0, 'params': {'C': 39.89916730939222}}

In [56]:
sum(cross_validate(LogisticRegression(C=39.89916730939222,
                                      solver='liblinear',
                                      random_state=42),
                   X_train_trust12_prepared,
                   y=y_train_trust12,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

330

# Add additional Features - Interaction Effects

# PolynomialFeatures - SelectKBest for trustLevel==1 

### Split the Data

In [21]:
# create X_train trustLevel==1
df_train = pd.read_csv('train.csv' ,delimiter="|")

is_trust1 = df_train['trustLevel']==1
df_train_trust1 = df_train[is_trust1]

X_train_trust1, y_train_trust1 = df_train_trust1.drop(columns='fraud'), df_train_trust1['fraud']

print(X_train_trust1.shape)
print(y_train_trust1.shape)
X_train_trust1.head(3)

(332, 9)
(332,)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077
15,1,870,32.45,3,1,5,0.006897,0.037299,0.5
24,1,71,78.91,1,4,4,0.014085,1.111408,1.0


In [22]:
X_train_trust1.head(3)
X_train_trust1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 5 to 1875
Data columns (total 9 columns):
trustLevel                   332 non-null int64
totalScanTimeInSeconds       332 non-null int64
grandTotal                   332 non-null float64
lineItemVoids                332 non-null int64
scansWithoutRegistration     332 non-null int64
quantityModifications        332 non-null int64
scannedLineItemsPerSecond    332 non-null float64
valuePerSecond               332 non-null float64
lineItemVoidsPerPosition     332 non-null float64
dtypes: float64(4), int64(5)
memory usage: 25.9 KB


In [23]:
# show distribution
from collections import Counter

print("training data size trustLevel==1: {}".format(len(y_train_trust1)))
print(sorted(Counter(y_train_trust1).items()))
print("Max. profit: {}".format(89*5))

training data size trustLevel==1: 332
[(0, 243), (1, 89)]
Max. profit: 445


**Note:** Maximal profit equals **445**

### Preprocess the Data

In [24]:
# drop trustLevel
X_train_trust1 = X_train_trust1.drop(columns='trustLevel')

**Note:** Only two preprocessing steps at the moment are adding newly designed features (see above) and scaling

In [25]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif

In [26]:
preprocessing_pipeline = Pipeline([
    #("transformer", Transformer()),                           # This class is still void
    #("interactions", PolynomialFeatures(2, interaction_only=True)),
    ("interactions", PolynomialFeatures(2, interaction_only=False)),
    ("scaler1", StandardScaler()),
    ("scaler", MinMaxScaler()),
    ("featureSelection", SelectKBest(mutual_info_classif, k=8)),
])

In [27]:
# apply new pipeline
X_train_trust1_prepared = preprocessing_pipeline.fit_transform(X_train_trust1, y=y_train_trust1)

In [28]:
# delete trustLevel feature!
#X_train_trust1_prepared = np.delete(X_train_trust1_prepared, 0, 1)

In [29]:
X_train_trust1_prepared.shape

(332, 6)

In [30]:
X_train_trust1_prepared

array([[2.06232423e-02, 4.23076923e-02, 8.62068966e-01, 4.65384615e-02,
        4.39135492e-04, 7.15398485e-05],
       [3.93980009e-03, 5.00000000e-02, 1.72413793e-01, 1.50000000e-02,
        1.82066175e-05, 2.18954215e-04],
       [8.40281669e-03, 1.00000000e-01, 1.11022302e-16, 1.00000000e-02,
        7.63073803e-05, 1.30485289e-02],
       ...,
       [1.30031476e-03, 2.33333333e-01, 6.89655172e-02, 1.63333333e-01,
        2.57920517e-06, 3.50525002e-04],
       [4.41460295e-02, 3.47826087e-02, 7.58620690e-01, 2.78260870e-02,
        1.97773894e-03, 9.67230464e-04],
       [4.03213306e-02, 1.92307692e-02, 8.62068966e-01, 9.61538462e-03,
        1.65228125e-03, 2.38234343e-04]])

### Model Selection

In [31]:
# Install the library for Bayesian optimization from here: https://github.com/fmfn/BayesianOptimization
from bayes_opt import BayesianOptimization

### Logistic Regression

In [32]:
def evaluateLogistic(C):
    
    model = LogisticRegression(C=C, solver='liblinear', random_state=42)
    
    return sum(cross_validate(model, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                                scoring=profit_scoring,
                                  #scoring="f1"
                                 )['test_score'])

In [33]:
params_logistic = {
    'C': (1,1000),
}

In [34]:
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
optimization_logistic.maximize(n_iter=100, init_points=50)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 60.0    [0m | [0m 781.7   [0m |
| [0m 2       [0m | [0m 60.0    [0m | [0m 584.6   [0m |
| [95m 3       [0m | [95m 85.0    [0m | [95m 427.3   [0m |
| [0m 4       [0m | [0m 85.0    [0m | [0m 316.0   [0m |
| [0m 5       [0m | [0m 60.0    [0m | [0m 827.1   [0m |
| [0m 6       [0m | [0m 60.0    [0m | [0m 903.5   [0m |
| [0m 7       [0m | [0m-40.0    [0m | [0m 39.9    [0m |
| [0m 8       [0m | [0m 60.0    [0m | [0m 915.3   [0m |
| [0m 9       [0m | [0m-15.0    [0m | [0m 61.44   [0m |
| [0m 10      [0m | [0m 70.0    [0m | [0m 181.5   [0m |
| [0m 11      [0m | [0m-15.0    [0m | [0m 73.23   [0m |
| [0m 12      [0m | [0m 60.0    [0m | [0m 459.3   [0m |
| [0m 13      [0m | [0m 60.0    [0m | [0m 708.7   [0m |
| [0m 14      [0m | [0m 60.0    [0m | [0m 483.8   [0m |
| [0m 15      [0m | [0m-40.0    [0m | [0m 26.42 

KeyboardInterrupt: 

In [None]:
optimization_logistic.max

In [17]:
sum(cross_validate(LogisticRegression(C=2.32,
                                      solver='liblinear',
                                      random_state=42),
                   X_train_trust1_prepared,
                   y=y_train_trust1,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

335

In [18]:
335/445

0.7528089887640449

**Observation:** For **trustLevel==1** Logistic Regression achieves profit of **335** or **0.75** percent of maximal profit

# SELECT FROM MODEL for trustLevel==1

### Split the Data

In [168]:
# create X_train trustLevel==1
df_train = pd.read_csv('train.csv' ,delimiter="|")

is_trust1 = df_train['trustLevel']==1
df_train_trust1 = df_train[is_trust1]

X_train_trust1, y_train_trust1 = df_train_trust1.drop(columns='fraud'), df_train_trust1['fraud']

print(X_train_trust1.shape)
print(y_train_trust1.shape)
X_train_trust1.head(3)

(332, 9)
(332,)


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077
15,1,870,32.45,3,1,5,0.006897,0.037299,0.5
24,1,71,78.91,1,4,4,0.014085,1.111408,1.0


In [169]:
# show distribution
from collections import Counter

print("training data size trustLevel==1: {}".format(len(y_train_trust1)))
print(sorted(Counter(y_train_trust1).items()))
print("Max. profit: {}".format(89*5))

training data size trustLevel==1: 332
[(0, 243), (1, 89)]
Max. profit: 445


**Note:** Maximal profit equals **445**

In [170]:
# drop trustLevel
X_train_trust1 = X_train_trust1.drop(columns='trustLevel')

### Preprocess the Data

List of features that show at least some correlation with fraud according to Niklas' and Yaxi's notebooks:
- scannedLineItemsTotal * totalScanTimeInSeconds
- scannedLineItemsTotal * scansWithoutRegistration
- lineItemVoids * scannedLineItemsTotal
- grandTotal * scannedLineItemsTotal

In [171]:
# select one or more out of feature list below that will be added in the featureGenerationPipeline
feature_list = ['scannedLineItemsTotal',
                'valuePerLineItem',
                'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                #'totalScanTimeInSeconds/trustLevel',
                #'trustLevel_Log', 
               ]

**Note:** Only two preprocessing steps at the moment are adding newly designed features (see above) and scaling

In [172]:
featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),
    ("interactions", PolynomialFeatures(3, interaction_only=False))
    ])


preprocessing_pipeline = Pipeline([
    ("scaler", Scaling(strategy='Standard')),
])

In [173]:
# combine two pipeline into a single data_preparation_pipeline
data_preparation_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline)
])

X_train_trust1_prepared = data_preparation_pipeline.fit_transform(X_train_trust1)

In [174]:
X_train_trust1_prepared.shape

(332, 455)

### Model Selection

In [175]:
# Install the library for Bayesian optimization from here: https://github.com/fmfn/BayesianOptimization
from bayes_opt import BayesianOptimization

### SelectFromModel + Logistic Regression

In [176]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [177]:
# generate fbeta scorer with beta = 0.5 giving more weight to precision as an alternative to cost evaluation
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=0.333)

In [181]:
def evaluateLogistic(C, k,
                    max_depth,
                    #min_child_weight,
                    #gamma,
                    #subsample,
                    #colsample_bytree,
                    #reg_alpha,
                    #n_estimators
                    ):
    
    model = LogisticRegression(C=C, solver='liblinear', random_state=42)
                     
    
    #estimator=RandomForestClassifier(max_depth=3, n_estimators=150, random_state=231),
    #estimator=LinearSVC(penalty="l1", dual=False),
    
    rf = RandomForestClassifier(max_depth=int(max_depth), n_estimators=300, random_state=231,
                               class_weight='balanced')
    
    #xgb = XGBClassifier(learning_rate =0.1,
     #                     n_estimators=300,
      #                    max_depth=int(max_depth),
       #                   min_child_weight=1,
        #                  gamma=gamma,
         #                 subsample=subsample,
          #                colsample_bytree=colsample_bytree,
           #               reg_alpha=reg_alpha,
            #              #reg_lambda=0.5123,
             #             objective= 'binary:logistic',
              #            n_jobs=-1,
               #           scale_pos_weight=1,
                #          seed=231)
    
    test_pipeline = Pipeline([
        ('feature_selection', SelectFromModel(estimator=rf,
                                             max_features=int(k),
                                             threshold=-np.inf)),
        ('classification', model),
    ])
    
    return np.mean(cross_validate(test_pipeline, X_train_trust1_prepared, y=y_train_trust1, cv=cv,
                              #scoring=profit_scoring,
                              n_jobs=-1,
                              scoring=ftwo_scorer,
                                  #scoring="f1"
                                 )['test_score'])

In [182]:
params_logistic = {
    'C': (1,500),
    'k': (2,20),
    'max_depth':(2,5.5),
    #'min_child_weight':(1,7),
    #'gamma': (0,1),
    #'subsample':(0.6,1),
    #'colsample_bytree':(0.6,1),
    #'reg_alpha':(0.001, 100),
    #'n_estimators': (50,300),
    #'reg_lambda':(0.3, 0.7)
}

In [183]:
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
optimization_logistic.maximize(n_iter=200, init_points=75)

|   iter    |  target   |     C     |     k     | max_depth |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.9449  [0m | [0m 391.0   [0m | [0m 12.52   [0m | [0m 3.493   [0m |
| [95m 2       [0m | [95m 0.9636  [0m | [95m 158.4   [0m | [95m 16.88   [0m | [95m 5.162   [0m |
| [0m 3       [0m | [0m 0.9562  [0m | [0m 20.43   [0m | [0m 18.47   [0m | [0m 2.212   [0m |
| [0m 4       [0m | [0m 0.8515  [0m | [0m 91.18   [0m | [0m 3.301   [0m | [0m 3.606   [0m |
| [0m 5       [0m | [0m 0.9056  [0m | [0m 354.5   [0m | [0m 10.7    [0m | [0m 2.089   [0m |
| [0m 6       [0m | [0m 0.9191  [0m | [0m 34.38   [0m | [0m 9.666   [0m | [0m 2.767   [0m |
| [0m 7       [0m | [0m 0.854   [0m | [0m 405.3   [0m | [0m 4.521   [0m | [0m 4.629   [0m |
| [0m 8       [0m | [0m 0.9562  [0m | [0m 140.3   [0m | [0m 19.08   [0m | [0m 5.098   [0m |
| [0m 9       [0m | [0m 0.9458  [0m | [0m 305.5   

| [0m 81      [0m | [0m 0.9498  [0m | [0m 321.9   [0m | [0m 20.0    [0m | [0m 5.5     [0m |
| [0m 82      [0m | [0m 0.9468  [0m | [0m 238.3   [0m | [0m 20.0    [0m | [0m 2.0     [0m |
| [0m 83      [0m | [0m 0.9562  [0m | [0m 377.9   [0m | [0m 19.92   [0m | [0m 2.024   [0m |
| [0m 84      [0m | [0m 0.9562  [0m | [0m 118.6   [0m | [0m 19.97   [0m | [0m 5.281   [0m |
| [0m 85      [0m | [0m 0.9562  [0m | [0m 222.0   [0m | [0m 19.8    [0m | [0m 5.483   [0m |
| [0m 86      [0m | [0m 0.8352  [0m | [0m 235.8   [0m | [0m 2.0     [0m | [0m 5.5     [0m |
| [0m 87      [0m | [0m 0.9562  [0m | [0m 264.7   [0m | [0m 19.85   [0m | [0m 2.162   [0m |
| [0m 88      [0m | [0m 0.9562  [0m | [0m 381.8   [0m | [0m 19.01   [0m | [0m 5.493   [0m |
| [0m 89      [0m | [0m 0.9562  [0m | [0m 213.6   [0m | [0m 19.96   [0m | [0m 2.049   [0m |
| [0m 90      [0m | [0m 0.9498  [0m | [0m 268.1   [0m | [0m 20.0    [0m | 

KeyboardInterrupt: 

In [97]:
optimization_logistic.max

{'target': 340.0, 'params': {'C': 617.4371992275123, 'k': 8.026182029526707}}

### Show results of best performing model

In [184]:
select_from_model = Pipeline([
    ('feature_selection', SelectFromModel(estimator=RandomForestClassifier(max_depth=2,
                                                                           n_estimators=300, random_state=231,
                                                                           class_weight='balanced'),
                                             max_features=17,
                                             threshold=-np.inf)),
        ('classification', LogisticRegression(C=187.9, solver='liblinear', random_state=42)),
    ])

In [185]:
sum(cross_validate(select_from_model,
                   
                   X_train_trust1_prepared,
                   y=y_train_trust1,
                   cv=cv,
                   scoring=profit_scoring)['test_score'])

330

In [186]:
#...