In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import itertools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn import svm as SVM
from funcs import cv_profits_for_models, cv_preds_and_confusion_matrix, CustomModelWithThreshold
from funcs import profit_scorer, profit_scoring
from customClassifiers import OutlierRemover
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2

In [2]:
X_train = pd.read_csv('train.csv' ,delimiter="|")
X_test = pd.read_csv('test.csv', delimiter="|")

X_train['scannedLineItemsTotal'] = X_train['scannedLineItemsPerSecond'] * X_train['totalScanTimeInSeconds']
X_train['valuePerLineItem'] = X_train['grandTotal'] * X_train['scannedLineItemsTotal']
X_train['quantityModificationsPerLineItem'] = X_train['quantityModifications'] * X_train['scannedLineItemsTotal']
X_train['lineItemVoids*scansWithoutRegistration'] = X_train['lineItemVoids'] * X_train['scansWithoutRegistration']

In [3]:
X_train1 = X_train[X_train['trustLevel']==1]
y1 = X_train1.pop('fraud')
l=X_train1.pop('trustLevel')

In [4]:
X_train2 = X_train[X_train['trustLevel']==2]
y2 = X_train2.pop('fraud')
l=X_train2.pop('trustLevel')

# Trust Level 1 (Calculate Feature Importance with XGboost)

In [5]:
# normalize data first to prevent 0's in dataset
scaler = StandardScaler()
names_X_train1 = X_train1.columns
X_train1 = scaler.fit_transform(X_train1)

# generate features and rescale
polyFeatures = PolynomialFeatures(3, interaction_only=False)
X_train1_all = polyFeatures.fit_transform(X_train1)
X_train1_all = scaler.fit_transform(X_train1_all)

#remove the first var because it is the constant term
X_train1_all = X_train1_all[:,1:]
features_X_train1_all = polyFeatures.get_feature_names(input_features=names_X_train1)[1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [6]:
# obtain feature importance by xgboost
xgb = XGBClassifier(num_estimator=100)
xgb.fit(X_train1_all, y1)
imp = xgb.feature_importances_

In [7]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [8]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [9]:
# add most important feature
X1_temporary = pd.DataFrame(X_train1_all[:,(list(imp.index))[0]])

features_to_use1 = [(list(imp.index))[0]]
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y1, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X1_temporary = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
        features_to_use1.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-105
5
15
45
50
90
160
170
180
205
225
245
255
265
275
295
305
315


In [10]:
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X1_temporary, y1, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.1, 5),
    'pred_threshold': (0.5, 0.6)
}

optimization_logreg1 = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg1.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 280.0   [0m | [0m 2.401   [0m | [0m 0.5904  [0m |
| [95m 2       [0m | [95m 285.0   [0m | [95m 3.085   [0m | [95m 0.5028  [0m |
| [0m 3       [0m | [0m 280.0   [0m | [0m 1.857   [0m | [0m 0.5667  [0m |
| [0m 4       [0m | [0m 280.0   [0m | [0m 4.918   [0m | [0m 0.5957  [0m |
| [0m 5       [0m | [0m 280.0   [0m | [0m 1.682   [0m | [0m 0.5859  [0m |
| [0m 6       [0m | [0m 285.0   [0m | [0m 3.88    [0m | [0m 0.5321  [0m |
| [0m 7       [0m | [0m 285.0   [0m | [0m 2.64    [0m | [0m 0.5199  [0m |
| [0m 8       [0m | [0m 285.0   [0m | [0m 2.887   [0m | [0m 0.5146  [0m |
| [0m 9       [0m | [0m 285.0   [0m | [0m 1.082   [0m | [0m 0.5955  [0m |
| [0m 10      [0m | [0m 280.0   [0m | [0m 2.374   [0m | [0m 0.5975  [0m |
| [95m 11      [0m | [95m 310.0   [0m | [95m 0.2401  [0m | [95m 0.532

KeyboardInterrupt: 

In [11]:
optimization_logreg1.max

{'target': 340.0,
 'params': {'C': 0.3413832354643051, 'pred_threshold': 0.538300775948387}}

In [12]:
# verify

C = optimization_logreg1.max['params']['C']
pred_threshold = optimization_logreg1.max['params']['pred_threshold']

clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
# final score for trust=1
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf,X1_temporary, y1, scoring=profit_scoring, cv=cv)['test_score'])

340

In [13]:
# number of features used
len(X1_temporary.columns)

19

In [14]:
features_select_1 = [features_X_train1_all[i] for i in features_to_use1]
features_select_1

['scannedLineItemsTotal',
 'scannedLineItemsPerSecond scannedLineItemsTotal',
 'lineItemVoids scansWithoutRegistration lineItemVoids*scansWithoutRegistration',
 'totalScanTimeInSeconds lineItemVoids*scansWithoutRegistration',
 'valuePerSecond scannedLineItemsTotal^2',
 'scansWithoutRegistration scannedLineItemsTotal',
 'quantityModifications^2 scannedLineItemsTotal',
 'totalScanTimeInSeconds lineItemVoids^2',
 'scansWithoutRegistration lineItemVoidsPerPosition scannedLineItemsTotal',
 'lineItemVoidsPerPosition lineItemVoids*scansWithoutRegistration',
 'totalScanTimeInSeconds scansWithoutRegistration lineItemVoidsPerPosition',
 'lineItemVoids quantityModificationsPerLineItem^2',
 'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem',
 'totalScanTimeInSeconds scansWithoutRegistration',
 'totalScanTimeInSeconds lineItemVoids lineItemVoids*scansWithoutRegistration',
 'lineItemVoidsPerPosition scannedLineItemsTotal^2',
 'scannedLineItemsTotal^2',
 'lineItemVoids valuePerSecond qua

# Trust Level 2 (Feature Importance by Logistic Regression)

In [15]:
# normalize data first to prevent 0's in dataset
scaler2 = StandardScaler()
names_X_train2 = X_train2.columns
X_train2 = scaler2.fit_transform(X_train2)

# generate features and rescale
polyFeatures2 = PolynomialFeatures(3, interaction_only=False)
X_train2_all = polyFeatures2.fit_transform(X_train2)
X_train2_all = scaler.fit_transform(X_train2_all)

#remove the first var because it is the constant term
X_train2_all = X_train2_all[:,1:]
features_X_train2_all = polyFeatures2.get_feature_names(input_features=names_X_train2)[1:]


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
# Feature Importance with logistic Regression
lr = LogisticRegression(C=20, solver='lbfgs')
lr.fit(X_train2_all, y2)
imp = lr.coef_[0]

In [17]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [18]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [19]:
# add most important feature
X2_temporary = pd.DataFrame(X_train2_all[:,(list(imp.index))[0]])

features_to_use2 = [(list(imp.index))[0]]
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y2, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X2_temporary = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
        features_to_use2.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-75
-55
-45
-25
15
35


In [20]:
# dataset in function is X2
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X2_temporary, y2, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.001, 50),
    'pred_threshold': (0.5, 1)
}

optimization_logreg = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 10.0    [0m | [0m 23.03   [0m | [0m 0.8011  [0m |
| [0m 2       [0m | [0m 10.0    [0m | [0m 15.12   [0m | [0m 0.5295  [0m |
| [0m 3       [0m | [0m 10.0    [0m | [0m 13.01   [0m | [0m 0.5812  [0m |
| [95m 4       [0m | [95m 35.0    [0m | [95m 47.47   [0m | [95m 0.9132  [0m |
| [0m 5       [0m | [0m 10.0    [0m | [0m 32.4    [0m | [0m 0.822   [0m |
| [95m 6       [0m | [95m 45.0    [0m | [95m 2.664   [0m | [95m 0.5811  [0m |
| [0m 7       [0m | [0m 35.0    [0m | [0m 2.251   [0m | [0m 0.7448  [0m |
| [0m 8       [0m | [0m 10.0    [0m | [0m 47.06   [0m | [0m 0.6282  [0m |
| [0m 9       [0m | [0m 35.0    [0m | [0m 23.45   [0m | [0m 0.8892  [0m |
| [0m 10      [0m | [0m 10.0    [0m | [0m 42.75   [0m | [0m 0.678   [0m |
| [0m 11      [0m | [0m 5.0     [0m | [0m 4.683   [0m | [0m 0.926

KeyboardInterrupt: 

In [21]:
optimization_logreg.max

{'target': 45.0,
 'params': {'C': 2.6642719484799917, 'pred_threshold': 0.5810930799041378}}

In [22]:
# verify
C = optimization_logreg.max['params']['C']
pred_threshold = optimization_logreg.max['params']['pred_threshold']

clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
# final score for trust=2
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf,X2_temporary, y2, scoring=profit_scoring, cv=cv)['test_score'])

45

In [23]:
# number of features used
len(X2_temporary.columns)

7

In [24]:
features_select_2 = [features_X_train2_all[i] for i in features_to_use2]
features_select_2

['scannedLineItemsTotal^2',
 'scannedLineItemsTotal^3',
 'totalScanTimeInSeconds scannedLineItemsTotal^2',
 'totalScanTimeInSeconds lineItemVoids scannedLineItemsTotal',
 'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem',
 'totalScanTimeInSeconds scansWithoutRegistration valuePerLineItem',
 'scannedLineItemsTotal^2 lineItemVoids*scansWithoutRegistration']