In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import itertools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn import svm as SVM
from funcs import cv_profits_for_models, cv_preds_and_confusion_matrix, CustomModelWithThreshold
from funcs import profit_scorer, profit_scoring
from customClassifiers import OutlierRemover
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2

In [2]:
X_train = pd.read_csv('train.csv' ,delimiter="|")
X_test = pd.read_csv('test.csv', delimiter="|")

X_train['scannedLineItemsTotal'] = X_train['scannedLineItemsPerSecond'] * X_train['totalScanTimeInSeconds']
X_train['valuePerLineItem'] = X_train['grandTotal'] * X_train['scannedLineItemsTotal']
X_train['quantityModificationsPerLineItem'] = X_train['quantityModifications'] * X_train['scannedLineItemsTotal']
X_train['lineItemVoids*scansWithoutRegistration'] = X_train['lineItemVoids'] * X_train['scansWithoutRegistration']

In [3]:
X_train1 = X_train[X_train['trustLevel']==1]
y1 = X_train1.pop('fraud')
l=X_train1.pop('trustLevel')

In [4]:
X_train2 = X_train[X_train['trustLevel']==2]
y2 = X_train2.pop('fraud')
l=X_train2.pop('trustLevel')

# Trust Level 1 (Calculate Feature Importance with XGboost)

In [5]:
# normalize data first to prevent 0's in dataset
prep_pipeline = Pipeline([
    ('scaling', StandardScaler())
])
X_train1 = prep_pipeline.fit_transform(X_train1)


# generate features and rescale
prep_pipeline = Pipeline([
    ('interaction', PolynomialFeatures(3, interaction_only=False)),
    ('scaling', StandardScaler()),
])
X_train1_all = prep_pipeline.fit_transform(X_train1)

#remove the first var because it is the constant term
X_train1_all = X_train1_all[:,1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [6]:
# obtain feature importance by xgboost
xgb = XGBClassifier(num_estimator=100)
xgb.fit(X_train1_all, y1)
imp = xgb.feature_importances_

In [7]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [8]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [9]:
# add most important feature
X1_temporary = pd.DataFrame(X_train1_all[:,(list(imp.index))[0]])

features_to_use1 = []
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y1, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X1_temporary = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
        features_to_use1.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-215
-135
-70
-30
45
65
90
100
140
180
190
230
250
270
290
310
320
330
340


In [54]:
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X1_temporary, y1, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.1, 5),
    'pred_threshold': (0.5, 0.6)
}

optimization_logreg1 = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg1.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 275.0   [0m | [0m 4.098   [0m | [0m 0.5279  [0m |
| [95m 2       [0m | [95m 345.0   [0m | [95m 0.2743  [0m | [95m 0.5696  [0m |
| [0m 3       [0m | [0m 275.0   [0m | [0m 4.981   [0m | [0m 0.5781  [0m |
| [95m 4       [0m | [95m 365.0   [0m | [95m 0.477   [0m | [95m 0.5786  [0m |
| [0m 5       [0m | [0m 275.0   [0m | [0m 2.734   [0m | [0m 0.5356  [0m |
| [95m 6       [0m | [95m 375.0   [0m | [95m 0.9518  [0m | [95m 0.5484  [0m |
| [0m 7       [0m | [0m 350.0   [0m | [0m 1.876   [0m | [0m 0.5882  [0m |
| [0m 8       [0m | [0m 365.0   [0m | [0m 1.157   [0m | [0m 0.5951  [0m |
| [0m 9       [0m | [0m 300.0   [0m | [0m 1.664   [0m | [0m 0.5309  [0m |
| [0m 10      [0m | [0m 275.0   [0m | [0m 4.621   [0m | [0m 0.5606  [0m |
| [0m 11      [0m | [0m 275.0   [0m | [0m 3.923   [0m | [0m 0

| [0m 101     [0m | [0m 375.0   [0m | [0m 0.9623  [0m | [0m 0.5569  [0m |


KeyboardInterrupt: 

In [55]:
optimization_logreg1.max

{'target': 375.0,
 'params': {'C': 0.9517977397195106, 'pred_threshold': 0.5483985798365555}}

In [73]:
# verify

C = optimization_logreg1.max['params']['C']
pred_threshold = optimization_logreg1.max['params']['pred_threshold']

clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
# final score for trust=1
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf,X1_temporary, y1, scoring=profit_scoring, cv=cv)['test_score'])

375

In [74]:
# number of features used
len(X1_temporary.columns)

20

# Trust Level 2 (Feature Importance by Logistic Regression)

In [19]:
# normalize data first to prevent 0's in dataset
prep_pipeline = Pipeline([
    ('scaling', StandardScaler())
])
X_train2 = prep_pipeline.fit_transform(X_train2)


# generate features and rescale
prep_pipeline = Pipeline([
    ('interaction', PolynomialFeatures(3, interaction_only=False)),
    ('scaling', StandardScaler()),
])
X_train2_all = prep_pipeline.fit_transform(X_train2)

#remove the first var because it is the constant term
X_train2_all = X_train2_all[:,1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
# Feature Importance with logistic Regression
lr = LogisticRegression(C=20, solver='lbfgs')
lr.fit(X_train2_all, y2)
imp = lr.coef_[0]

In [22]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [23]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [24]:
# add most important feature
X2_temporary = pd.DataFrame(X_train2_all[:,(list(imp.index))[0]])

features_to_use2 = []
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y2, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X2_temporary = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
        features_to_use2.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-75
-55
-45
-25
15
35


In [27]:
# dataset in function is X2
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X2_temporary, y2, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.001, 50),
    'pred_threshold': (0.5, 1)
}

optimization_logreg = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 10.0    [0m | [0m 38.83   [0m | [0m 0.7443  [0m |
| [0m 2       [0m | [0m 10.0    [0m | [0m 23.63   [0m | [0m 0.692   [0m |
| [0m 3       [0m | [0m-15.0    [0m | [0m 28.32   [0m | [0m 0.5482  [0m |
| [0m 4       [0m | [0m 10.0    [0m | [0m 43.63   [0m | [0m 0.748   [0m |
| [0m 5       [0m | [0m 10.0    [0m | [0m 31.35   [0m | [0m 0.6154  [0m |
| [0m 6       [0m | [0m 10.0    [0m | [0m 39.89   [0m | [0m 0.6498  [0m |
| [0m 7       [0m | [0m 10.0    [0m | [0m 47.98   [0m | [0m 0.6517  [0m |
| [95m 8       [0m | [95m 35.0    [0m | [95m 10.13   [0m | [95m 0.8509  [0m |
| [0m 9       [0m | [0m 35.0    [0m | [0m 11.74   [0m | [0m 0.7987  [0m |
| [0m 10      [0m | [0m 35.0    [0m | [0m 3.92    [0m | [0m 0.8325  [0m |
| [0m 11      [0m | [0m 10.0    [0m | [0m 43.76   [0m | [0m 0.6426  

| [0m 100     [0m | [0m-15.0    [0m | [0m 38.74   [0m | [0m 0.5478  [0m |
| [0m 101     [0m | [0m 45.0    [0m | [0m 0.7027  [0m | [0m 0.5223  [0m |
| [0m 102     [0m | [0m 45.0    [0m | [0m 0.6945  [0m | [0m 0.5257  [0m |


KeyboardInterrupt: 

In [75]:
optimization_logreg.max

{'target': 45.0,
 'params': {'C': 0.7011802960246133, 'pred_threshold': 0.5315216790813109}}

In [76]:
# verify
C = optimization_logreg.max['params']['C']
pred_threshold = optimization_logreg.max['params']['pred_threshold']

clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
# final score for trust=2
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf,X2_temporary, y2, scoring=profit_scoring, cv=cv)['test_score'])

45

In [77]:
# number of features used
len(X2_temporary.columns)

7