In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import itertools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn import svm as SVM
from funcs import cv_profits_for_models, cv_preds_and_confusion_matrix, CustomModelWithThreshold
from funcs import profit_scorer, profit_scoring
from customClassifiers import OutlierRemover
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2

In [2]:
X_train = pd.read_csv('train.csv' ,delimiter="|")
X_test = pd.read_csv('test.csv', delimiter="|")

X_train['scannedLineItemsTotal'] = X_train['scannedLineItemsPerSecond'] * X_train['totalScanTimeInSeconds']
X_train['valuePerLineItem'] = X_train['grandTotal'] * X_train['scannedLineItemsTotal']
X_train['quantityModificationsPerLineItem'] = X_train['quantityModifications'] * X_train['scannedLineItemsTotal']
X_train['lineItemVoids*scansWithoutRegistration'] = X_train['lineItemVoids'] * X_train['scansWithoutRegistration']

In [3]:
X_train1 = X_train[X_train['trustLevel']==1]
y1 = X_train1.pop('fraud')
l=X_train1.pop('trustLevel')

In [4]:
X_train2 = X_train[X_train['trustLevel']==2]
y2 = X_train2.pop('fraud')
l=X_train2.pop('trustLevel')

# Trust Level 1 (Calculate Feature Importance with XGboost)

In [5]:
# normalize data first to prevent 0's in dataset
scaler = StandardScaler()
names_X_train1 = X_train1.columns
X_train1 = scaler.fit_transform(X_train1)

# generate features and rescale
polyFeatures = PolynomialFeatures(3, interaction_only=False)
X_train1_all = polyFeatures.fit_transform(X_train1)
X_train1_all = scaler.fit_transform(X_train1_all)

#remove the first var because it is the constant term
X_train1_all = X_train1_all[:,1:]
features_X_train1_all = polyFeatures.get_feature_names(input_features=names_X_train1)[1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [6]:
# obtain feature importance by xgboost
xgb = XGBClassifier(num_estimator=100)
xgb.fit(X_train1_all, y1)
imp = xgb.feature_importances_

In [7]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [8]:
features_to_use1_cv = []
for cvNew in np.arange(0,100,20):
    # choose starting model
    model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

    cv = StratifiedKFold(n_splits=10, random_state=cvNew, shuffle=True)
    last_score = -10000

    # add most important feature
    X1_temporary = pd.DataFrame(X_train1_all[:,(list(imp.index))[0]])

    features_to_use1 = [(list(imp.index))[0]]
    # iteratively add features one by one
    for featnum in (list(imp.index))[1:]:
        X_check = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
        score = sum(cross_validate(model,X_check, y1, scoring=profit_scoring, cv=cv)['test_score'])
        # add the feature ultimatively if score improved
        if score > last_score:
            X1_temporary = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
            features_to_use1.append(featnum)
            last_score = score    
            #print(last_score)
    
    # for test predictions use features_to_use to select the according features in the test set
    features_to_use1_names = [features_X_train1_all[i] for i in features_to_use1]
    features_to_use1_cv = features_to_use1_cv + features_to_use1_names

In [9]:
from collections import Counter
Counter(features_to_use1_cv)

Counter({'scannedLineItemsTotal': 5,
         'scannedLineItemsPerSecond scannedLineItemsTotal': 5,
         'lineItemVoids scansWithoutRegistration lineItemVoids*scansWithoutRegistration': 5,
         'scansWithoutRegistration scannedLineItemsTotal^2': 2,
         'valuePerSecond^2 valuePerLineItem': 1,
         'totalScanTimeInSeconds lineItemVoids^2': 4,
         'totalScanTimeInSeconds^2 scannedLineItemsTotal': 4,
         'lineItemVoidsPerPosition lineItemVoids*scansWithoutRegistration': 4,
         'grandTotal^2 scannedLineItemsTotal': 4,
         'lineItemVoids^2 valuePerSecond': 3,
         'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem': 5,
         'scannedLineItemsPerSecond valuePerLineItem^2': 3,
         'totalScanTimeInSeconds quantityModificationsPerLineItem^2': 2,
         'valuePerSecond lineItemVoids*scansWithoutRegistration': 3,
         'scannedLineItemsTotal^2 valuePerLineItem': 1,
         'valuePerSecond': 1,
         'totalScanTimeInSeconds lineI

# Trust Level 2 (Feature Importance by Logistic Regression)

In [10]:
# normalize data first to prevent 0's in dataset
scaler2 = StandardScaler()
names_X_train2 = X_train2.columns
X_train2 = scaler2.fit_transform(X_train2)

# generate features and rescale
polyFeatures2 = PolynomialFeatures(3, interaction_only=False)
X_train2_all = polyFeatures2.fit_transform(X_train2)
X_train2_all = scaler.fit_transform(X_train2_all)

#remove the first var because it is the constant term
X_train2_all = X_train2_all[:,1:]
features_X_train2_all = polyFeatures2.get_feature_names(input_features=names_X_train2)[1:]


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [11]:
# Feature Importance with logistic Regression
lr = LogisticRegression(C=20, solver='lbfgs')
lr.fit(X_train2_all, y2)
imp = lr.coef_[0]

In [12]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [13]:
features_to_use2_cv = []
for cvNew in np.arange(0,100,20):
    # choose starting model
    model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

    cv = StratifiedKFold(n_splits=10, random_state=cvNew, shuffle=True)
    last_score = -10000

    # add most important feature
    X2_temporary = pd.DataFrame(X_train2_all[:,(list(imp.index))[0]])

    features_to_use2 = [(list(imp.index))[0]]
    # iteratively add features one by one
    for featnum in (list(imp.index))[1:]:
        X_check = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
        score = sum(cross_validate(model,X_check, y2, scoring=profit_scoring, cv=cv)['test_score'])
        # add the feature ultimatively if score improved
        if score > last_score:
            X2_temporary = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
            features_to_use2.append(featnum)
            last_score = score    
            #print(last_score)
  
    # for test predictions use features_to_use to select the according features in the test set    
    features_to_use2_names = [features_X_train2_all[i] for i in features_to_use2]
    features_to_use2_cv = features_to_use2_cv + features_to_use2_names

In [14]:
from collections import Counter
Counter(features_to_use2_cv)

Counter({'scannedLineItemsTotal^2': 5,
         'scannedLineItemsTotal^3': 5,
         'totalScanTimeInSeconds scannedLineItemsTotal^2': 5,
         'totalScanTimeInSeconds lineItemVoids scannedLineItemsTotal': 4,
         'totalScanTimeInSeconds^2 scannedLineItemsTotal': 2,
         'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem': 4,
         'totalScanTimeInSeconds scansWithoutRegistration valuePerLineItem': 5,
         'totalScanTimeInSeconds lineItemVoids*scansWithoutRegistration^2': 1,
         'scannedLineItemsTotal quantityModificationsPerLineItem lineItemVoids*scansWithoutRegistration': 2,
         'lineItemVoids*scansWithoutRegistration': 1,
         'scansWithoutRegistration scannedLineItemsTotal valuePerLineItem': 1,
         'scannedLineItemsTotal^2 lineItemVoids*scansWithoutRegistration': 2,
         'totalScanTimeInSeconds^2 valuePerLineItem': 1,
         'lineItemVoids scannedLineItemsTotal^2': 1,
         'scannedLineItemsTotal^2 quantityModificationsPer