In [1]:
import numpy as np
import pandas as pd

In [2]:
# Both competition files are pipe-separated CSVs; read train first, then test.
df_train, df_test = (pd.read_csv(path, sep='|') for path in ('train.csv', 'test.csv'))

__scannedLineItemsTotal__

In [3]:
# Recover the absolute item count: items per second times scan duration in seconds.
scan_rate = df_train['scannedLineItemsPerSecond']
scan_duration = df_train['totalScanTimeInSeconds']
df_train['scannedLineItemsTotal'] = scan_rate * scan_duration

### Main attributes
 - trustLevel
 - totalScanTimeInSeconds
 - grandTotal
 - lineItemVoids
 - scansWithoutRegistration
 - quantityModifications
 - scannedLineItemsTotal

Other attributes, such as scannedLineItemsPerSecond, are derived from the attributes above.
 

In [4]:
# The seven base attributes from which the engineered features below are built.
main_attributes = [
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'quantityModifications',
    'scannedLineItemsTotal',
]
df_train_main = df_train[main_attributes]

### Polynomial Features (multiplication between features)

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 interaction terms only: the original columns plus all pairwise
# products (no squares). include_bias=False suppresses the constant all-ones
# column, so nothing has to be dropped afterwards.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
poly_main = poly.fit_transform(df_train_main)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
poly_main_names = list(name_getter(df_train_main.columns))

# Reuse the training index so the label column assigned below aligns row by row.
df_poly_main = pd.DataFrame(data=poly_main, columns=poly_main_names, index=df_train_main.index)
df_poly_main['fraud'] = df_train['fraud']

In [6]:
# Absolute correlation of every (interaction) feature with the fraud label.
corr_abs = df_poly_main.corr().abs()
fraud_corr_poly = corr_abs[['fraud']]
fraud_corr_poly.style.background_gradient(cmap='coolwarm')

Unnamed: 0,fraud
trustLevel,0.319765
totalScanTimeInSeconds,0.110414
grandTotal,0.00142089
lineItemVoids,0.0634963
scansWithoutRegistration,0.0741225
quantityModifications,0.000863773
scannedLineItemsTotal,0.298423
trustLevel totalScanTimeInSeconds,0.168631
trustLevel grandTotal,0.198792
trustLevel lineItemVoids,0.166165


#### interesting observations
- totalScanTimeInSeconds lineItemVoids: the multiplication has a slightly higher correlation than its parts
- totalScanTimeInSeconds scansWithoutRegistration: the multiplication has a slightly higher correlation than its parts
- totalScanTimeInSeconds scannedLineItemsTotal: the multiplication has a slightly higher correlation than its parts
- lineItemVoids scansWithoutRegistration: the multiplication has a slightly higher correlation than its parts
- => multiplication does not seem to yield much

### division between features

In [7]:
# Build every ordered ratio col1/col2 of two distinct main attributes.
df_division_main = df_train_main.copy()
for numerator in df_train_main.columns:
    for denominator in df_train_main.columns:
        if denominator == numerator:
            continue
        # Replace zeros in the denominator with 0.1 to avoid division by zero.
        # mask() returns a new Series (upcast to float where needed) instead of
        # writing a float into an integer column in place, which recent pandas
        # versions deprecate.
        denominator_values = df_train_main[denominator]
        safe_denominator = denominator_values.mask(denominator_values == 0, 0.1)
        df_division_main[numerator + '/' + denominator] = df_train_main[numerator] / safe_denominator

df_division_main['fraud'] = df_train['fraud']

In [8]:
# Absolute correlation of every ratio feature with the fraud label.
corr_abs = df_division_main.corr().abs()
fraud_corr_division = corr_abs[['fraud']]
fraud_corr_division.style.background_gradient(cmap='coolwarm')

Unnamed: 0,fraud
trustLevel,0.319765
totalScanTimeInSeconds,0.110414
grandTotal,0.00142089
lineItemVoids,0.0634963
scansWithoutRegistration,0.0741225
quantityModifications,0.000863773
scannedLineItemsTotal,0.298423
trustLevel/totalScanTimeInSeconds,0.0325759
trustLevel/grandTotal,0.00919693
trustLevel/lineItemVoids,0.0615414


#### interesting observations
- totalScanTimeInSeconds/trustLevel: higher correlation compared to its parts
- lineItemVoids/trustLevel: higher correlation compared to its parts
- scansWithoutRegistration/trustLevel: higher correlation compared to its parts
- scannedLineItemsTotal/trustLevel: higher correlation compared to its parts
- => interesting that division by trustLevel often increases overall correlation, but might not matter

### log and square of features

In [9]:
# Add a squared version of every main attribute and, where possible, a log version.
df_logsquare_main = df_train_main.copy()
for name in df_train_main.columns:
    values = df_train_main[name]
    df_logsquare_main[name + '_Square'] = np.square(values)
    # A log feature is only created for columns that contain no zeros.
    contains_zero = (values == 0).any()
    if not contains_zero:
        df_logsquare_main[name + '_Log'] = np.log(values)

df_logsquare_main['fraud'] = df_train['fraud']

In [10]:
# Absolute correlation of every log/square feature with the fraud label.
corr_abs = df_logsquare_main.corr().abs()
fraud_corr_logsquare = corr_abs[['fraud']]
fraud_corr_logsquare.style.background_gradient(cmap='coolwarm')

Unnamed: 0,fraud
trustLevel,0.319765
totalScanTimeInSeconds,0.110414
grandTotal,0.00142089
lineItemVoids,0.0634963
scansWithoutRegistration,0.0741225
quantityModifications,0.000863773
scannedLineItemsTotal,0.298423
trustLevel_Square,0.260442
trustLevel_Log,0.382605
totalScanTimeInSeconds_Square,0.107345


#### interesting observations
- trustLevel_Log: has higher correlation, but does it matter?
- grandTotal_Log: has higher correlation
- quantityModifications_Square: has higher correlation
- scannedLineItemsTotal_Square: has higher correlation

### same imports as in main.py

In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB as NB
from xgboost import XGBClassifier

# custom imports
from funcs import plot_cv_confidence_vs_profit, score_dmc_profit,dmc_profit,cv_preds_and_confusion_matrix,cv_profits_for_models, profit_scoring
from customClassifiers import CustomModelWithThreshold, TrustHard, PerceptronLearner
from pipes import CustomAttributeAdder,Scaling,RandomAttributeAdder,Transformer,ClfSwitcher

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import VotingClassifier

# use sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import itertools

## Test all feature combinations of the new features

### load data

In [12]:
# Reload the raw files so the pipeline starts from unmodified data.
df_train = pd.read_csv('train.csv', sep='|')
df_test = pd.read_csv('test.csv', sep='|')

# Separate the label from the predictors.
y_train = df_train['fraud']
X_train = df_train.drop(columns=['fraud'])

### Create pipeline (as in main.py)

In [13]:
# Stage 1: feature generation (attribs_adder returns a pd.DataFrame).
feature_generation_steps = [
    ("attribs_adder", CustomAttributeAdder()),
    ("RandomAttributeAdder", RandomAttributeAdder()),
]
featureGeneration_pipeline = Pipeline(steps=feature_generation_steps)

# Stage 2: preprocessing (the Transformer class is still void).
preprocessing_steps = [
    ("transformer", Transformer()),
    ("scaler", Scaling(strategy='Standard')),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

# Full pipeline: feature generation -> preprocessing -> switchable classifier.
model_training_pipeline = Pipeline(steps=[
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline),
    ('classifier', ClfSwitcher()),
])

# Named base models (TrustHard(LogisticRegression(C=10)) is currently disabled).
models = [
    ('lr', LogisticRegression()),
    ('svm', SVM.SVC()),
    ('perc', PerceptronLearner(100)),
    ('sgd', SGDClassifier()),
    ('xgb', XGBClassifier()),
]

# Every 0/1 weighting of the models except the one that switches all of them off.
weights = [
    combo
    for combo in itertools.product([0, 1], repeat=len(models))
    if any(combo)
]

### Create all combinations up to 5 features from the interesting features

In [14]:
# Candidate engineered features picked from the correlation analysis above.
featurelist = [
    'totalScanTimeInSeconds*lineItemVoids',
    'totalScanTimeInSeconds*scansWithoutRegistration',
    'totalScanTimeInSeconds*scannedLineItemsTotal',
    'lineItemVoids*scansWithoutRegistration',
    'totalScanTimeInSeconds/trustLevel',
    'lineItemVoids/trustLevel',
    'scansWithoutRegistration/trustLevel',
    'scansWithoutRegistration/quantityModifications',
    'scannedLineItemsTotal/trustLevel',
    'trustLevel_Log',
    'grandTotal_Log',
    'quantityModifications_Square',
    'scannedLineItemsTotal_Square',
]

# Subset sizes to enumerate; all subsets of each size are listed, smallest first.
combinations = [1, 2, 3, 4, 5]
featurelistAllcombinations = list(
    itertools.chain.from_iterable(
        itertools.combinations(featurelist, size) for size in combinations
    )
)

### use these features in a gridsearch with logistic regression

In [15]:
# Only the feature-combination search is active in this notebook. The earlier
# exploratory grids (SGDClassifier, LogisticRegression C sweep, VotingClassifier
# weights, XGBClassifier, RandomForestClassifier) were commented out and are
# not reproduced here.
feature_search_grid = {
    # try every combination of up to five engineered features
    'feature_generation__attribs_adder__featurelist': featurelistAllcombinations,
    'classifier__estimator': [LogisticRegression()],
    'classifier__estimator__C': [10],
}
parameters = [feature_search_grid]

In [16]:
# 10-fold grid search over all feature combinations, scored by profit.
# fit() returns the fitted search object itself.
gscv = GridSearchCV(
    estimator=model_training_pipeline,
    param_grid=parameters,
    scoring=profit_scoring,
    cv=10,
    n_jobs=-1,
    verbose=3,
).fit(X_train, y_train)
gscv

Fitting 10 folds for each of 2379 candidates, totalling 23790 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 6264 tasks      | elapsed: 31.5min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('feature_generation', Pipeline(memory=None,
     steps=[('attribs_adder', CustomAttributeAdder(featurelist=['scannedLineItemsTotal', 'valuePerLineItem', 'quantityModificationsPerLineItem', 'totalScanTimeInSeconds*lineItemVoids', 'totalScanTimeInSeconds*scansWithoutRegistration', 'totalScanTi..._state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'feature_generation__attribs_adder__featurelist': [('totalScanTimeInSeconds*lineItemVoids',), ('totalScanTimeInSeconds*scansWithoutRegistration',), ('totalScanTimeInSeconds*scannedLineItemsTotal',), ('lineItemVoids*scansWithoutRegistration',), ('totalScanTimeInSeconds/trustLevel',), ('l...lver='warn',
          tol=0.0001, verbose=0, warm_start=False)], 'classifier__estimator__C': [10]}],
       pre_dispatch='2*n_jobs', refit=

In [17]:
# Best mean CV score and its parameters, followed by the steps of the refitted pipeline.
print(gscv.best_score_, gscv.best_params_, sep='\n')
gscv.best_estimator_.named_steps

35.987227248536456
{'classifier__estimator': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'classifier__estimator__C': 10, 'feature_generation__attribs_adder__featurelist': ('lineItemVoids*scansWithoutRegistration', 'totalScanTimeInSeconds/trustLevel', 'trustLevel_Log')}


{'feature_generation': Pipeline(memory=None,
      steps=[('attribs_adder', CustomAttributeAdder(featurelist=('lineItemVoids*scansWithoutRegistration', 'totalScanTimeInSeconds/trustLevel', 'trustLevel_Log'))), ('RandomAttributeAdder', <pipes.RandomAttributeAdder object at 0x7fc84dce2588>)]),
 'preprocessing': Pipeline(memory=None,
      steps=[('transformer', <pipes.Transformer object at 0x7fc84dce2eb8>), ('scaler', <pipes.Scaling object at 0x7fc84dce2978>)]),
 'classifier': ClfSwitcher(estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False))}

### look at the best scores and their parameters

In [139]:
# Six highest mean test scores, best first.
mean_scores = gscv.cv_results_['mean_test_score']
best_first = np.argsort(-mean_scores)
mean_scores[best_first][0:6]

array([35.98722725, 35.98722725, 35.98722725, 35.98722725, 35.98722725,
       34.98669505])

In [140]:
# Feature lists of all candidates, ordered from best to worst mean test score.
best_first = np.argsort(-gscv.cv_results_['mean_test_score'])
candidate_params = gscv.cv_results_['params']
c = [
    candidate_params[i]['feature_generation__attribs_adder__featurelist']
    for i in best_first
]
c[0:6]

[('lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'trustLevel_Log',
  'quantityModifications_Square'),
 ('lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'trustLevel_Log',
  'grandTotal_Log',
  'quantityModifications_Square'),
 ('lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'trustLevel_Log'),
 ('lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'scansWithoutRegistration/quantityModifications',
  'trustLevel_Log'),
 ('lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'scansWithoutRegistration/quantityModifications',
  'trustLevel_Log',
  'quantityModifications_Square'),
 ('totalScanTimeInSeconds*lineItemVoids',
  'lineItemVoids*scansWithoutRegistration',
  'totalScanTimeInSeconds/trustLevel',
  'scansWithoutRegistration/quantityModifications',
  'trustLevel_Log')]

The five best models all have the same score, and each of them includes the following three features:
- lineItemVoids*scansWithoutRegistration
- totalScanTimeInSeconds/trustLevel
- trustLevel_Log

### test these features in the cv function from ilias

In [274]:
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

# Reload the raw training data and split it into predictors and label.
df = pd.read_csv('train.csv', sep='|')
X, y = df.drop(columns='fraud'), df['fraud']

# shuffle defaults to False, so the folds are deterministic and a random_state
# has no effect. Recent scikit-learn versions raise a ValueError when
# random_state is set without shuffle=True, so it is omitted here.
cv = StratifiedKFold(n_splits=10)
def profit_scorer(y, y_pred):
    """Return the total profit of a set of predictions.

    Each (prediction, actual) pair is looked up in a fixed payoff table:
    (0, 0) -> 0, (0, 1) -> -5, (1, 0) -> -25, (1, 1) -> +5.

    Parameters
    ----------
    y : iterable of 0/1
        Actual labels.
    y_pred : iterable of 0/1
        Predicted labels, in the same order as ``y``.

    Returns
    -------
    The sum of the payoffs over all pairs (0 for empty input).
    """
    payoff = {(0, 0): 0, (0, 1): -5, (1, 0): -25, (1, 1): 5}
    total = 0
    for actual, pred in zip(y, y_pred):
        total += payoff[(pred, actual)]
    return total
# Wrap the profit function as a scorer; higher profit is better.
profit_scoring = make_scorer(score_func=profit_scorer, greater_is_better=True)

In [275]:
# One independent copy of the predictors per feature set under comparison.
# The second frame is named X_extraFeature (singular) because that is the name
# every later cell uses; the former spelling X_extraFeatures left it undefined
# on a fresh kernel.
X_standard = X.copy()
X_extraFeature = X.copy()
X_onlyTrustLevel_Log = X.copy()
X_noTrustLevel_Log = X.copy()

# The return value of transform() is discarded: the columns are expected to be
# added to the passed frame in place.
# NOTE(review): confirm this against pipes.CustomAttributeAdder.
CustomAttributeAdder(featurelist=['scannedLineItemsTotal']).transform(X_standard)
CustomAttributeAdder(featurelist=['lineItemVoids*scansWithoutRegistration','totalScanTimeInSeconds/trustLevel','trustLevel_Log']).transform(X_extraFeature)
CustomAttributeAdder(featurelist=['trustLevel_Log']).transform(X_onlyTrustLevel_Log)
CustomAttributeAdder(featurelist=['lineItemVoids*scansWithoutRegistration','totalScanTimeInSeconds/trustLevel']).transform(X_noTrustLevel_Log)

# Show which columns each variant ended up with.
print(X_standard.columns)
print(X_extraFeature.columns)
print(X_onlyTrustLevel_Log.columns)
print(X_noTrustLevel_Log.columns)

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'scannedLineItemsTotal'],
      dtype='object')
Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'scannedLineItemsTotal', 'trustLevel_Log',
       'lineItemVoids*scansWithoutRegistration',
       'totalScanTimeInSeconds/trustLevel'],
      dtype='object')
Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'scannedLineItemsTotal', 'trustLevel_Log'],
      dtype='object')
Index(['trustLevel', 'totalScanTimeInSeconds',

In [276]:
# Total profit over all CV folds using X_standard.
cv_result = cross_validate(
    LogisticRegression(C=300, solver='liblinear'),
    X_standard,
    y=y,
    cv=cv,
    scoring=profit_scoring,
)
sum(cv_result['test_score'])

325

In [277]:
# Total profit over all CV folds using X_extraFeature (all three new features).
cv_result = cross_validate(
    LogisticRegression(C=300, solver='liblinear'),
    X_extraFeature,
    y=y,
    cv=cv,
    scoring=profit_scoring,
)
sum(cv_result['test_score'])

335

In [278]:
# Total profit over all CV folds using X_onlyTrustLevel_Log.
cv_result = cross_validate(
    LogisticRegression(C=300, solver='liblinear'),
    X_onlyTrustLevel_Log,
    y=y,
    cv=cv,
    scoring=profit_scoring,
)
sum(cv_result['test_score'])

345

In [279]:
# Total profit over all CV folds using X_noTrustLevel_Log with C=300.
cv_result = cross_validate(
    LogisticRegression(C=300, solver='liblinear'),
    X_noTrustLevel_Log,
    y=y,
    cv=cv,
    scoring=profit_scoring,
)
sum(cv_result['test_score'])

275

In [280]:
# Same feature set as the previous cell, but with C=305 to probe sensitivity to C.
cv_result = cross_validate(
    LogisticRegression(C=305, solver='liblinear'),
    X_noTrustLevel_Log,
    y=y,
    cv=cv,
    scoring=profit_scoring,
)
sum(cv_result['test_score'])

335

### Observations
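- for some reason, in the grid search a regularization parameter of C=10 for logistic regression yields good results, while in the CV function from ilias C=300 yields good results (note: the grid search pipeline standardizes the features, whereas this CV runs on the unscaled features, which may explain the different useful range of C)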
- regularization parameter seems to play a large role; do I overfit the cross-validation?
- TrustLevel_Log seems to be important
- when only using the other two new features that are not the log of TrustLevel there is a big change between C=300 and C=305