### same imports as in main.py

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB as NB
from xgboost import XGBClassifier

# custom imports
from funcs import plot_cv_confidence_vs_profit, score_dmc_profit,dmc_profit,cv_preds_and_confusion_matrix,cv_profits_for_models, profit_scoring
from customClassifiers import CustomModelWithThreshold, TrustHard, PerceptronLearner
from pipes import CustomAttributeAdder,Scaling,RandomAttributeAdder,Transformer,ClfSwitcher

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import VotingClassifier

# use sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import itertools

### load data

In [2]:
df_train = pd.read_csv('train.csv', sep='|')
df_test = pd.read_csv('test.csv', sep='|')

# split label and predictors
X_train, y_train = df_train.drop(columns=['fraud',]), df_train['fraud']
X_test = df_test

### create pipeline

In [3]:
feature_list = ['scannedLineItemsTotal',
                #'valuePerLineItem',
                #'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                'totalScanTimeInSeconds/trustLevel',
                'trustLevel_Log', 
               ]

featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),                
    #("RandomAttributeAdder", RandomAttributeAdder())         #  This class is still void
    ])


preprocessing_pipeline = Pipeline([
    #("transformer", Transformer()),                           # This class is still void
    ("scaler", Scaling(strategy='Standard')),
])

data_preparation_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline)
])

### prepare X_train and X_test

In [4]:
data_preparation_pipeline.fit_transform(X_train)
data_preparation_pipeline.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([[ 2.90890804e-01, -8.48396806e-01,  1.33309604e+00, ...,
         1.67119810e-01, -6.91222611e-01,  4.77364223e-01],
       [-2.94705662e-01,  1.67163106e-01,  3.11740617e-01, ...,
         5.39164642e-01, -1.04481221e-01,  1.46764073e-03],
       [-1.46589859e+00, -1.42520458e+00, -1.24644115e+00, ...,
        -2.79333988e-01, -5.69386827e-01, -1.81590606e+00],
       ...,
       [-2.94705662e-01, -1.16422270e+00, -1.54394685e+00, ...,
         3.15937743e-01, -7.36322045e-01,  1.46764073e-03],
       [-8.80302128e-01,  1.15057308e+00, -5.54801009e-01, ...,
        -6.51378820e-01,  1.04611528e+00, -6.69270920e-01],
       [-2.94705662e-01,  1.02197331e+00,  2.41780022e-01, ...,
         1.84132155e+00,  3.01189308e-01,  1.46764073e-03]])

In [5]:
X_train.columns

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'scannedLineItemsTotal',
       'lineItemVoids*scansWithoutRegistration',
       'totalScanTimeInSeconds/trustLevel', 'trustLevel_Log'],
      dtype='object')

In [6]:
X_test.columns

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'scannedLineItemsTotal',
       'lineItemVoids*scansWithoutRegistration',
       'totalScanTimeInSeconds/trustLevel', 'trustLevel_Log'],
      dtype='object')

### train best model from presentation 2 on train set
### i.e. Logistic Regression with C = 40

In [7]:
lr = LogisticRegression(C=40, solver='liblinear', random_state=42)
lr.fit(X_train, y_train)

LogisticRegression(C=40, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

### predict fraudsters for the test set

In [8]:
test_predict = lr.predict(X_test)

### check percentage of fraudsters predicted and in train

In [9]:
sum(test_predict)/len(test_predict)

0.04737603915514504

In [10]:
sum(df_train.fraud)/len(df_train.fraud)

0.05534858967535923

- 4.7% predicted vs. 5.5% in train
- seems to be a reasonable result since we want to be careful with predicting someone as a fraud
- maybe it makes sense to always check our models on the test set to see how this percentage changes?

### check how many fraudsters per trustLevel

In [11]:
print(sum(test_predict[df_test.trustLevel == 1]))
print(sum(test_predict[df_test.trustLevel == 2]))
print(sum(test_predict[df_test.trustLevel == 3]))
print(sum(test_predict[df_test.trustLevel == 4]))
print(sum(test_predict[df_test.trustLevel == 5]))
print(sum(test_predict[df_test.trustLevel == 6]))

19717
3882
0
0
0
0
