In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import itertools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn import svm as SVM
from funcs import cv_profits_for_models, cv_preds_and_confusion_matrix, CustomModelWithThreshold
from funcs import profit_scorer, profit_scoring
from customClassifiers import OutlierRemover
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2

In [2]:
X_train = pd.read_csv('train.csv' ,delimiter="|")
X_test = pd.read_csv('test.csv', delimiter="|")

X_train['scannedLineItemsTotal'] = X_train['scannedLineItemsPerSecond'] * X_train['totalScanTimeInSeconds']
X_train['valuePerLineItem'] = X_train['grandTotal'] * X_train['scannedLineItemsTotal']
X_train['quantityModificationsPerLineItem'] = X_train['quantityModifications'] * X_train['scannedLineItemsTotal']
X_train['lineItemVoids*scansWithoutRegistration'] = X_train['lineItemVoids'] * X_train['scansWithoutRegistration']

In [3]:
X_test['scannedLineItemsTotal'] = X_test['scannedLineItemsPerSecond'] * X_test['totalScanTimeInSeconds']
X_test['valuePerLineItem'] = X_test['grandTotal'] * X_test['scannedLineItemsTotal']
X_test['quantityModificationsPerLineItem'] = X_test['quantityModifications'] * X_test['scannedLineItemsTotal']
X_test['lineItemVoids*scansWithoutRegistration'] = X_test['lineItemVoids'] * X_test['scansWithoutRegistration']


In [4]:
X_train1 = X_train[X_train['trustLevel']==1]
y1 = X_train1.pop('fraud')
l=X_train1.pop('trustLevel')

In [5]:
X_test1 = X_test[X_test['trustLevel']==1]
l=X_test1.pop('trustLevel')

In [6]:
X_train2 = X_train[X_train['trustLevel']==2]
y2 = X_train2.pop('fraud')
l=X_train2.pop('trustLevel')

In [7]:
X_test2 = X_test[X_test['trustLevel']==2]
l=X_test2.pop('trustLevel')

In [8]:
X_test_higher = X_test[X_test['trustLevel']>2]

# Trust Level 1 (Calculate Feature Importance with XGboost)

In [9]:
# normalize data first to prevent 0's in dataset
scaler = StandardScaler()
names_X_train1 = X_train1.columns
X_train1 = scaler.fit_transform(X_train1)


# for test transform with the scaler fitted on train
names_X_test1 = X_test1.columns
index_X_test1 = X_test1.index
X_test1 = scaler.transform(X_test1)

# generate features and rescale
polyFeatures = PolynomialFeatures(3, interaction_only=False)
X_train1_all = polyFeatures.fit_transform(X_train1)
X_train1_all = scaler.fit_transform(X_train1_all)

# for test transform with the scaler fitted on train
X_test1_all = polyFeatures.transform(X_test1)
X_test1_all = scaler.transform(X_test1_all)

# remove first var (constant term)
X_test1_all = X_test1_all[:,1:]
features_X_test1_all = polyFeatures.get_feature_names(input_features=names_X_test1)[1:]

# remove the first var because it is the constant term
X_train1_all = X_train1_all[:,1:]
features_X_train1_all = polyFeatures.get_feature_names(input_features=names_X_train1)[1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # Remove the CWD from sys.path while we load stuff.


In [10]:
# obtain feature importance by xgboost
xgb = XGBClassifier(num_estimator=100)
xgb.fit(X_train1_all, y1)
imp = xgb.feature_importances_

In [11]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [12]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [13]:
# add most important feature
X1_temporary = pd.DataFrame(X_train1_all[:,(list(imp.index))[0]])

features_to_use1 = [(list(imp.index))[0]]
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y1, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X1_temporary = pd.concat([X1_temporary,pd.Series(X_train1_all[:,featnum])], axis=1)
        features_to_use1.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-215
-135
-70
-30
45
65
90
100
140
180
190
230
250
270
290
310
320
330
340


In [14]:
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X1_temporary, y1, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.1, 5),
    'pred_threshold': (0.5, 0.6)
}

optimization_logreg1 = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg1.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 285.0   [0m | [0m 3.87    [0m | [0m 0.5105  [0m |
| [0m 2       [0m | [0m 275.0   [0m | [0m 3.955   [0m | [0m 0.543   [0m |
| [95m 3       [0m | [95m 300.0   [0m | [95m 0.6032  [0m | [95m 0.503   [0m |
| [0m 4       [0m | [0m 275.0   [0m | [0m 2.116   [0m | [0m 0.5565  [0m |
| [0m 5       [0m | [0m 275.0   [0m | [0m 3.121   [0m | [0m 0.5528  [0m |
| [95m 6       [0m | [95m 325.0   [0m | [95m 1.802   [0m | [95m 0.5736  [0m |
| [0m 7       [0m | [0m 275.0   [0m | [0m 3.826   [0m | [0m 0.5452  [0m |
| [0m 8       [0m | [0m 275.0   [0m | [0m 4.863   [0m | [0m 0.5601  [0m |
| [0m 9       [0m | [0m 275.0   [0m | [0m 3.682   [0m | [0m 0.5552  [0m |
| [0m 10      [0m | [0m 275.0   [0m | [0m 3.916   [0m | [0m 0.569   [0m |
| [95m 11      [0m | [95m 345.0   [0m | [95m 0.4046  [0m | [95m 0

| [0m 100     [0m | [0m 275.0   [0m | [0m 2.095   [0m | [0m 0.5499  [0m |
| [0m 101     [0m | [0m 375.0   [0m | [0m 0.7609  [0m | [0m 0.548   [0m |
| [0m 102     [0m | [0m 375.0   [0m | [0m 1.509   [0m | [0m 0.5866  [0m |
| [0m 103     [0m | [0m 375.0   [0m | [0m 0.7592  [0m | [0m 0.5437  [0m |
| [0m 104     [0m | [0m 375.0   [0m | [0m 0.7657  [0m | [0m 0.5443  [0m |


KeyboardInterrupt: 

In [15]:
optimization_logreg1.max

{'target': 375.0,
 'params': {'C': 0.7615716480588779, 'pred_threshold': 0.5397758971971408}}

In [16]:
# obtain selected feature names
features_select_1 = [features_X_train1_all[i] for i in features_to_use1]
features_select_1

['lineItemVoids^2 valuePerSecond',
 'scannedLineItemsTotal',
 'scannedLineItemsPerSecond scannedLineItemsTotal',
 'scannedLineItemsPerSecond valuePerLineItem^2',
 'valuePerLineItem^2 lineItemVoids*scansWithoutRegistration',
 'totalScanTimeInSeconds^2 lineItemVoids*scansWithoutRegistration',
 'valuePerSecond scannedLineItemsTotal^2',
 'scansWithoutRegistration scannedLineItemsTotal',
 'totalScanTimeInSeconds scannedLineItemsPerSecond lineItemVoidsPerPosition',
 'totalScanTimeInSeconds valuePerLineItem^2',
 'scannedLineItemsTotal^2',
 'valuePerSecond lineItemVoids*scansWithoutRegistration',
 'scannedLineItemsTotal valuePerLineItem^2',
 'quantityModifications^2 scannedLineItemsTotal',
 'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem',
 'lineItemVoids scansWithoutRegistration quantityModifications',
 'lineItemVoids quantityModifications quantityModificationsPerLineItem',
 'totalScanTimeInSeconds lineItemVoids^2',
 'lineItemVoids lineItemVoidsPerPosition',
 'totalScanTimeInSe

In [17]:
X_train1_all = pd.DataFrame(X_train1_all, columns=features_X_train1_all)
X_train1_selected = X_train1_all.loc[:,features_select_1]

In [18]:
X_test1_all = pd.DataFrame(X_test1_all, columns=features_X_test1_all)
X_test1_selected = X_test1_all.loc[:,features_select_1]
X_test1_selected.index = index_X_test1

### Final score for 'robustLogReg' in Trust=1

In [19]:
# verify the score obtained from bayesian optimization
C1 = optimization_logreg1.max['params']['C']
pred_threshold1 = optimization_logreg1.max['params']['pred_threshold']

clf1 = LogisticRegression(C=C1, solver='lbfgs', max_iter=10000)
clf1 = CustomModelWithThreshold(clf1,threshold=pred_threshold1)
# final score for trust=1
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf1,X_train1_selected, y1, scoring=profit_scoring, cv=cv)['test_score'])

375

In [20]:
# number of features used
len(X1_temporary.columns)

20

# Trust Level 2 (Feature Importance by Logistic Regression)

In [21]:
# normalize data first to prevent 0's in dataset
scaler = StandardScaler()
names_X_train2 = X_train2.columns
X_train2 = scaler.fit_transform(X_train2)


# for test transform with the scaler fitted on train
names_X_test2 = X_test2.columns
index_X_test2 = X_test2.index
X_test2 = scaler.transform(X_test2)

# generate features and rescale
polyFeatures = PolynomialFeatures(3, interaction_only=False)
X_train2_all = polyFeatures.fit_transform(X_train2)
X_train2_all = scaler.fit_transform(X_train2_all)

# for test transform with the scaler fitted on train
X_test2_all = polyFeatures.transform(X_test2)
X_test2_all = scaler.transform(X_test2_all)

# remove first var (constant term)
X_test2_all = X_test2_all[:,1:]
features_X_test2_all = polyFeatures.get_feature_names(input_features=names_X_test2)[1:]

# remove the first var because it is the constant term
X_train2_all = X_train2_all[:,1:]
features_X_train2_all = polyFeatures.get_feature_names(input_features=names_X_train2)[1:]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # Remove the CWD from sys.path while we load stuff.


In [22]:
# Feature Importance with logistic Regression
lr = LogisticRegression(C=20, solver='lbfgs')
lr.fit(X_train2_all, y2)
imp = lr.coef_[0]

In [23]:
# order the feature indices by importance
imp = pd.DataFrame(imp)
imp = imp.sort_values(by=0, ascending=False)

In [24]:
# choose starting model
model = CustomModelWithThreshold(LogisticRegression(C=10, solver='lbfgs', max_iter=300), 0.9)

cv = StratifiedKFold(n_splits=10, random_state=42)
last_score = -10000

In [25]:
# add most important feature
X2_temporary = pd.DataFrame(X_train2_all[:,(list(imp.index))[0]])

features_to_use2 = [(list(imp.index))[0]]
# iteratively add features one by one
for featnum in (list(imp.index))[1:]:
    X_check = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
    score = sum(cross_validate(model,X_check, y2, scoring=profit_scoring, cv=cv)['test_score'])
    # add the feature ultimatively if score improved
    if score > last_score:
        X2_temporary = pd.concat([X2_temporary,pd.Series(X_train2_all[:,featnum])], axis=1)
        features_to_use2.append(featnum)
        last_score = score    
        print(last_score)
    
# for test predictions use features_to_use to select the according features in the test set    

-75
-55
-45
-25
15
35


In [31]:
# dataset in function is X2
def evaluateLogReg(C, pred_threshold):
    clf = LogisticRegression(C=C, solver='lbfgs', max_iter=10000)
    clf = CustomModelWithThreshold(clf,threshold=pred_threshold)
    return sum(cross_validate(clf,X2_temporary, y2, scoring=profit_scoring, cv=cv)['test_score'])


params_logreg = {
    'C': (0.001, 50),
    'pred_threshold': (0.5, 1)
}

optimization_logreg = BayesianOptimization(evaluateLogReg, params_logreg)
optimization_logreg.maximize(n_iter=100, init_points=100)

|   iter    |  target   |     C     | pred_t... |
-------------------------------------------------
| [0m 1       [0m | [0m 35.0    [0m | [0m 40.34   [0m | [0m 0.881   [0m |
| [0m 2       [0m | [0m 10.0    [0m | [0m 44.83   [0m | [0m 0.8641  [0m |
| [95m 3       [0m | [95m 45.0    [0m | [95m 2.262   [0m | [95m 0.5954  [0m |
| [0m 4       [0m | [0m 35.0    [0m | [0m 28.01   [0m | [0m 0.9039  [0m |
| [0m 5       [0m | [0m 10.0    [0m | [0m 29.4    [0m | [0m 0.7576  [0m |
| [0m 6       [0m | [0m 25.0    [0m | [0m 12.3    [0m | [0m 0.9471  [0m |
| [0m 7       [0m | [0m 10.0    [0m | [0m 38.09   [0m | [0m 0.7607  [0m |
| [0m 8       [0m | [0m 10.0    [0m | [0m 39.75   [0m | [0m 0.6307  [0m |
| [0m 9       [0m | [0m 35.0    [0m | [0m 45.56   [0m | [0m 0.8975  [0m |
| [0m 10      [0m | [0m 10.0    [0m | [0m 30.41   [0m | [0m 0.827   [0m |
| [0m 11      [0m | [0m 10.0    [0m | [0m 19.46   [0m | [0m 0.7188  

| [0m 100     [0m | [0m 10.0    [0m | [0m 47.41   [0m | [0m 0.7219  [0m |
| [0m 101     [0m | [0m 45.0    [0m | [0m 2.256   [0m | [0m 0.5966  [0m |


KeyboardInterrupt: 

In [32]:
optimization_logreg.max

{'target': 45.0,
 'params': {'C': 2.2620169543165747, 'pred_threshold': 0.5954276044708}}

In [33]:
features_select_2 = [features_X_train2_all[i] for i in features_to_use2]
features_select_2

['scannedLineItemsTotal^2',
 'scannedLineItemsTotal^3',
 'totalScanTimeInSeconds scannedLineItemsTotal^2',
 'totalScanTimeInSeconds lineItemVoids scannedLineItemsTotal',
 'totalScanTimeInSeconds scannedLineItemsTotal valuePerLineItem',
 'totalScanTimeInSeconds scansWithoutRegistration valuePerLineItem',
 'scannedLineItemsTotal^2 lineItemVoids*scansWithoutRegistration']

In [34]:
X_train2_all = pd.DataFrame(X_train2_all, columns=features_X_train2_all)
X_train2_selected = X_train2_all.loc[:,features_select_2]

In [35]:
X_test2_all = pd.DataFrame(X_test2_all, columns=features_X_test2_all)
X_test2_selected = X_test2_all.loc[:,features_select_2]
X_test2_selected.index = index_X_test2

### Final score for 'robustLogReg' in Trust=2

In [36]:
# verify
C2 = optimization_logreg.max['params']['C']
pred_threshold2 = optimization_logreg.max['params']['pred_threshold']

clf2 = LogisticRegression(C=C2, solver='lbfgs', max_iter=10000)
clf2 = CustomModelWithThreshold(clf2,threshold=pred_threshold2)
# final score for trust=2
cv = StratifiedKFold(n_splits=10, random_state=42)
sum(cross_validate(clf2,X_train2_selected, y2, scoring=profit_scoring, cv=cv)['test_score'])

45

In [37]:
# number of features used
len(X2_temporary.columns)

7

## Make final Predictions

In [38]:
#check if the indices are disjunct

In [39]:
X_test1_selected.index

Int64Index([     2,     17,     32,     36,     45,     52,     57,     65,
                73,     74,
            ...
            498045, 498052, 498060, 498080, 498085, 498092, 498099, 498101,
            498112, 498117],
           dtype='int64', length=82713)

In [40]:
X_test2_selected.index

Int64Index([     7,     22,     37,     48,     70,     71,     77,     81,
                93,    101,
            ...
            498058, 498067, 498069, 498071, 498073, 498097, 498102, 498105,
            498108, 498119],
           dtype='int64', length=82913)

In [41]:
X_test_higher.index

Int64Index([     0,      1,      3,      4,      5,      6,      8,      9,
                10,     11,
            ...
            498107, 498109, 498110, 498111, 498113, 498114, 498115, 498116,
            498118, 498120],
           dtype='int64', length=332495)

In [42]:
#Trust=1
clf1.fit(X_train1_selected, y1)
X_test1_selected['preds'] = clf1.predict(X_test1_selected)
#Trust=2
clf2.fit(X_train2_selected, y2)
X_test2_selected['preds'] = clf2.predict(X_test2_selected)

In [43]:
#Higher Trust
X_test_higher['preds'] = 0 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [44]:
y_pred = pd.concat([X_test1_selected['preds'],X_test2_selected['preds'],X_test_higher['preds'],])

In [45]:
y_pred = y_pred.astype({'preds':'int'})

In [46]:
y_pred.sort_index(inplace=True)

## reload test set and concat the predictions to check if they make sense

In [47]:
X_test = pd.read_csv('test.csv', delimiter="|")
X_test['fraud'] = y_pred

In [48]:
sum((X_test['fraud']==1) & (X_test['trustLevel']==1)) / sum(X_test['trustLevel']==1)

0.23305889037998864

In [49]:
sum((X_test['fraud']==1) & (X_test['trustLevel']==2)) / sum(X_test['trustLevel']==2)

0.04796594020238081

In [50]:
sum((X_test['fraud']==1) & (X_test['trustLevel']>2)) / sum(X_test['trustLevel']>2)

0.0

In [51]:
X_test

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429,0
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,0
2,1,162,14.00,4,5,4,0.006173,0.086420,4.000000,0
3,5,532,84.79,9,3,4,0.026316,0.159380,0.642857,0
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526,0
5,5,1072,12.67,3,4,1,0.019590,0.011819,0.142857,0
6,3,259,93.75,0,7,0,0.100386,0.361969,0.000000,0
7,2,1528,47.35,2,9,5,0.009817,0.030988,0.133333,0
8,6,816,80.89,9,4,0,0.017157,0.099130,0.642857,0
9,4,16,31.91,7,7,4,1.312500,1.994375,0.333333,0


# The prediction proportions are very similar to the proportions in the train set. Therefore, save as csv.

In [52]:
y_pred.to_csv("finalPredictionGroup6.csv", index=False)