### Packages and Libraries

In [70]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB as NB
from xgboost import XGBClassifier

from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTE

In [2]:
# custom imports
from funcs import plot_cv_confidence_vs_profit, score_dmc_profit,dmc_profit,cv_preds_and_confusion_matrix,cv_profits_for_models, profit_scoring
from customClassifiers import CustomModelWithThreshold, TrustHard, PerceptronLearner
from pipes import CustomAttributeAdder,Scaling,RandomAttributeAdder,Transformer,ClfSwitcher

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import VotingClassifier

In [3]:
# use sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import itertools

# Load the Data

In [4]:
# load the data
train = pd.read_csv('train.csv' ,delimiter="|")
test = pd.read_csv('test.csv', delimiter="|")
X_train, y_train = train.drop(columns='fraud'), train['fraud']
#y_test = test.pop('fraud')

In [5]:
featureGeneration_pipeline = Pipeline([
    ("attribs_adder", CustomAttributeAdder()),    # returns pd.dataframe
    ("RandomAttributeAdder", RandomAttributeAdder())
    ])


preprocessing_pipeline = Pipeline([
    ("transformer", Transformer()),                # This class is still void
    ("scaler", Scaling(strategy='Standard')),
])


model_training_pipeline = Pipeline([
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline),
    ('classifier', ClfSwitcher())
])

In [6]:
X_train_prepared = featureGeneration_pipeline.fit_transform(X_train)
X_train_prepared = preprocessing_pipeline.fit_transform(X_train_prepared)
X_train_prepared.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(1879, 15)

### Learn a Model to Classify the Records from the Test Set

In [7]:
from sklearn.metrics import f1_score

def cross_val_imbalanced_f1(classifier, X, y, sampler):
    kf = StratifiedKFold(n_splits=10, random_state=42)
    cross_val_f1_lst = []
    
    X = pd.DataFrame(X)

    for train_index, test_index in kf.split(X, y):
        # keeping validation set apart and oversampling in each iteration using smote 
        train, test = X.iloc[train_index], X.iloc[test_index]
        target_train, target_test = y.iloc[train_index], y.iloc[test_index]
        X_train_res, y_train_res = sampler.fit_sample(train, target_train.ravel())

        # training the model on oversampled 4 folds of training set
        classifier.fit(pd.DataFrame(X_train_res), y_train_res)
        # testing on 1 fold of validation set
        test_preds = classifier.predict(test)
        cross_val_f1_lst.append(f1_score(target_test, test_preds))
        
    return np.mean(cross_val_f1_lst)

#### SGD

In [9]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    
    # 3 options, l1 by default
    penalty_str = 'l1'
    if int(penalty) == 0:
        penalty_str = 'l2'
    elif int(penalty) == 1:
        penalty_str = 'elasticnet'
    
    # 3 options, modified_huber by default
    loss_str = 'modified_huber'
    if int(loss) == 0:
        loss_str = 'hinge'
    elif int(loss) == 1:
        loss_str = 'log'

        
    model = SGDClassifier(alpha=alpha, l1_ratio=l1_ratio, tol=tol, penalty=penalty_str, loss=loss_str, random_state=231)
    
    return cross_val_imbalanced_f1(model, X_train_prepared, y_train, SMOTE(random_state=42))

In [10]:
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [15]:
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
optimization_sgd.maximize(n_iter=1000, init_points=100)

|   iter    |  target   |   alpha   | l1_ratio  |   loss    |  penalty  |    tol    |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.4621  [0m | [0m 0.7815  [0m | [0m 0.5842  [0m | [0m 1.28    [0m | [0m 0.946   [0m | [0m 0.08269 [0m |
| [95m 2       [0m | [95m 0.5754  [0m | [95m 0.9034  [0m | [95m 0.03894 [0m | [95m 2.746   [0m | [95m 0.1815  [0m | [95m 0.01807 [0m |
| [95m 3       [0m | [95m 0.8764  [0m | [95m 0.0723  [0m | [95m 0.4588  [0m | [95m 2.125   [0m | [95m 1.45    [0m | [95m 0.002545[0m |
| [0m 4       [0m | [0m 0.6529  [0m | [0m 0.0669  [0m | [0m 0.4259  [0m | [0m 0.6577  [0m | [0m 2.43    [0m | [0m 0.01401 [0m |
| [0m 5       [0m | [0m 0.4837  [0m | [0m 0.7512  [0m | [0m 0.2791  [0m | [0m 2.847   [0m | [0m 2.655   [0m | [0m 0.06102 [0m |
| [0m 6       [0m | [0m 0.4514  [0m | [0m 0.7791  [0m | [0m 0.04156 [0m | [0m 2.359   [0m | [

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 16      [0m | [0m 0.0541  [0m | [0m 0.9719  [0m | [0m 0.9393  [0m | [0m 0.3421  [0m | [0m 2.91    [0m | [0m 0.05437 [0m |
| [0m 17      [0m | [0m 0.5169  [0m | [0m 0.4692  [0m | [0m 0.433   [0m | [0m 0.6649  [0m | [0m 0.9394  [0m | [0m 0.09047 [0m |
| [0m 18      [0m | [0m 0.456   [0m | [0m 0.9692  [0m | [0m 0.5473  [0m | [0m 0.9498  [0m | [0m 0.8481  [0m | [0m 0.06274 [0m |
| [0m 19      [0m | [0m 0.5336  [0m | [0m 0.147   [0m | [0m 0.4413  [0m | [0m 1.411   [0m | [0m 1.044   [0m | [0m 0.06648 [0m |
| [0m 20      [0m | [0m 0.4562  [0m | [0m 0.9434  [0m | [0m 0.3161  [0m | [0m 0.3041  [0m | [0m 0.8433  [0m | [0m 0.006185[0m |
| [0m 21      [0m | [0m 0.607   [0m | [0m 0.7615  [0m | [0m 0.912   [0m | [0m 2.34    [0m | [0m 0.128   [0m | [0m 0.009628[0m |
| [0m 22      [0m | [0m 0.4459  [0m | [0m 0.9636  [0m | [0m 0.1826  [0m | [0m 1.268   [0m | [0m 0.2957  [0m | [0m 0.0392  [0m |
| [0m

  'precision', 'predicted', average, warn_for)


| [0m 28      [0m | [0m 0.6008  [0m | [0m 0.07705 [0m | [0m 0.02587 [0m | [0m 1.597   [0m | [0m 0.1616  [0m | [0m 0.0683  [0m |
| [0m 29      [0m | [0m 0.4957  [0m | [0m 0.2012  [0m | [0m 0.8356  [0m | [0m 0.3355  [0m | [0m 1.104   [0m | [0m 0.09121 [0m |
| [0m 30      [0m | [0m 0.3924  [0m | [0m 0.4478  [0m | [0m 0.2655  [0m | [0m 0.8961  [0m | [0m 2.243   [0m | [0m 0.008349[0m |
| [0m 31      [0m | [0m 0.4646  [0m | [0m 0.8114  [0m | [0m 0.3955  [0m | [0m 0.9163  [0m | [0m 0.6078  [0m | [0m 0.09837 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 32      [0m | [0m 0.07925 [0m | [0m 0.4882  [0m | [0m 0.1755  [0m | [0m 1.785   [0m | [0m 2.079   [0m | [0m 0.03995 [0m |
| [0m 33      [0m | [0m 0.4898  [0m | [0m 0.5341  [0m | [0m 0.4075  [0m | [0m 1.62    [0m | [0m 0.4151  [0m | [0m 0.02291 [0m |
| [0m 34      [0m | [0m 0.6303  [0m | [0m 0.07316 [0m | [0m 0.9006  [0m | [0m 0.528   [0m | [0m 2.675   [0m | [0m 0.09519 [0m |
| [0m 35      [0m | [0m 0.4908  [0m | [0m 0.4674  [0m | [0m 0.4513  [0m | [0m 1.853   [0m | [0m 1.306   [0m | [0m 0.02686 [0m |
| [0m 36      [0m | [0m 0.462   [0m | [0m 0.7434  [0m | [0m 0.07095 [0m | [0m 1.745   [0m | [0m 1.419   [0m | [0m 0.005491[0m |
| [0m 37      [0m | [0m 0.3957  [0m | [0m 0.3392  [0m | [0m 0.2243  [0m | [0m 1.128   [0m | [0m 2.874   [0m | [0m 0.03449 [0m |
| [0m 38      [0m | [0m 0.4849  [0m | [0m 0.5454  [0m | [0m 0.2823  [0m | [0m 1.24    [0m | [0m 1.706   [0m | [0m 0.03295 [0m |
| [0m

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 46      [0m | [0m 0.7419  [0m | [0m 0.03231 [0m | [0m 0.2048  [0m | [0m 0.5945  [0m | [0m 0.8885  [0m | [0m 0.05676 [0m |
| [0m 47      [0m | [0m 0.0541  [0m | [0m 0.9078  [0m | [0m 0.3622  [0m | [0m 1.655   [0m | [0m 2.592   [0m | [0m 0.01883 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 48      [0m | [0m 0.6335  [0m | [0m 0.1482  [0m | [0m 0.4535  [0m | [0m 0.007191[0m | [0m 0.6033  [0m | [0m 0.008541[0m |
| [0m 49      [0m | [0m 0.647   [0m | [0m 0.07578 [0m | [0m 0.6292  [0m | [0m 0.738   [0m | [0m 1.574   [0m | [0m 0.09411 [0m |
| [0m 50      [0m | [0m 0.8741  [0m | [0m 0.04136 [0m | [0m 0.7879  [0m | [0m 2.846   [0m | [0m 0.8865  [0m | [0m 0.01983 [0m |
| [0m 51      [0m | [0m 0.4069  [0m | [0m 0.4235  [0m | [0m 0.3695  [0m | [0m 0.5758  [0m | [0m 2.266   [0m | [0m 0.01336 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 52      [0m | [0m 0.0541  [0m | [0m 0.998   [0m | [0m 0.5708  [0m | [0m 1.4     [0m | [0m 1.094   [0m | [0m 0.06923 [0m |
| [0m 53      [0m | [0m 0.6668  [0m | [0m 0.08546 [0m | [0m 0.9574  [0m | [0m 0.1977  [0m | [0m 0.8879  [0m | [0m 0.09676 [0m |
| [0m 54      [0m | [0m 0.4927  [0m | [0m 0.714   [0m | [0m 0.7338  [0m | [0m 2.171   [0m | [0m 2.914   [0m | [0m 0.06863 [0m |
| [0m 55      [0m | [0m 0.4945  [0m | [0m 0.434   [0m | [0m 0.8846  [0m | [0m 1.449   [0m | [0m 0.5984  [0m | [0m 0.01101 [0m |
| [0m 56      [0m | [0m 0.4881  [0m | [0m 0.7338  [0m | [0m 0.3263  [0m | [0m 2.677   [0m | [0m 2.793   [0m | [0m 0.05477 [0m |
| [0m 57      [0m | [0m 0.4262  [0m | [0m 0.3203  [0m | [0m 0.8574  [0m | [0m 0.8473  [0m | [0m 1.455   [0m | [0m 0.001987[0m |
| [0m 58      [0m | [0m 0.4841  [0m | [0m 0.5358  [0m | [0m 0.358   [0m | [0m 1.938   [0m | [0m 1.273   [0m | [0m 0.06128 [0m |
| [0m

  'precision', 'predicted', average, warn_for)


| [0m 64      [0m | [0m 0.5022  [0m | [0m 0.4118  [0m | [0m 0.5424  [0m | [0m 1.315   [0m | [0m 0.1382  [0m | [0m 0.0472  [0m |
| [0m 65      [0m | [0m 0.4008  [0m | [0m 0.3726  [0m | [0m 0.7975  [0m | [0m 1.704   [0m | [0m 1.75    [0m | [0m 0.07076 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 66      [0m | [0m 0.1246  [0m | [0m 0.5301  [0m | [0m 0.9044  [0m | [0m 1.314   [0m | [0m 1.913   [0m | [0m 0.07356 [0m |
| [0m 67      [0m | [0m 0.498   [0m | [0m 0.2012  [0m | [0m 0.8248  [0m | [0m 0.3446  [0m | [0m 1.253   [0m | [0m 0.03034 [0m |
| [0m 68      [0m | [0m 0.8155  [0m | [0m 0.01347 [0m | [0m 0.5512  [0m | [0m 0.8074  [0m | [0m 1.007   [0m | [0m 0.05728 [0m |
| [0m 69      [0m | [0m 0.5058  [0m | [0m 0.2654  [0m | [0m 0.6499  [0m | [0m 1.14    [0m | [0m 1.102   [0m | [0m 0.06328 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 70      [0m | [0m 0.0541  [0m | [0m 0.7437  [0m | [0m 0.6021  [0m | [0m 1.027   [0m | [0m 2.293   [0m | [0m 0.007132[0m |
| [0m 71      [0m | [0m 0.6394  [0m | [0m 0.5364  [0m | [0m 0.5156  [0m | [0m 2.641   [0m | [0m 0.3479  [0m | [0m 0.009225[0m |
| [95m 72      [0m | [95m 0.9013  [0m | [95m 0.01982 [0m | [95m 0.6282  [0m | [95m 2.534   [0m | [95m 1.899   [0m | [95m 0.0483  [0m |
| [0m 73      [0m | [0m 0.3382  [0m | [0m 0.9509  [0m | [0m 0.7544  [0m | [0m 0.8791  [0m | [0m 1.954   [0m | [0m 0.08164 [0m |


  'precision', 'predicted', average, warn_for)


| [0m 74      [0m | [0m 0.4166  [0m | [0m 0.3241  [0m | [0m 0.7589  [0m | [0m 1.594   [0m | [0m 1.149   [0m | [0m 0.02788 [0m |
| [0m 75      [0m | [0m 0.4898  [0m | [0m 0.5293  [0m | [0m 0.5685  [0m | [0m 1.199   [0m | [0m 0.04264 [0m | [0m 0.008818[0m |
| [0m 76      [0m | [0m 0.4649  [0m | [0m 0.7532  [0m | [0m 0.274   [0m | [0m 1.816   [0m | [0m 1.602   [0m | [0m 0.007338[0m |
| [0m 77      [0m | [0m 0.6541  [0m | [0m 0.4172  [0m | [0m 0.05366 [0m | [0m 2.889   [0m | [0m 0.8389  [0m | [0m 0.07926 [0m |
| [0m 78      [0m | [0m 0.4701  [0m | [0m 0.7194  [0m | [0m 0.9006  [0m | [0m 1.485   [0m | [0m 0.8284  [0m | [0m 0.05561 [0m |
| [0m 79      [0m | [0m 0.5518  [0m | [0m 0.3331  [0m | [0m 0.6955  [0m | [0m 0.3536  [0m | [0m 0.3571  [0m | [0m 0.001816[0m |
| [0m 80      [0m | [0m 0.5149  [0m | [0m 0.6124  [0m | [0m 0.6823  [0m | [0m 2.857   [0m | [0m 1.077   [0m | [0m 0.06702 [0m |
| [0m

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 86      [0m | [0m 0.6067  [0m | [0m 0.7203  [0m | [0m 0.578   [0m | [0m 2.651   [0m | [0m 0.1701  [0m | [0m 0.0407  [0m |
| [0m 87      [0m | [0m 0.8854  [0m | [0m 0.03448 [0m | [0m 0.5745  [0m | [0m 2.293   [0m | [0m 0.9431  [0m | [0m 0.01846 [0m |


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 88      [0m | [0m 0.2043  [0m | [0m 0.4752  [0m | [0m 0.7162  [0m | [0m 1.068   [0m | [0m 2.008   [0m | [0m 0.06807 [0m |
| [0m 89      [0m | [0m 0.7824  [0m | [0m 0.1456  [0m | [0m 0.177   [0m | [0m 2.464   [0m | [0m 1.684   [0m | [0m 0.06217 [0m |
| [0m 90      [0m | [0m 0.4515  [0m | [0m 0.8004  [0m | [0m 0.5657  [0m | [0m 0.2266  [0m | [0m 1.751   [0m | [0m 0.04191 [0m |
| [0m 91      [0m | [0m 0.5091  [0m | [0m 0.5537  [0m | [0m 0.1614  [0m | [0m 2.67    [0m | [0m 2.261   [0m | [0m 0.05516 [0m |
| [0m 92      [0m | [0m 0.6396  [0m | [0m 0.524   [0m | [0m 0.9729  [0m | [0m 2.197   [0m | [0m 0.4341  [0m | [0m 0.09636 [0m |
| [0m 93      [0m | [0m 0.4863  [0m | [0m 0.6416  [0m | [0m 0.2216  [0m | [0m 0.2052  [0m | [0m 0.000529[0m | [0m 0.002023[0m |
| [0m 94      [0m | [0m 0.4238  [0m | [0m 0.894   [0m | [0m 0.9404  [0m | [0m 2.433   [0m | [0m 1.283   [0m | [0m 0.03264 [0m |
| [0m

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


| [0m 96      [0m | [0m 0.4663  [0m | [0m 0.7659  [0m | [0m 0.8777  [0m | [0m 2.652   [0m | [0m 2.413   [0m | [0m 0.07136 [0m |
| [0m 97      [0m | [0m 0.4059  [0m | [0m 0.915   [0m | [0m 0.3332  [0m | [0m 1.57    [0m | [0m 1.965   [0m | [0m 0.01689 [0m |
| [0m 98      [0m | [0m 0.5318  [0m | [0m 0.2423  [0m | [0m 0.5047  [0m | [0m 1.479   [0m | [0m 0.4505  [0m | [0m 0.04001 [0m |
| [0m 99      [0m | [0m 0.3723  [0m | [0m 0.6926  [0m | [0m 0.4425  [0m | [0m 0.05789 [0m | [0m 2.716   [0m | [0m 0.09811 [0m |


  'precision', 'predicted', average, warn_for)


| [0m 100     [0m | [0m 0.6659  [0m | [0m 0.3861  [0m | [0m 0.101   [0m | [0m 2.326   [0m | [0m 0.7574  [0m | [0m 0.00476 [0m |
| [95m 101     [0m | [95m 0.9136  [0m | [95m 1e-06   [0m | [95m 1.0     [0m | [95m 3.0     [0m | [95m 3.0     [0m | [95m 0.1     [0m |
| [0m 102     [0m | [0m 0.9004  [0m | [0m 1e-06   [0m | [0m 1.0     [0m | [0m 3.0     [0m | [0m 0.0     [0m | [0m 0.1     [0m |
| [95m 103     [0m | [95m 0.9278  [0m | [95m 1e-06   [0m | [95m 0.0     [0m | [95m 3.0     [0m | [95m 3.0     [0m | [95m 1e-09   [0m |
| [0m 104     [0m | [0m 0.9104  [0m | [0m 1e-06   [0m | [0m 0.0     [0m | [0m 0.0     [0m | [0m 3.0     [0m | [0m 0.1     [0m |
| [0m 105     [0m | [0m 0.9004  [0m | [0m 1e-06   [0m | [0m 0.0     [0m | [0m 3.0     [0m | [0m 0.0     [0m | [0m 1e-09   [0m |
| [0m 106     [0m | [0m 0.9136  [0m | [0m 1e-06   [0m | [0m 1.0     [0m | [0m 2.056   [0m | [0m 3.0     [0m | [0m 0.1    

KeyboardInterrupt: 

In [16]:
optimization_sgd.max

{'target': 0.9395534554436157,
 'params': {'alpha': 1e-06,
  'l1_ratio': 0.8573169692882996,
  'loss': 1.790135476441257,
  'penalty': 1.950482150001914,
  'tol': 1e-09}}

In [20]:
sgd = SGDClassifier(alpha=1e-6, l1_ratio=0.8573169692882996, loss='log', penalty='elasticnet', tol=1e-09)
cross_val_imbalanced_f1(sgd, X_train_prepared, y_train, SMOTE(random_state=42))

0.9443153602055204

In [21]:
sgd.fit(X_train_prepared, y_train)

SGDClassifier(alpha=1e-06, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.8573169692882996, learning_rate='optimal', loss='log',
       max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       tol=1e-09, validation_fraction=0.1, verbose=0, warm_start=False)

### Generate New Training Set

In [22]:
X_test_prepared = featureGeneration_pipeline.fit_transform(test)
X_test_prepared = preprocessing_pipeline.fit_transform(X_test_prepared)
X_test_prepared.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(498121, 15)

In [24]:
sgd_pred = sgd.decision_function(X_test_prepared)

In [29]:
sgd_pred.max()

1358.780357178902

In [30]:
sgd_pred.min()

-12223.419243189055

In [59]:
augment_pos = test.loc[np.where(sgd_pred > 1000)[0]]
augment_pos['fraud'] = 1
augment_neg = test.loc[np.where(sgd_pred < -2900)[0]]
augment_neg['fraud'] = 0

In [60]:
len(augment_pos) / (len(augment_pos) + len(augment_neg)) # target label distribution is approx. the same 
                                                         # as in the original training set

0.05053052704818229

In [61]:
augment_df = pd.concat([augment_pos, augment_neg], ignore_index=True)
augment_df.shape

(11498, 16)

### Train the Models and Check the Performance

In [62]:
X_train, y_train = augment_df.drop('fraud', axis=1), augment_df.fraud
X_test, y_test = train.drop('fraud', axis=1), train.fraud

In [65]:
X_train_prepared = preprocessing_pipeline.fit_transform(X_train)

X_test = featureGeneration_pipeline.transform(X_test)
X_test_prepared = preprocessing_pipeline.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self._scaler.transform(X)


In [71]:
cv = StratifiedKFold(n_splits=10, random_state=42)

def profit_scorer(y, y_pred):
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))

profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

In [75]:
def cross_val_imbalanced(classifier, X, y, sampler):
    kf = StratifiedKFold(n_splits=10, random_state=42)
    cross_val_profit_lst = []
    
    X = pd.DataFrame(X)

    for train_index, test_index in kf.split(X, y):
        # keeping validation set apart and oversampling in each iteration using smote 
        train, test = X.iloc[train_index], X.iloc[test_index]
        target_train, target_test = y.iloc[train_index], y.iloc[test_index]
        X_train_res, y_train_res = sampler.fit_sample(train, target_train.ravel())

        # training the model on oversampled 4 folds of training set
        classifier.fit(pd.DataFrame(X_train_res), y_train_res)
        # testing on 1 fold of validation set
        test_preds = classifier.predict(test)
        cross_val_profit_lst.append(profit_scorer(target_test, test_preds))
        
#     print ('Cross validated profit: {}'.format(np.sum(cross_val_profit_lst)))
    return np.mean(cross_val_profit_lst)

#### SGDClassifier

In [76]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    
    # 3 options, l1 by default
    penalty_str = 'l1'
    if int(penalty) == 0:
        penalty_str = 'l2'
    elif int(penalty) == 1:
        penalty_str = 'elasticnet'
    
    # 3 options, modified_huber by default
    loss_str = 'modified_huber'
    if int(loss) == 0:
        loss_str = 'hinge'
    elif int(loss) == 1:
        loss_str = 'log'
        
    
    model = SGDClassifier(alpha=alpha, l1_ratio=l1_ratio, tol=tol, penalty=penalty_str, loss=loss_str, random_state=231)
    
    
    return cross_val_imbalanced(model, X_train_prepared, y_train, SMOTE(random_state=42))

In [77]:
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [78]:
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
optimization_sgd.maximize(n_iter=1000, init_points=100)

|   iter    |  target   |   alpha   | l1_ratio  |   loss    |  penalty  |    tol    |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 225.5   [0m | [0m 0.7815  [0m | [0m 0.5842  [0m | [0m 1.28    [0m | [0m 0.946   [0m | [0m 0.08269 [0m |
| [95m 2       [0m | [95m 290.5   [0m | [95m 0.9034  [0m | [95m 0.03894 [0m | [95m 2.746   [0m | [95m 0.1815  [0m | [95m 0.01807 [0m |
| [0m 3       [0m | [0m 290.5   [0m | [0m 0.0723  [0m | [0m 0.4588  [0m | [0m 2.125   [0m | [0m 1.45    [0m | [0m 0.002545[0m |
| [0m 4       [0m | [0m 290.5   [0m | [0m 0.0669  [0m | [0m 0.4259  [0m | [0m 0.6577  [0m | [0m 2.43    [0m | [0m 0.01401 [0m |
| [0m 5       [0m | [0m 290.5   [0m | [0m 0.7512  [0m | [0m 0.2791  [0m | [0m 2.847   [0m | [0m 2.655   [0m | [0m 0.06102 [0m |
| [0m 6       [0m | [0m 290.5   [0m | [0m 0.7791  [0m | [0m 0.04156 [0m | [0m 2.359   [0m | [0m 2.70

KeyboardInterrupt: 

In [79]:
optimization_sgd.max

{'target': 290.5,
 'params': {'alpha': 0.9034295731744418,
  'l1_ratio': 0.03893810541480702,
  'loss': 2.745667790244739,
  'penalty': 0.18151621259516437,
  'tol': 0.0180726867014855}}

In [84]:
# Evaluation on the original training set
sgd = SGDClassifier(alpha=0.9034295731744418, l1_ratio=0.03893810541480702, tol=0.0180726867014855,
                    loss='modified_huber', penalty='l2')
sgd.fit(X_train_prepared, y_train)
sgd_pred = sgd.predict(X_test_prepared)
profit_scorer(y_test, sgd_pred)

-740

#### Random Forest

In [81]:
def evaluateRandomForest(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    
    model = RandomForestClassifier(n_estimators=int(n_estimators), 
                                   max_depth=int(max_depth), 
                                   min_samples_split=int(min_samples_split), 
                                   min_samples_leaf=int(min_samples_leaf),
                                   random_state=231)
    
    return cross_val_imbalanced_f1(model, X_train_prepared, y_train, SMOTE(random_state=42))

In [82]:
params_random_forest = {
    'n_estimators': (10, 500),
    'max_depth': (1, 10),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20)
}

In [85]:
optimization_rf = BayesianOptimization(evaluateRandomForest, params_random_forest, random_state=231)
optimization_rf.maximize(n_iter=100, init_points=25)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 1.0     [0m | [0m 8.033   [0m | [0m 12.1    [0m | [0m 9.681   [0m | [0m 164.5   [0m |
| [0m 2       [0m | [0m 1.0     [0m | [0m 8.442   [0m | [0m 18.17   [0m | [0m 2.701   [0m | [0m 458.5   [0m |
| [0m 3       [0m | [0m 0.9915  [0m | [0m 1.545   [0m | [0m 4.434   [0m | [0m 3.301   [0m | [0m 234.8   [0m |
| [0m 4       [0m | [0m 1.0     [0m | [0m 7.375   [0m | [0m 10.18   [0m | [0m 2.458   [0m | [0m 42.78   [0m |
| [0m 5       [0m | [0m 1.0     [0m | [0m 4.833   [0m | [0m 5.166   [0m | [0m 16.58   [0m | [0m 78.63   [0m |
| [0m 6       [0m | [0m 1.0     [0m | [0m 7.761   [0m | [0m 6.303   [0m | [0m 19.08   [0m | [0m 443.7   [0m |
| [0m 7       [0m | [0m 1.0     [0m | [0m 6.491   [0m | [0m 15.8    [0m | [0m 2.748   [0m | [0m 395.3   [0m 

KeyboardInterrupt: 

In [86]:
optimization_rf.max

{'target': 1.0,
 'params': {'max_depth': 8.03344636745938,
  'min_samples_leaf': 12.099973205625322,
  'min_samples_split': 9.68078126340425,
  'n_estimators': 164.51890050855397}}

In [87]:
# Evaluation on the original training set
rf = RandomForestClassifier(max_depth=8, min_samples_leaf=12, min_samples_split=9, n_estimators=164)
rf.fit(X_train_prepared, y_train)
rf_pred = rf.predict(X_test_prepared)
profit_scorer(y_test, rf_pred)

-3040

#### XGBoost

In [88]:
def evaluateXgb(max_depth, n_estimators, gamma, reg_alpha, reg_lambda):
    
    model = XGBClassifier(max_depth=int(max_depth), n_estimators=int(n_estimators), gamma=gamma,
                          reg_alpha=reg_alpha, reg_lambda=reg_lambda, n_jobs=-1, random_state=231)
    
    return cross_val_imbalanced(model, X_train_prepared, y_train, SMOTE(random_state=42))

In [89]:
params_xgb = {
    'max_depth': (1, 10),
    'n_estimators': (25, 500),
    'gamma': (0, 1),
    'reg_alpha': (1e-9, 0.1),
    'reg_lambda': (1e-9, 0.1)
}

In [91]:
optimization_xgb = BayesianOptimization(evaluateXgb, params_xgb, random_state=231)
optimization_xgb.maximize(n_iter=100, init_points=25)

|   iter    |  target   |   gamma   | max_depth | n_esti... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 290.5   [0m | [0m 0.7815  [0m | [0m 6.258   [0m | [0m 227.7   [0m | [0m 0.03153 [0m | [0m 0.08269 [0m |
| [0m 2       [0m | [0m 290.5   [0m | [0m 0.9034  [0m | [0m 1.35    [0m | [0m 459.7   [0m | [0m 0.006051[0m | [0m 0.01807 [0m |
| [0m 3       [0m | [0m 290.5   [0m | [0m 0.0723  [0m | [0m 5.129   [0m | [0m 361.5   [0m | [0m 0.04832 [0m | [0m 0.002545[0m |
| [0m 4       [0m | [0m 290.5   [0m | [0m 0.0669  [0m | [0m 4.833   [0m | [0m 129.1   [0m | [0m 0.08101 [0m | [0m 0.01401 [0m |
| [0m 5       [0m | [0m 290.5   [0m | [0m 0.7512  [0m | [0m 3.512   [0m | [0m 475.8   [0m | [0m 0.0885  [0m | [0m 0.06102 [0m |
| [0m 6       [0m | [0m 290.5   [0m | [0m 0.7791  [0m | [0m 1.374   [0m | [0m 398.5   [0m | [0m 0.09025 [0

KeyboardInterrupt: 

In [92]:
optimization_xgb.max

{'target': 290.5,
 'params': {'gamma': 0.78149404082882,
  'max_depth': 6.2578820447698895,
  'n_estimators': 227.68728333983438,
  'reg_alpha': 0.031534470176196934,
  'reg_lambda': 0.08268882927323766}}

In [93]:
# Evaluation on the original training set
xgb = XGBClassifier(n_estimators=227, gamma=0.78149404082882, max_depth=6, 
                    reg_alpha=0.031534470176196934, reg_lambda=0.08268882927323766)
xgb.fit(X_train_prepared, y_train)
xgb_pred = xgb.predict(X_test_prepared)
profit_scorer(y_test, xgb_pred)

-4420

#### Logistic Regression

In [94]:
def evaluateLogReg(C):
    
    model = LogisticRegression(C=C, max_iter=1000, solver='lbfgs')
    
    return cross_val_imbalanced(model, X_train_prepared, y_train, SMOTE(random_state=42))

In [95]:
params_logreg = {
    'C': (0.0001, 10000)
}

In [96]:
optimization_logreg = BayesianOptimization(evaluateLogReg, params_logreg, random_state=231)
optimization_logreg.maximize(n_iter=1000, init_points=20)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 290.5   [0m | [0m 7.815e+0[0m |
| [0m 2       [0m | [0m 290.5   [0m | [0m 5.842e+0[0m |
| [0m 3       [0m | [0m 290.5   [0m | [0m 4.267e+0[0m |
| [0m 4       [0m | [0m 290.5   [0m | [0m 3.153e+0[0m |
| [0m 5       [0m | [0m 290.5   [0m | [0m 8.269e+0[0m |
| [0m 6       [0m | [0m 290.5   [0m | [0m 9.034e+0[0m |
| [0m 7       [0m | [0m 290.5   [0m | [0m 389.4   [0m |
| [0m 8       [0m | [0m 290.5   [0m | [0m 9.152e+0[0m |
| [0m 9       [0m | [0m 290.5   [0m | [0m 605.1   [0m |
| [0m 10      [0m | [0m 290.5   [0m | [0m 1.807e+0[0m |
| [0m 11      [0m | [0m 290.5   [0m | [0m 723.0   [0m |
| [0m 12      [0m | [0m 290.5   [0m | [0m 4.588e+0[0m |
| [0m 13      [0m | [0m 290.5   [0m | [0m 7.084e+0[0m |
| [0m 14      [0m | [0m 290.5   [0m | [0m 4.832e+0[0m |
| [0m 15      [0m | [0m 290.5   [0m | [0m 254.5   

KeyboardInterrupt: 

In [97]:
optimization_logreg.max

{'target': 290.5, 'params': {'C': 7814.940430138797}}

In [99]:
# Evaluation on the original training set
logreg = LogisticRegression(C=7814, max_iter=1000, solver='lbfgs')
logreg.fit(X_train_prepared, y_train)
logreg_pred = logreg.predict(X_test_prepared)
profit_scorer(y_test, logreg_pred)

-3665