# Notebook for presentation 2

### Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB as NB
from xgboost import XGBClassifier

In [2]:
# custom imports
from funcs import plot_cv_confidence_vs_profit, score_dmc_profit,dmc_profit,cv_preds_and_confusion_matrix,cv_profits_for_models, profit_scoring
from customClassifiers import CustomModelWithThreshold, TrustHard, PerceptronLearner
from pipes import CustomAttributeAdder,Scaling,RandomAttributeAdder,Transformer,ClfSwitcher

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import VotingClassifier

In [3]:
# use sklearn pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import itertools

# Load the Data

In [4]:
# Read the pipe-delimited training and test files.
X_train = pd.read_csv('train.csv', sep="|")
X_test = pd.read_csv('test.csv', sep="|")

# Split the training frame into the label column and the remaining features.
y_train = X_train['fraud']
X_train = X_train.drop(columns='fraud')

In [5]:
X_train.head(1)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379


# Preprocess Data

In [6]:
# Names of the engineered features to generate. The list is handed to
# CustomAttributeAdder(featurelist=...) in the feature-generation pipeline;
# comment an entry out to leave that feature out.
feature_list = ['scannedLineItemsTotal',
                #'valuePerLineItem',
                #'quantityModificationsPerLineItem',
                'lineItemVoids*scansWithoutRegistration',
                'totalScanTimeInSeconds/trustLevel',
                'trustLevel_Log', 
               ]

**Note:** At the moment there are only two preprocessing steps: adding the newly designed features (see above) and scaling.

In [7]:
# Stage 1: feature generation, driven by the names in `feature_list`.
featureGeneration_pipeline = Pipeline(steps=[
    ("attribs_adder", CustomAttributeAdder(featurelist=feature_list)),
    # ("RandomAttributeAdder", RandomAttributeAdder()),  # placeholder: class is not implemented yet
])

# Stage 2: preprocessing, currently only scaling with the 'Standard' strategy.
preprocessing_pipeline = Pipeline(steps=[
    # ("transformer", Transformer()),  # placeholder: class is not implemented yet
    ("scaler", Scaling(strategy='Standard')),
])

In [8]:
# Chain feature generation and preprocessing into one data-preparation pipeline.
data_preparation_pipeline = Pipeline(steps=[
    ('feature_generation', featureGeneration_pipeline),
    ('preprocessing', preprocessing_pipeline),
])

# Fit on the training features and keep the transformed result for the tuning cells.
X_train_prepared = data_preparation_pipeline.fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
# Note that X_train has also been changed by calling fit_transform on it
# X_train can be used to visualize the features importance / correlation of the newly created features
X_train.head(5)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,scannedLineItemsTotal,lineItemVoids*scansWithoutRegistration,totalScanTimeInSeconds/trustLevel,trustLevel_Log
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,0,210.8,1.609438
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,10,36.0,1.098612
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,30,505.333333,1.098612
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,32,298.5,1.791759
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,21,86.0,1.609438


### Correlation

In [None]:
#...

# Parameter Tuning

### Evaluation 

In [10]:
# Nico's script
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

# 10-fold stratified splitter shared by the non-sampling objectives.
# `random_state` is intentionally not passed: without `shuffle=True` it has no
# effect (the folds are already deterministic), and scikit-learn >= 0.24 raises
# a ValueError when it is set on an unshuffled splitter.
cv = StratifiedKFold(n_splits=10)
def profit_scorer(y, y_pred):
    """Return the total profit of a set of predictions.

    Each (prediction, actual) pair is looked up in a fixed payoff table and
    the payoffs are summed.

    Parameters
    ----------
    y : iterable of {0, 1}
        True labels.
    y_pred : iterable of {0, 1}
        Predicted labels, aligned with ``y``.

    Returns
    -------
    int
        Sum of the payoffs over all pairs (0 for empty input).

    Raises
    ------
    KeyError
        If a (prediction, actual) pair is not one of the four 0/1 combinations.
    """
    # Keys are (prediction, actual).
    payoff = {
        (0, 0): 0,
        (0, 1): -5,
        (1, 0): -25,
        (1, 1): 5,
    }
    total = 0
    for actual, pred in zip(y, y_pred):
        total += payoff[(pred, actual)]
    return total
# Wrap profit_scorer as a scikit-learn scorer; a higher profit is better.
# NOTE(review): this rebinds `profit_scoring`, which is also imported from
# `funcs` in the custom-imports cell; from here on this definition is used.
profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

## 1. Tune Hyperparameters of Base Classifiers (without sampling)

In [11]:
# Install the library for Bayesian optimization from here: https://github.com/fmfn/BayesianOptimization
from bayes_opt import BayesianOptimization

In [12]:
X_train_prepared.shape

(1879, 13)

### 1.1 SGD Classifier

In [13]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    """Objective for the Bayesian search over SGDClassifier settings (no resampling).

    ``penalty`` and ``loss`` arrive as floats and are truncated with ``int()``
    to pick one of three categorical options each.

    Parameters
    ----------
    alpha, l1_ratio, tol : float
        Passed straight through to ``SGDClassifier``.
    penalty : float
        ``int(penalty)`` of 0 selects 'l2', 1 selects 'elasticnet', anything
        else selects 'l1'.
    loss : float
        ``int(loss)`` of 0 selects 'hinge', 1 selects 'log', anything else
        selects 'modified_huber'.

    Returns
    -------
    Sum of the per-fold test scores reported by ``cross_validate`` with the
    ``profit_scoring`` scorer on ``X_train_prepared`` / ``y_train``.
    """
    # Decode the categorical choices; the fallbacks are the third option.
    penalty_choices = {0: 'l2', 1: 'elasticnet'}
    penalty_str = penalty_choices.get(int(penalty), 'l1')

    loss_choices = {0: 'hinge', 1: 'log'}
    loss_str = loss_choices.get(int(loss), 'modified_huber')

    model = SGDClassifier(
        alpha=alpha,
        l1_ratio=l1_ratio,
        tol=tol,
        penalty=penalty_str,
        loss=loss_str,
        random_state=231,
    )

    # The classifier is trained on the original (unbalanced) folds.
    fold_results = cross_validate(model, X_train_prepared, y=y_train, cv=cv,
                                  scoring=profit_scoring)
    return sum(fold_results['test_score'])

In [14]:
# Search space for evaluateSgd: one (lower, upper) bound per argument.
# 'penalty' and 'loss' are categorical choices encoded as floats; evaluateSgd
# truncates them with int(), so each unit interval maps to one option.
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [15]:
# Maximise evaluateSgd over the bounds in params_sgd (seeded for repeatability).
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
# 20 initial exploration points, then 200 optimisation iterations.
optimization_sgd.maximize(n_iter=200, init_points=20)

|   iter    |  target   |   alpha   | l1_ratio  |   loss    |  penalty  |    tol    |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-520.0   [0m | [0m 0.7815  [0m | [0m 0.5842  [0m | [0m 1.28    [0m | [0m 0.946   [0m | [0m 0.08269 [0m |
| [95m 2       [0m | [95m-490.0   [0m | [95m 0.9034  [0m | [95m 0.03894 [0m | [95m 2.746   [0m | [95m 0.1815  [0m | [95m 0.01807 [0m |
| [95m 3       [0m | [95m 75.0    [0m | [95m 0.0723  [0m | [95m 0.4588  [0m | [95m 2.125   [0m | [95m 1.45    [0m | [95m 0.002545[0m |
| [0m 4       [0m | [0m-460.0   [0m | [0m 0.0669  [0m | [0m 0.4259  [0m | [0m 0.6577  [0m | [0m 2.43    [0m | [0m 0.01401 [0m |
| [0m 5       [0m | [0m-520.0   [0m | [0m 0.7512  [0m | [0m 0.2791  [0m | [0m 2.847   [0m | [0m 2.655   [0m | [0m 0.06102 [0m |
| [0m 6       [0m | [0m-520.0   [0m | [0m 0.7791  [0m | [0m 0.04156 [0m | [0m 2.359   [0m | [

| [0m 58      [0m | [0m 130.0   [0m | [0m 0.04729 [0m | [0m 0.4817  [0m | [0m 2.173   [0m | [0m 1.4     [0m | [0m 0.003144[0m |
| [0m 59      [0m | [0m 95.0    [0m | [0m 0.06657 [0m | [0m 0.4546  [0m | [0m 2.137   [0m | [0m 1.457   [0m | [0m 0.007613[0m |
| [0m 60      [0m | [0m 200.0   [0m | [0m 0.01015 [0m | [0m 0.5133  [0m | [0m 2.089   [0m | [0m 1.537   [0m | [0m 0.06189 [0m |
| [0m 61      [0m | [0m 210.0   [0m | [0m 0.006535[0m | [0m 0.5301  [0m | [0m 2.075   [0m | [0m 1.543   [0m | [0m 0.05408 [0m |
| [0m 62      [0m | [0m 225.0   [0m | [0m 0.008352[0m | [0m 0.5273  [0m | [0m 2.087   [0m | [0m 1.545   [0m | [0m 0.05553 [0m |
| [0m 63      [0m | [0m 55.0    [0m | [0m 0.05994 [0m | [0m 0.405   [0m | [0m 2.082   [0m | [0m 1.507   [0m | [0m 0.04087 [0m |
| [0m 64      [0m | [0m 170.0   [0m | [0m 0.02173 [0m | [0m 0.521   [0m | [0m 2.118   [0m | [0m 1.498   [0m | [0m 0.05299 [0m |
| [0m

| [0m 116     [0m | [0m 180.0   [0m | [0m 0.01673 [0m | [0m 0.5409  [0m | [0m 2.137   [0m | [0m 1.515   [0m | [0m 0.08684 [0m |
| [0m 117     [0m | [0m 75.0    [0m | [0m 0.04798 [0m | [0m 0.3992  [0m | [0m 2.113   [0m | [0m 1.535   [0m | [0m 0.04553 [0m |
| [0m 118     [0m | [0m 70.0    [0m | [0m 0.06173 [0m | [0m 0.449   [0m | [0m 2.153   [0m | [0m 1.468   [0m | [0m 0.01573 [0m |
| [0m 119     [0m | [0m 140.0   [0m | [0m 0.03584 [0m | [0m 0.4807  [0m | [0m 2.13    [0m | [0m 1.464   [0m | [0m 0.05243 [0m |
| [0m 120     [0m | [0m 5.0     [0m | [0m 0.07875 [0m | [0m 0.4181  [0m | [0m 2.116   [0m | [0m 1.522   [0m | [0m 0.0609  [0m |
| [0m 121     [0m | [0m-25.0    [0m | [0m 0.07077 [0m | [0m 0.4904  [0m | [0m 2.126   [0m | [0m 1.461   [0m | [0m 0.05771 [0m |
| [0m 122     [0m | [0m 115.0   [0m | [0m 0.03812 [0m | [0m 0.4073  [0m | [0m 2.096   [0m | [0m 1.479   [0m | [0m 0.09047 [0m |
| [0m

| [0m 174     [0m | [0m 115.0   [0m | [0m 0.0522  [0m | [0m 0.5547  [0m | [0m 2.181   [0m | [0m 1.372   [0m | [0m 0.03058 [0m |
| [0m 175     [0m | [0m 105.0   [0m | [0m 0.04373 [0m | [0m 0.4352  [0m | [0m 2.076   [0m | [0m 1.49    [0m | [0m 0.02847 [0m |
| [0m 176     [0m | [0m 110.0   [0m | [0m 0.05061 [0m | [0m 0.4586  [0m | [0m 2.129   [0m | [0m 1.451   [0m | [0m 0.0213  [0m |
| [0m 177     [0m | [0m 170.0   [0m | [0m 0.02161 [0m | [0m 0.5107  [0m | [0m 2.132   [0m | [0m 1.515   [0m | [0m 0.08394 [0m |
| [0m 178     [0m | [0m-520.0   [0m | [0m 0.9489  [0m | [0m 0.6248  [0m | [0m 1.05    [0m | [0m 1.504   [0m | [0m 0.07091 [0m |
| [0m 179     [0m | [0m-35.0    [0m | [0m 0.06448 [0m | [0m 0.5685  [0m | [0m 2.169   [0m | [0m 1.373   [0m | [0m 0.02517 [0m |
| [0m 180     [0m | [0m 100.0   [0m | [0m 0.05576 [0m | [0m 0.499   [0m | [0m 2.057   [0m | [0m 1.538   [0m | [0m 0.09896 [0m |
| [0m

In [16]:
optimization_sgd.max

{'target': 260.0,
 'params': {'alpha': 0.005869942773702552,
  'l1_ratio': 0.4650232398687586,
  'loss': 2.124838792024528,
  'penalty': 1.4489392051726289,
  'tol': 0.04443900510956935}}

### 1.2 Logistic Regression

In [17]:
def evaluateLogistic(C):
    """Objective for the Bayesian search over LogisticRegression's ``C`` (no resampling).

    Parameters
    ----------
    C : float
        Passed to ``LogisticRegression(C=...)``.

    Returns
    -------
    Sum of the per-fold test scores reported by ``cross_validate`` with the
    ``profit_scoring`` scorer on ``X_train_prepared`` / ``y_train``.
    """
    model = LogisticRegression(random_state=42, solver='liblinear', C=C)
    fold_results = cross_validate(model, X_train_prepared, y=y_train, cv=cv,
                                  scoring=profit_scoring)
    return sum(fold_results['test_score'])

In [18]:
# Search space for evaluateLogistic: (lower, upper) bound for its only argument, C.
params_logistic = {
    'C': (1,1000),
}

In [19]:
# Maximise evaluateLogistic over the bounds in params_logistic (seeded for repeatability).
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=231)
# 20 initial exploration points, then 200 optimisation iterations.
optimization_logistic.maximize(n_iter=200, init_points=20)

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 280.0   [0m | [0m 781.7   [0m |
| [95m 2       [0m | [95m 305.0   [0m | [95m 584.6   [0m |
| [0m 3       [0m | [0m 295.0   [0m | [0m 427.3   [0m |
| [0m 4       [0m | [0m 295.0   [0m | [0m 316.0   [0m |
| [0m 5       [0m | [0m 280.0   [0m | [0m 827.1   [0m |
| [0m 6       [0m | [0m 280.0   [0m | [0m 903.5   [0m |
| [95m 7       [0m | [95m 335.0   [0m | [95m 39.9    [0m |
| [0m 8       [0m | [0m 280.0   [0m | [0m 915.3   [0m |
| [0m 9       [0m | [0m 310.0   [0m | [0m 61.44   [0m |
| [0m 10      [0m | [0m 320.0   [0m | [0m 181.5   [0m |
| [0m 11      [0m | [0m 310.0   [0m | [0m 73.23   [0m |
| [0m 12      [0m | [0m 295.0   [0m | [0m 459.3   [0m |
| [0m 13      [0m | [0m 280.0   [0m | [0m 708.7   [0m |
| [0m 14      [0m | [0m 305.0   [0m | [0m 483.8   [0m |
| [0m 15      [0m | [0m 335.0   [0m | [0m 26.

| [0m 132     [0m | [0m 335.0   [0m | [0m 49.61   [0m |
| [0m 133     [0m | [0m 310.0   [0m | [0m 50.02   [0m |
| [0m 134     [0m | [0m 335.0   [0m | [0m 49.45   [0m |
| [0m 135     [0m | [0m 335.0   [0m | [0m 49.03   [0m |
| [0m 136     [0m | [0m 335.0   [0m | [0m 49.55   [0m |
| [0m 137     [0m | [0m 335.0   [0m | [0m 25.98   [0m |
| [0m 138     [0m | [0m 335.0   [0m | [0m 28.64   [0m |
| [0m 139     [0m | [0m 335.0   [0m | [0m 33.7    [0m |
| [0m 140     [0m | [0m 335.0   [0m | [0m 20.0    [0m |
| [0m 141     [0m | [0m 335.0   [0m | [0m 48.23   [0m |
| [0m 142     [0m | [0m 335.0   [0m | [0m 24.72   [0m |
| [0m 143     [0m | [0m 335.0   [0m | [0m 21.06   [0m |
| [0m 144     [0m | [0m 335.0   [0m | [0m 47.46   [0m |
| [0m 145     [0m | [0m 335.0   [0m | [0m 18.01   [0m |
| [0m 146     [0m | [0m 335.0   [0m | [0m 38.69   [0m |
| [0m 147     [0m | [0m 335.0   [0m | [0m 46.72   [0m |
| [0m 1

In [28]:
optimization_logistic.max

{'target': 335.0, 'params': {'C': 39.89916730939222}}

### 1.3 SVM

In [20]:
#...

### 1.4 RandomForest

In [21]:
#...

### 1.5 XGBoost

In [22]:
#...

### 1.6 AdaBoost

In [29]:
#...

### Bagging, Stacking, Voting, etc..

In [32]:
#...

## 2. Tune Hyperparameters of Base Classifiers (incl. sampling)

In [33]:
# import sampling classes

# oversampling
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE

# undersampling
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import AllKNN
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import TomekLinks

# combination of over and undersampling
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [34]:
# Note: use this function to evaluate sampling combined with cross-validation.
def cross_val_imbalanced(classifier, X, y, sampler):
    """Cross-validate ``classifier`` with resampling applied to the training folds only.

    For each of 10 stratified folds the training part is resampled with
    ``sampler``, the classifier is fitted on the resampled data, and the
    untouched validation part is scored with ``profit_scorer``.

    Parameters
    ----------
    classifier : estimator with ``fit`` / ``predict``
        Refitted in place on every fold.
    X : array-like or DataFrame
        Feature matrix; converted to a DataFrame for positional indexing.
    y : array-like or Series
        Labels; converted to a Series for positional indexing.
    sampler : object with ``fit_resample(X, y)``
        Resampler applied to each training fold.

    Returns
    -------
    Sum of the per-fold profits.
    """
    # `random_state` is omitted on purpose: it only matters with shuffle=True,
    # and scikit-learn >= 0.24 raises a ValueError if it is set without shuffling.
    kf = StratifiedKFold(n_splits=10)
    cross_val_profit_lst = []

    # Both inputs are indexed positionally below, so coerce both to pandas.
    X = pd.DataFrame(X)
    y = pd.Series(y)

    for train_index, test_index in kf.split(X, y):
        # Keep the validation fold apart; only the training folds are resampled.
        train, test = X.iloc[train_index], X.iloc[test_index]
        target_train, target_test = y.iloc[train_index], y.iloc[test_index]

        # `fit_resample` replaces the removed `fit_sample` alias.
        X_train_res, y_train_res = sampler.fit_resample(train, np.asarray(target_train))

        # Keep the training columns aligned with the validation columns,
        # whether the sampler returns an array or a DataFrame.
        classifier.fit(pd.DataFrame(X_train_res, columns=train.columns), y_train_res)

        # Score on the untouched validation fold.
        test_preds = classifier.predict(test)
        cross_val_profit_lst.append(profit_scorer(target_test, test_preds))

    return np.sum(cross_val_profit_lst)

### 2.1 RandomOverSampling + SGD Classifier

In [35]:
def evaluateSgd(alpha, l1_ratio, tol, penalty, loss):
    """Objective for the Bayesian search over SGDClassifier settings with random oversampling.

    ``penalty`` and ``loss`` arrive as floats and are truncated with ``int()``
    to pick one of three categorical options each.

    Parameters
    ----------
    alpha, l1_ratio, tol : float
        Passed straight through to ``SGDClassifier``.
    penalty : float
        ``int(penalty)`` of 0 selects 'l2', 1 selects 'elasticnet', anything
        else selects 'l1'.
    loss : float
        ``int(loss)`` of 0 selects 'hinge', 1 selects 'log', anything else
        selects 'modified_huber'.

    Returns
    -------
    The value returned by ``cross_val_imbalanced`` for this model on
    ``X_train_prepared`` / ``y_train`` with ``RandomOverSampler(random_state=42)``.
    """
    # Decode the categorical choices; out-of-range indices fall back to the third option.
    penalty_idx = int(penalty)
    penalty_str = ('l2', 'elasticnet')[penalty_idx] if penalty_idx in (0, 1) else 'l1'

    loss_idx = int(loss)
    loss_str = ('hinge', 'log')[loss_idx] if loss_idx in (0, 1) else 'modified_huber'

    model = SGDClassifier(
        alpha=alpha,
        l1_ratio=l1_ratio,
        tol=tol,
        penalty=penalty_str,
        loss=loss_str,
        random_state=231,
        n_jobs=-1,
    )

    oversampler = RandomOverSampler(random_state=42)
    return cross_val_imbalanced(model, X_train_prepared, y_train, oversampler)

In [36]:
# Search space for evaluateSgd: one (lower, upper) bound per argument.
# 'penalty' and 'loss' are categorical choices encoded as floats; evaluateSgd
# truncates them with int(), so each unit interval maps to one option.
params_sgd = {
    'alpha': (1e-6, 1),
    'l1_ratio': (0, 1),
    'tol': (1e-9, 1e-1),
    'penalty': (0, 3),
    'loss': (0, 3)
}

In [37]:
# Maximise the oversampling variant of evaluateSgd over params_sgd.
# NOTE(review): this rebinds `optimization_sgd`, replacing the optimiser from the non-sampling section.
optimization_sgd = BayesianOptimization(evaluateSgd, params_sgd, random_state=231)
# 20 initial exploration points, then 200 optimisation iterations.
optimization_sgd.maximize(n_iter=200, init_points=20)

|   iter    |  target   |   alpha   | l1_ratio  |   loss    |  penalty  |    tol    |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-5.655e+0[0m | [0m 0.7815  [0m | [0m 0.5842  [0m | [0m 1.28    [0m | [0m 0.946   [0m | [0m 0.08269 [0m |
| [95m 2       [0m | [95m-3.605e+0[0m | [95m 0.9034  [0m | [95m 0.03894 [0m | [95m 2.746   [0m | [95m 0.1815  [0m | [95m 0.01807 [0m |
| [95m 3       [0m | [95m-480.0   [0m | [95m 0.0723  [0m | [95m 0.4588  [0m | [95m 2.125   [0m | [95m 1.45    [0m | [95m 0.002545[0m |
| [0m 4       [0m | [0m-2.885e+0[0m | [0m 0.0669  [0m | [0m 0.4259  [0m | [0m 0.6577  [0m | [0m 2.43    [0m | [0m 0.01401 [0m |
| [0m 5       [0m | [0m-4.975e+0[0m | [0m 0.7512  [0m | [0m 0.2791  [0m | [0m 2.847   [0m | [0m 2.655   [0m | [0m 0.06102 [0m |
| [0m 6       [0m | [0m-5.1e+03 [0m | [0m 0.7791  [0m | [0m 0.04156 [0m | [0m 2.359   [0m | [

| [0m 58      [0m | [0m-130.0   [0m | [0m 0.04729 [0m | [0m 0.4817  [0m | [0m 2.173   [0m | [0m 1.4     [0m | [0m 0.003144[0m |
| [0m 59      [0m | [0m-465.0   [0m | [0m 0.07584 [0m | [0m 0.4617  [0m | [0m 2.138   [0m | [0m 1.449   [0m | [0m 0.006097[0m |
| [0m 60      [0m | [0m-755.0   [0m | [0m 0.07852 [0m | [0m 0.9549  [0m | [0m 2.681   [0m | [0m 0.8793  [0m | [0m 0.08541 [0m |
| [95m 61      [0m | [95m 155.0   [0m | [95m 0.00266 [0m | [95m 0.4962  [0m | [95m 2.173   [0m | [95m 1.417   [0m | [95m 0.03833 [0m |
| [0m 62      [0m | [0m-705.0   [0m | [0m 0.07468 [0m | [0m 0.9893  [0m | [0m 2.657   [0m | [0m 0.8706  [0m | [0m 0.05708 [0m |
| [0m 63      [0m | [0m-275.0   [0m | [0m 0.08822 [0m | [0m 0.5178  [0m | [0m 2.126   [0m | [0m 1.422   [0m | [0m 0.004752[0m |
| [0m 64      [0m | [0m-575.0   [0m | [0m 0.1011  [0m | [0m 0.466   [0m | [0m 2.171   [0m | [0m 1.416   [0m | [0m 0.02991 [0m |

| [0m 116     [0m | [0m 75.0    [0m | [0m 0.03049 [0m | [0m 0.4774  [0m | [0m 2.185   [0m | [0m 1.394   [0m | [0m 0.02224 [0m |
| [0m 117     [0m | [0m-830.0   [0m | [0m 0.08645 [0m | [0m 0.9738  [0m | [0m 2.652   [0m | [0m 0.8549  [0m | [0m 0.03615 [0m |
| [0m 118     [0m | [0m-225.0   [0m | [0m 0.08813 [0m | [0m 0.4819  [0m | [0m 2.17    [0m | [0m 1.411   [0m | [0m 0.01065 [0m |
| [0m 119     [0m | [0m-25.0    [0m | [0m 0.03584 [0m | [0m 0.4807  [0m | [0m 2.13    [0m | [0m 1.464   [0m | [0m 0.05243 [0m |
| [0m 120     [0m | [0m-365.0   [0m | [0m 0.07907 [0m | [0m 0.4615  [0m | [0m 2.1     [0m | [0m 1.418   [0m | [0m 0.04586 [0m |
| [0m 121     [0m | [0m-315.0   [0m | [0m 0.07077 [0m | [0m 0.4904  [0m | [0m 2.126   [0m | [0m 1.461   [0m | [0m 0.05771 [0m |
| [0m 122     [0m | [0m-205.0   [0m | [0m 0.05088 [0m | [0m 0.4599  [0m | [0m 2.179   [0m | [0m 1.405   [0m | [0m 0.05159 [0m |
| [0m

| [0m 174     [0m | [0m-280.0   [0m | [0m 0.05741 [0m | [0m 0.4467  [0m | [0m 2.116   [0m | [0m 1.403   [0m | [0m 0.03389 [0m |
| [0m 175     [0m | [0m-600.0   [0m | [0m 0.1011  [0m | [0m 0.4952  [0m | [0m 2.202   [0m | [0m 1.4     [0m | [0m 0.02511 [0m |
| [0m 176     [0m | [0m-180.0   [0m | [0m 0.05061 [0m | [0m 0.4586  [0m | [0m 2.129   [0m | [0m 1.451   [0m | [0m 0.0213  [0m |
| [0m 177     [0m | [0m-430.0   [0m | [0m 0.06643 [0m | [0m 0.4836  [0m | [0m 2.119   [0m | [0m 1.431   [0m | [0m 0.003301[0m |
| [0m 178     [0m | [0m 145.0   [0m | [0m 0.001408[0m | [0m 0.4322  [0m | [0m 2.18    [0m | [0m 1.38    [0m | [0m 0.02226 [0m |
| [0m 179     [0m | [0m-600.0   [0m | [0m 0.1045  [0m | [0m 0.4898  [0m | [0m 2.239   [0m | [0m 1.444   [0m | [0m 0.03373 [0m |
| [0m 180     [0m | [0m-365.0   [0m | [0m 0.07745 [0m | [0m 0.4614  [0m | [0m 2.245   [0m | [0m 1.487   [0m | [0m 0.03729 [0m |
| [0m

In [39]:
optimization_sgd.max

{'target': 195.0,
 'params': {'alpha': 0.004259170002503944,
  'l1_ratio': 0.4571157730347052,
  'loss': 2.1384069293948227,
  'penalty': 1.4594434596038075,
  'tol': 0.038444190700582276}}

### 2.2 BorderlineSMOTE + LogisticRegression

In [52]:
def evaluateLogistic(C, k_neighbors, m_neighbors):
    """Objective for the joint search over LogisticRegression's ``C`` and BorderlineSMOTE's neighbour counts.

    Parameters
    ----------
    C : float
        Passed to ``LogisticRegression(C=...)``.
    k_neighbors, m_neighbors : float
        Truncated with ``int()`` and passed to ``BorderlineSMOTE``.

    Returns
    -------
    The value returned by ``cross_val_imbalanced`` for this model / sampler
    pair on ``X_train_prepared`` / ``y_train``.
    """
    model = LogisticRegression(random_state=42, solver='liblinear', C=C)

    # The optimiser proposes floats; the sampler receives integer neighbour counts.
    sampler = BorderlineSMOTE(
        random_state=42,
        k_neighbors=int(k_neighbors),
        m_neighbors=int(m_neighbors),
    )

    return cross_val_imbalanced(model, X_train_prepared, y_train, sampler)

In [53]:
# Search space for the sampling variant of evaluateLogistic: (lower, upper) bounds.
# 'k_neighbors' and 'm_neighbors' are truncated with int() inside evaluateLogistic
# before being passed to BorderlineSMOTE.
params_logistic = {
    'C': (1,500),
    'k_neighbors': (2, 15),
    'm_neighbors': (2, 15),
}

In [54]:
# Maximise the sampling variant of evaluateLogistic over params_logistic.
# NOTE(review): this rebinds `optimization_logistic`, replacing the optimiser from the non-sampling section.
optimization_logistic = BayesianOptimization(evaluateLogistic, params_logistic, random_state=42)
# 50 initial exploration points, then 150 optimisation iterations.
optimization_logistic.maximize(n_iter=150, init_points=50)

|   iter    |  target   |     C     | k_neig... | m_neig... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 125.0   [0m | [0m 187.9   [0m | [0m 14.36   [0m | [0m 11.52   [0m |
| [0m 2       [0m | [0m 110.0   [0m | [0m 299.7   [0m | [0m 4.028   [0m | [0m 4.028   [0m |
| [0m 3       [0m | [0m-65.0    [0m | [0m 29.98   [0m | [0m 13.26   [0m | [0m 9.814   [0m |
| [95m 4       [0m | [95m 180.0   [0m | [95m 354.3   [0m | [95m 2.268   [0m | [95m 14.61   [0m |
| [0m 5       [0m | [0m 110.0   [0m | [0m 416.4   [0m | [0m 4.76    [0m | [0m 4.364   [0m |
| [0m 6       [0m | [0m-25.0    [0m | [0m 92.52   [0m | [0m 5.955   [0m | [0m 8.822   [0m |
| [0m 7       [0m | [0m 50.0    [0m | [0m 216.5   [0m | [0m 5.786   [0m | [0m 9.954   [0m |
| [0m 8       [0m | [0m 0.0     [0m | [0m 70.61   [0m | [0m 5.798   [0m | [0m 6.763   [0m |
| [0m 9       [0m | [0m 75.0    [0m | [0m 228.6   

| [0m 81      [0m | [0m 180.0   [0m | [0m 252.5   [0m | [0m 2.87    [0m | [0m 5.794   [0m |
| [0m 82      [0m | [0m 125.0   [0m | [0m 182.4   [0m | [0m 14.71   [0m | [0m 14.63   [0m |
| [0m 83      [0m | [0m 125.0   [0m | [0m 409.3   [0m | [0m 13.2    [0m | [0m 2.043   [0m |
| [0m 84      [0m | [0m 125.0   [0m | [0m 182.5   [0m | [0m 14.7    [0m | [0m 14.67   [0m |
| [0m 85      [0m | [0m 205.0   [0m | [0m 251.5   [0m | [0m 2.342   [0m | [0m 4.65    [0m |
| [0m 86      [0m | [0m 205.0   [0m | [0m 251.7   [0m | [0m 2.866   [0m | [0m 5.619   [0m |
| [0m 87      [0m | [0m 205.0   [0m | [0m 251.9   [0m | [0m 2.839   [0m | [0m 5.999   [0m |
| [0m 88      [0m | [0m 180.0   [0m | [0m 252.1   [0m | [0m 2.741   [0m | [0m 5.556   [0m |
| [0m 89      [0m | [0m 205.0   [0m | [0m 251.0   [0m | [0m 2.355   [0m | [0m 5.915   [0m |
| [0m 90      [0m | [0m 205.0   [0m | [0m 252.4   [0m | [0m 2.411   [0m | 

| [0m 162     [0m | [0m 205.0   [0m | [0m 252.7   [0m | [0m 2.353   [0m | [0m 5.901   [0m |
| [0m 163     [0m | [0m 205.0   [0m | [0m 251.5   [0m | [0m 2.295   [0m | [0m 4.684   [0m |
| [0m 164     [0m | [0m 205.0   [0m | [0m 251.8   [0m | [0m 2.056   [0m | [0m 5.49    [0m |
| [0m 165     [0m | [0m 205.0   [0m | [0m 252.6   [0m | [0m 2.34    [0m | [0m 6.02    [0m |
| [0m 166     [0m | [0m 205.0   [0m | [0m 252.3   [0m | [0m 2.851   [0m | [0m 5.578   [0m |
| [0m 167     [0m | [0m 205.0   [0m | [0m 251.8   [0m | [0m 2.74    [0m | [0m 6.202   [0m |
| [0m 168     [0m | [0m 205.0   [0m | [0m 251.2   [0m | [0m 2.158   [0m | [0m 5.889   [0m |
| [0m 169     [0m | [0m 205.0   [0m | [0m 251.8   [0m | [0m 2.926   [0m | [0m 6.06    [0m |
| [0m 170     [0m | [0m 180.0   [0m | [0m 251.6   [0m | [0m 2.664   [0m | [0m 5.494   [0m |
| [0m 171     [0m | [0m 205.0   [0m | [0m 354.1   [0m | [0m 2.285   [0m | 

In [55]:
optimization_logistic.max

{'target': 205.0,
 'params': {'C': 251.83683259120187,
  'k_neighbors': 2.6692237662498615,
  'm_neighbors': 5.622404035075949}}

In [None]:
#...