In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt


from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB
from sklearn.ensemble import StackingClassifier

from IPython.display import display

# Notebook display settings: never truncate columns, show up to 200 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [3]:
# Sourced from the UCI website
from ucimlrepo import fetch_ucirepo 
  
# Fetch dataset id 94 from the UCI repository (bound to `spambase`).
spambase = fetch_ucirepo(id=94) 
  
full_X = spambase.data.features
full_y = spambase.data.targets 

# Re-attach the target to the features so exact duplicate rows
# (features + label) can be dropped before splitting.
full = full_X.join(full_y).drop_duplicates()
# 80/20 split, stratified on the 'Class' column, with a fixed seed.
train, test = train_test_split(full, test_size=0.20, stratify=full['Class'], random_state=123)

# NOTE: `pop` removes 'Class' from `train`/`test` in place, so after this cell
# those frames hold features only. Indices are reset to 0..n-1.
y = train.pop('Class').reset_index(drop=True)
X = train.reset_index(drop=True)

test_y = test.pop('Class').reset_index(drop=True)
test_X = test.reset_index(drop=True)

In [4]:
# Flag a row as an outlier when ANY feature exceeds the 99th percentile of
# that feature's strictly positive values.
# The mask starts as an all-False boolean Series aligned on X's own index, so
# the `|=` accumulation and the `~` negation below are plain boolean
# operations and do not depend on pandas coercing an integer Series to bool.
outliers = pd.Series(False, index=X.index)
for col in X.columns:
    q = X.loc[X[col] > 0, col].quantile(0.99)
    outliers |= X[col] > q

# Outlier-free views of the training features and labels.
X2 = X[~outliers]
y2 = y[~outliers]

# Number of rows flagged as outliers (displayed as the cell output).
outliers.sum()

377

In [5]:
def capital_log_transform(X):
    """Return a copy of ``X`` with every 'capital' column log-transformed.

    Columns are selected by name (any label containing ``'capital'``) and
    replaced by their natural logarithm; all other columns are untouched.
    The input frame is not modified.
    """
    result = X.copy()
    capital_cols = result.filter(like='capital').columns
    logged = np.log(result[capital_cols])
    result[capital_cols] = logged
    return result
# Pipeline-ready wrapper around capital_log_transform.
CapitalLogTransformer = FunctionTransformer(func=capital_log_transform)

def freq_percent_transform(X):
    """Return a copy of ``X`` with every 'freq' column divided by 100.

    Columns are selected by name (any label containing ``'freq'``); this
    rescales percentage values to the [0, 1] range. The input frame is not
    modified.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    result[freq_cols] = result[freq_cols] / 100
    return result
# Pipeline-ready wrapper around freq_percent_transform. It must wrap the
# percentage rescaler, not capital_log_transform, otherwise this transformer
# silently applies the capital-column log transform instead.
FreqPercentTransformer = FunctionTransformer(freq_percent_transform)

def freq_sqrt_transform(X):
    """Return a copy of ``X`` with every 'freq' column square-rooted.

    Columns are selected by name (any label containing ``'freq'``); all
    other columns are untouched and the input frame is not modified.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    rooted = np.sqrt(result[freq_cols])
    result[freq_cols] = rooted
    return result
# Pipeline-ready wrapper around freq_sqrt_transform.
FreqSqrtTransformer = FunctionTransformer(func=freq_sqrt_transform)

def freq_log_transform(X):
    """Return a copy of ``X`` with every 'freq' column log-transformed.

    Columns are selected by name (any label containing ``'freq'``). A small
    offset of 0.001 is added before taking the natural log so that zero
    values stay finite. The input frame is not modified.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    shifted = result[freq_cols] + 0.001
    result[freq_cols] = np.log(shifted)
    return result
# Pipeline-ready wrapper around freq_log_transform.
FreqLogTransformer = FunctionTransformer(func=freq_log_transform)

def freq_zero_transform(X):
    """Return a copy of ``X`` with one boolean indicator per 'freq' column.

    For the i-th column whose name contains ``'freq'`` a new column
    ``zero_i<i>`` is appended that is True where the value is > 0 (i.e. the
    flag marks NON-zero entries). The indicator names deliberately avoid the
    substring 'freq' so later name-based transform steps leave them alone.
    The input frame is not modified.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    flag_names = [f'zero_i{position}' for position in range(len(freq_cols))]
    flags = result[freq_cols] > 0
    flags.columns = flag_names
    result[flag_names] = flags
    return result
# Pipeline-ready wrapper around freq_zero_transform.
FreqZeroTransformer = FunctionTransformer(func=freq_zero_transform)

In [6]:
def wrap_model(scaler=StandardScaler, transformers=(), poly=False):
    """Build a factory that wraps an estimator in a preprocessing pipeline.

    Parameters
    ----------
    scaler : callable or None
        Zero-argument factory (typically a scaler class) that is called to
        create a fresh scaling step for each pipeline. Any falsy value
        skips scaling.
    transformers : iterable
        Transformer steps placed at the front of the pipeline, in order.
        The default is an immutable empty tuple so no mutable object is
        shared between calls.
    poly : bool
        When true, add degree-2 interaction-only polynomial features after
        the scaler.

    Returns
    -------
    callable
        ``inner(model)`` returning a pipeline of
        transformers -> scaler -> polynomial features -> model.
    """
    def inner(model):
        steps = list(transformers)
        if scaler:
            steps.append(scaler())
        if poly:
            steps.append(PolynomialFeatures(2, interaction_only=True))
        return make_pipeline(*steps, model)
    return inner

# Default preprocessing: StandardScaler only.
default = wrap_model()

def test_model(model, X, y, preprocess=default, metric='accuracy', splits=5):
    """Cross-validate ``model`` and return the per-fold scores.

    Parameters
    ----------
    model : estimator to evaluate.
    X, y : training features and labels.
    preprocess : callable or None
        Factory mapping the model to a pipeline; a falsy value evaluates
        the bare model.
    metric : scoring name passed to ``cross_val_score``.
    splits : number of stratified folds.

    Returns
    -------
    The array of fold scores produced by ``cross_val_score``.
    """
    estimator = preprocess(model) if preprocess else model

    # Shuffled stratified folds with a fixed seed so runs are comparable.
    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)
    return cross_val_score(estimator, X, y, cv=folds, scoring=metric, n_jobs=-1)

def test_models(models, X, y, metric='accuracy', splits=5, preprocess=default):
    """Cross-validate each model and print its mean score.

    Parameters
    ----------
    models : iterable of estimators to evaluate.
    X, y : training features and labels.
    metric : scoring name forwarded to ``test_model``.
    splits : number of cross-validation folds forwarded to ``test_model``.
    preprocess : pipeline factory (or falsy for none) forwarded to
        ``test_model``.

    Prints one ``<model repr>: <mean score>`` line per model; returns None.
    """
    for model in models:
        # `splits` must be forwarded explicitly; otherwise test_model always
        # falls back to its own default and this argument has no effect.
        scores = test_model(model, X, y, preprocess=preprocess, metric=metric, splits=splits)
        print(f'{model}: {scores.mean()}')

def baseline_models():
    """Return a fresh list of untuned reference classifiers.

    Order: dummy, linear, tree ensembles, SVM, KNN. New instances are
    created on every call.
    """
    seed = 42

    # Dummy
    chance = [DummyClassifier(strategy='uniform')]
    # Linear
    linear = [LogisticRegression(max_iter=500)]
    # Ensembles
    ensembles = [
        RandomForestClassifier(random_state=seed),
        XGBClassifier(random_state=seed),
        ExtraTreesClassifier(random_state=seed),
        HistGradientBoostingClassifier(random_state=seed),
    ]
    # SVM and KNN
    distance_based = [SVC(), KNeighborsClassifier()]

    return chance + linear + ensembles + distance_based

In [60]:
# Baseline scores: every reference model with the default preprocessing
print('---- Standard')
test_models(models=baseline_models(), X=X, y=y)

---- Standard
DummyClassifier(strategy='uniform'): 0.49643167358168616
LogisticRegression(max_iter=500): 0.923990634961927
RandomForestClassifier(random_state=42): 0.9438834043941606
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9510077116062099
ExtraTreesClassifier(random_state=42): 0.

In [61]:
# Feature transforms reused by the later ensemble experiments
best_transformers = [CapitalLogTransformer, FreqSqrtTransformer]

# Baseline reference: same models, now with those transforms in the pipeline
test_models(
    baseline_models(),
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

DummyClassifier(strategy='uniform'): 0.4952570755860865
LogisticRegression(max_iter=500): 0.9409129589375709
RandomForestClassifier(random_state=42): 0.9453684066648738
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9510077116062099
ExtraTreesClassifier(random_state=42): 0.95130665208707

In [33]:
from sklearn.ensemble import StackingClassifier
# One StackingClassifier per candidate set of base estimators; members are
# named l0, l1, ... in list order.
models = [
    StackingClassifier(
        estimators=[(f'l{idx}', member) for idx, member in enumerate(members)]
    )
    for members in [
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            KNeighborsClassifier(),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            RandomForestClassifier(random_state=42),
        ],
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)


StackingClassifier(estimators=[('l0',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction_constraints=None,
                                              learning_rate=

In [74]:
from sklearn.ensemble import StackingClassifier
# Same candidate stacks as before, evaluated with the zero-flag transformer
# added to the preprocessing. Members are named l0, l1, ... in list order.
models = [
    StackingClassifier(
        estimators=[(f'l{idx}', member) for idx, member in enumerate(members)]
    )
    for members in [
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            KNeighborsClassifier(),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            RandomForestClassifier(random_state=42),
        ],
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer]
    ),
)


StackingClassifier(estimators=[('l0',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction_constraints=None,
                                              learning_rate=

In [91]:
# Single stack containing two ExtraTrees members that differ only in seed
models = [
    StackingClassifier(estimators=[
        ('l0', LogisticRegression()),
        ('l1', XGBClassifier(random_state=42)),
        ('l2', ExtraTreesClassifier(random_state=42)),
        ('l3', ExtraTreesClassifier(random_state=43)),
        ('l4', SVC()),
        ('l5', HistGradientBoostingClassifier(random_state=42)),
    ]),
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer]
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression()),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction...
              

In [94]:
# Sweep the stack's final estimator over every baseline model; each stack
# gets its own fresh base-estimator instances.
models = [
    StackingClassifier(
        estimators=[
            ('l0', LogisticRegression()),
            ('l1', XGBClassifier(random_state=42)),
            ('l2', ExtraTreesClassifier(random_state=42)),
            ('l3', SVC()),
            ('l4', HistGradientBoostingClassifier(random_state=42)),
        ],
        final_estimator=meta_model,
        n_jobs=-1,
    )
    for meta_model in baseline_models()
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer]
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression()),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction...
              

In [95]:
# Try other final estimators, this time with passthrough=True
models = [
    StackingClassifier(
        estimators=[
            ('l0', LogisticRegression()),
            ('l1', XGBClassifier(random_state=42)),
            ('l2', ExtraTreesClassifier(random_state=42)),
            ('l3', SVC()),
            ('l4', HistGradientBoostingClassifier(random_state=42)),
        ],
        final_estimator=meta_model,
        n_jobs=-1,
        passthrough=True,
    )
    for meta_model in baseline_models()
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer]
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression()),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction...
              

In [98]:
# Try polynomial (pairwise interaction) features ahead of the stack
models = [
    StackingClassifier(
        estimators=[
            ('l0', LogisticRegression(max_iter=500)),
            ('l1', XGBClassifier(random_state=42)),
            ('l2', ExtraTreesClassifier(random_state=42)),
            ('l3', SVC()),
            ('l4', HistGradientBoostingClassifier(random_state=42)),
        ],
        n_jobs=-1,
    )
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        poly=True,
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer],
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression(max_iter=500)),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None...
                                              max_cat_to_oneh

In [23]:
# Try adding naive Bayes as an extra stack member
models = [
    StackingClassifier(
        estimators=[
            ('l0', LogisticRegression(max_iter=500)),
            ('l1', XGBClassifier(random_state=42)),
            ('l2', ExtraTreesClassifier(random_state=42)),
            ('l3', SVC()),
            ('l4', HistGradientBoostingClassifier(random_state=42)),
            ('l5', MultinomialNB(alpha=0.01)),
        ],
        n_jobs=-1,
    )
]

# MinMaxScaler replaces the default StandardScaler here (presumably because
# MultinomialNB expects non-negative inputs; confirm).
test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        scaler=MinMaxScaler,
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer],
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression(max_iter=500)),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None...
                                              max_delta_step=

In [25]:
# Best result: five-member stack with the default final estimator
models = [
    StackingClassifier(
        estimators=[
            ('l0', LogisticRegression(max_iter=500)),
            ('l1', XGBClassifier(random_state=42)),
            ('l2', ExtraTreesClassifier(random_state=42)),
            ('l3', SVC()),
            ('l4', HistGradientBoostingClassifier(random_state=42)),
        ],
        n_jobs=1,
    )
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer]
    ),
)

StackingClassifier(estimators=[('l0', LogisticRegression(max_iter=500)),
                               ('l1',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None...
                                              max_cat_to_oneh

# Voting

In [42]:
from sklearn.ensemble import VotingClassifier
# One VotingClassifier per candidate set of base estimators; members are
# named l0, l1, ... in list order.
models = [
    VotingClassifier(
        estimators=[(f'l{idx}', member) for idx, member in enumerate(members)]
    )
    for members in [
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            XGBClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            HistGradientBoostingClassifier(random_state=42),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            KNeighborsClassifier(),
        ],
        [
            LogisticRegression(),
            XGBClassifier(random_state=42),
            ExtraTreesClassifier(random_state=42),
            SVC(),
            RandomForestClassifier(random_state=42),
        ],
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)


VotingClassifier(estimators=[('l0',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None, max_bin=None,
                                            max_cat_th

# AdaBoost

In [54]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over several base estimators (None = the library default);
# the SVC variants are built with probability=True.
models = [
    AdaBoostClassifier(estimator=base)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(probability=True),
        SVC(probability=True, kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

AdaBoostClassifier(): 0.9346793003558185
AdaBoostClassifier(estimator=LogisticRegression(max_iter=500)): 0.9382432176224974
AdaBoostClassifier(estimator=SVC(probability=True)): 0.8242379883686579
AdaBoostClassifier(estimator=SVC(kernel='linear', probability=True)): 0.8755966684450245


In [73]:
# Same AdaBoost comparison with 100 boosting rounds
models = [
    AdaBoostClassifier(estimator=base, n_estimators=100)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(probability=True),
        SVC(probability=True, kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

AdaBoostClassifier(n_estimators=100): 0.932898444010388
AdaBoostClassifier(estimator=LogisticRegression(max_iter=500), n_estimators=100): 0.9394297203275119
AdaBoostClassifier(estimator=SVC(probability=True), n_estimators=100): 0.8064276612536982
AdaBoostClassifier(estimator=SVC(kernel='linear', probability=True),
                   n_estimators=100): 0.8438141807134889


# Bagging

In [55]:
from sklearn.ensemble import BaggingClassifier

# Bagging over several base estimators (None = the library default)
models = [
    BaggingClassifier(estimator=base)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(),
        SVC(kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

BaggingClassifier(): 0.9275532294831151
BaggingClassifier(estimator=LogisticRegression(max_iter=500)): 0.9426955789436555
BaggingClassifier(estimator=SVC(probability=True)): 0.9459609966446356
BaggingClassifier(estimator=SVC(kernel='linear', probability=True)): 0.9421012253032395


In [62]:
# Same bagging comparison with 50 estimators per ensemble
models = [
    BaggingClassifier(estimator=base, n_estimators=50)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(),
        SVC(kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

BaggingClassifier(n_estimators=50): 0.9340889149518741
BaggingClassifier(estimator=LogisticRegression(max_iter=500), n_estimators=50): 0.9421007843880759
BaggingClassifier(estimator=SVC(), n_estimators=50): 0.9447749348547847
BaggingClassifier(estimator=SVC(kernel='linear'), n_estimators=50): 0.9406162230325263


In [67]:
# Bagging where each estimator sees 80% of the features
models = [
    BaggingClassifier(estimator=base, max_features=0.8, bootstrap=True)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(),
        SVC(kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

BaggingClassifier(max_features=0.8): 0.9328966803497339
BaggingClassifier(estimator=LogisticRegression(max_iter=500), max_features=0.8): 0.9415086353234775
BaggingClassifier(estimator=SVC(), max_features=0.8): 0.9382436585376608
BaggingClassifier(estimator=SVC(kernel='linear'), max_features=0.8): 0.942991433018373


In [70]:
# Feature-subsampled bagging (80% of features) with 50 estimators
models = [
    BaggingClassifier(estimator=base, max_features=0.8, bootstrap=True, n_estimators=50)
    for base in [
        None,
        LogisticRegression(max_iter=500),
        SVC(),
        SVC(kernel='linear'),
    ]
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

BaggingClassifier(max_features=0.8, n_estimators=50): 0.9397277789780469
BaggingClassifier(estimator=LogisticRegression(max_iter=500), max_features=0.8,
                  n_estimators=50): 0.9412105766729424
BaggingClassifier(estimator=SVC(), max_features=0.8, n_estimators=50): 0.9432899325840716
BaggingClassifier(estimator=SVC(kernel='linear'), max_features=0.8,
                  n_estimators=50): 0.9400227512224374
