In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt


from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin

from IPython.display import display

from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)
# Display up to 200 rows before pandas truncates the output.
pd.set_option('display.max_rows', 200)

In [2]:
# Sourced from the UCI website (dataset id 94 is requested below)
from ucimlrepo import fetch_ucirepo

spambase = fetch_ucirepo(id=94)

full_X = spambase.data.features
full_y = spambase.data.targets

# Combine features and target, then drop exact duplicate rows before splitting.
full = full_X.join(full_y).drop_duplicates()
train, test = train_test_split(
    full,
    test_size=0.20,
    stratify=full['Class'],
    random_state=123,
)

# pop() removes the 'Class' column from each split in place, so the frames
# that remain hold features only. Indices are reset to 0..n-1.
y, test_y = (part.pop('Class').reset_index(drop=True) for part in (train, test))
X, test_X = (part.reset_index(drop=True) for part in (train, test))

In [3]:
# Flag a row as an outlier when any feature exceeds the 99th percentile of
# that feature's strictly positive values.
# The mask starts as a boolean Series on X's own index, so `|=` and `~` are
# logical operations from the first iteration and alignment with X is explicit.
outliers = pd.Series(False, index=X.index)
for col in X.columns:
    # Quantile over non-zero entries only. If a column has no positive
    # values the quantile is NaN and the comparison below is all False.
    q = X.loc[X[col] > 0, col].quantile(0.99)
    outliers |= X[col] > q

# Outlier-free copies of the training data.
X2 = X[~outliers]
y2 = y[~outliers]

# Number of rows flagged as outliers.
outliers.sum()

377

In [4]:
# Log transform capital columns
def capital_log_transform(X):
    """Return a copy of ``X`` with each 'capital' column replaced by its natural log.

    Columns are selected by substring match on the name; all other columns
    are passed through unchanged and the input frame is not modified.
    NOTE(review): np.log of values <= 0 gives -inf/NaN — assumes the capital
    columns are strictly positive; confirm against the data.
    """
    capital_cols = X.filter(like='capital').columns
    logged = {name: np.log(X[name]) for name in capital_cols}
    return X.assign(**logged)
# Pipeline-compatible wrapper around capital_log_transform.
CapitalLogTransformer = FunctionTransformer(func=capital_log_transform)

# Convert percentages to [0,1] range
def freq_percent_transform(X):
    """Return a copy of ``X`` with every 'freq' column divided by 100.

    Columns are selected by substring match on the name; the input frame
    is left untouched.
    """
    scaled = X.copy()
    freq_cols = scaled.filter(like='freq').columns
    scaled[freq_cols] = scaled[freq_cols] / 100
    return scaled
# Pipeline-compatible wrapper around freq_percent_transform (percent -> fraction).
FreqPercentTransformer = FunctionTransformer(freq_percent_transform)

# Sqrt transform freq cols
def freq_sqrt_transform(X):
    """Return a copy of ``X`` with every 'freq' column replaced by its square root.

    Columns are selected by substring match on the name; the input frame
    is left untouched.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    rooted = np.sqrt(result[freq_cols])
    result[freq_cols] = rooted
    return result
# Pipeline-compatible wrapper around freq_sqrt_transform.
FreqSqrtTransformer = FunctionTransformer(func=freq_sqrt_transform)

# log transform freq cols
def freq_log_transform(X):
    """Return a copy of ``X`` with every 'freq' column replaced by ``log(value + 0.001)``.

    The 0.001 offset keeps zero frequencies finite under the logarithm.
    Columns are selected by substring match on the name; the input frame
    is left untouched.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    shifted = result[freq_cols] + 0.001
    result[freq_cols] = np.log(shifted)
    return result
# Pipeline-compatible wrapper around freq_log_transform.
FreqLogTransformer = FunctionTransformer(func=freq_log_transform)

# Add zero/non-zero indicator columns for the freq values
def freq_zero_transform(X):
    """Return a copy of ``X`` with one boolean indicator column per 'freq' column.

    The i-th indicator is named ``zero_i<i>`` and is True where the i-th
    'freq' column is strictly greater than 0 (so, despite the name, it flags
    the non-zero entries). The original columns are kept unchanged.
    """
    result = X.copy()
    freq_cols = result.filter(like='freq').columns
    # The indicator names deliberately avoid the substring 'freq' so that
    # later name-based freq transforms do not pick them up.
    indicator_names = ['zero_i' + str(position) for position in range(len(freq_cols))]
    result[indicator_names] = result[freq_cols] > 0
    return result
# Pipeline-compatible wrapper around freq_zero_transform.
FreqZeroTransformer = FunctionTransformer(func=freq_zero_transform)

In [5]:
def wrap_model(scaler=StandardScaler, transformers=(), poly=False):
    """Build a factory that wraps an estimator in a preprocessing pipeline.

    Parameters
    ----------
    scaler : callable or None
        Zero-argument callable (typically a scaler class) producing the
        scaling step; a fresh instance is created for every pipeline.
        A falsy value skips scaling.
    transformers : iterable
        Transformer objects placed, in order, at the front of the pipeline.
        Defaults to an empty tuple (immutable, so no state is shared
        between calls).
    poly : bool
        When true, add degree-2 interaction-only polynomial features after
        the scaler.

    Returns
    -------
    callable
        ``inner(model)`` returning
        ``make_pipeline(*transformers, [scaler()], [PolynomialFeatures], model)``.
    """
    def inner(model):
        steps = list(transformers)
        if scaler:
            steps.append(scaler())
        if poly:
            steps.append(PolynomialFeatures(2, interaction_only=True))
        return make_pipeline(*steps, model)
    return inner

# Default preprocessing: standard scaling only.
default = wrap_model()

def test_model(model, X, y, preprocess=default, metric='accuracy', splits=5):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    Parameters
    ----------
    model : estimator passed to ``preprocess`` (or used directly).
    X, y : features and target handed to ``cross_val_score``.
    preprocess : callable or falsy
        Factory mapping a model to a pipeline; a falsy value means the bare
        model is evaluated.
    metric : scoring name forwarded to ``cross_val_score``.
    splits : number of stratified folds.

    Returns
    -------
    The array of scores returned by ``cross_val_score``, one per fold.
    """
    estimator = preprocess(model) if preprocess else model

    # Shuffled stratified folds with a fixed seed, so the same folds are
    # used on every call.
    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)
    return cross_val_score(estimator, X, y, cv=folds, scoring=metric, n_jobs=-1)

def test_models(models, X, y, metric='accuracy', splits=5, preprocess=default):
    """Cross-validate each model and print its mean score.

    Parameters
    ----------
    models : iterable of estimators to evaluate.
    X, y : features and target forwarded to ``test_model``.
    metric : scoring name forwarded to ``test_model``.
    splits : number of CV folds forwarded to ``test_model``.
    preprocess : pipeline factory (or falsy for none) forwarded to ``test_model``.

    Prints one ``<model repr>: <mean score>`` line per model; returns None.
    """
    for model in models:
        # Forward `splits` too, so the caller's fold count is honoured.
        scores = test_model(model, X, y, preprocess=preprocess, metric=metric, splits=splits)
        result = scores.mean()
        print(f'{model}: {result}')

def baseline_models():
    """Return a fresh list of reference classifiers for baseline comparisons.

    A new set of unfitted estimators is created on every call. Every
    estimator that takes a seed here is given ``random_state=42``, including
    the uniform-random dummy, whose score would otherwise change between runs.
    """
    return [
        # Dummy (seeded: the 'uniform' strategy draws random predictions)
        DummyClassifier(strategy='uniform', random_state=42),
        # Linear
        LogisticRegression(max_iter=500),
        # Ensembles
        RandomForestClassifier(random_state=42),
        XGBClassifier(random_state=42),
        ExtraTreesClassifier(random_state=42),
        HistGradientBoostingClassifier(random_state=42),
        # SVM
        SVC(),
        # KNN
        KNeighborsClassifier(),
    ]

In [6]:
# Baseline scores: every reference model with the default preprocessing
print('---- Standard')
test_models(models=baseline_models(), X=X, y=y)

---- Standard
DummyClassifier(strategy='uniform'): 0.500298058650535
LogisticRegression(max_iter=500): 0.923990634961927
RandomForestClassifier(random_state=42): 0.9438834043941606
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9510077116062099
ExtraTreesClassifier(random_state=42): 0.95

In [7]:
# Transform combination reused by the later experiments:
# log on the capital columns, square root on the freq columns.
best_transformers = [
    CapitalLogTransformer,
    FreqSqrtTransformer,
]
# Baseline reference
test_models(
    baseline_models(),
    X,
    y,
    preprocess=wrap_model(transformers=best_transformers),
)

DummyClassifier(strategy='uniform'): 0.48723197869497936
LogisticRegression(max_iter=500): 0.9409129589375709
RandomForestClassifier(random_state=42): 0.9453684066648738
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9510077116062099
ExtraTreesClassifier(random_state=42): 0.9513066520870

## Naive Bayes

In [11]:
# Display the training feature frame (the 'Class' label was popped into `y` when the split was made).
X

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.00,0.00,0.00,0.0,0.82,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.82,0.0,0.00,0.0,0.00,0.00,4.13,2.47,1.65,0.82,0.82,0.82,0.82,0.82,0.00,0.82,0.82,0.82,0.00,0.0,0.0,0.82,0.0,0.0,0.0,0.00,0.82,0.00,0.00,0.0,0.000,0.361,0.000,0.240,0.000,0.000,4.666,34,126
1,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.93,0.00,0.00,0.93,0.0,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.93,0.0,0.0,0.00,0.0,0.0,0.0,0.93,0.93,0.93,0.00,0.0,0.000,0.163,0.000,0.000,0.000,0.000,1.911,15,65
2,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,5.81,0.0,1.16,0.0,0.00,1.16,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.00,2.32,0.00,0.0,0.000,0.163,0.000,0.490,0.000,0.000,2.125,7,34
3,0.54,0.00,0.27,0.0,0.00,0.00,0.00,0.00,0.27,0.54,0.00,0.81,0.00,0.00,0.00,0.00,0.00,0.27,1.08,0.0,0.81,0.0,0.00,0.00,0.81,0.81,0.00,1.08,0.00,0.54,0.00,0.00,0.27,0.00,1.08,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0.27,0.0,0.039,0.318,0.079,0.000,0.000,0.000,4.971,76,517
4,0.00,0.00,1.96,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.96,0.00,0.0,0.00,0.0,0.00,0.00,3.92,1.96,0.00,3.92,0.00,1.96,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.000,0.475,0.000,0.000,0.000,0.000,2.950,11,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,0.27,0.27,0.55,0.0,0.27,0.27,0.00,1.37,0.27,0.82,0.27,0.55,0.00,0.00,0.00,0.00,1.37,0.55,1.65,2.2,3.03,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.27,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.000,0.280,0.000,1.029,0.093,0.000,3.621,63,344
3364,0.11,0.23,0.11,0.0,0.46,0.46,0.00,0.11,0.93,1.74,0.11,0.34,0.23,0.11,2.09,0.00,0.46,0.00,3.49,0.0,1.28,0.0,0.46,0.46,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.11,0.0,0.0,0.0,0.00,0.34,0.00,0.00,0.0,0.019,0.172,0.000,0.230,0.134,0.000,4.281,144,655
3365,0.10,0.10,0.71,0.0,0.61,0.30,0.40,0.10,1.42,0.81,0.10,0.50,0.00,0.00,0.00,0.10,0.00,1.11,2.23,0.5,2.03,0.0,0.00,0.30,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.10,0.00,0.00,0.0,0.1,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.000,0.000,0.264,0.976,0.397,0.033,3.186,56,1042
3366,0.00,0.00,0.00,0.0,1.28,0.00,1.28,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.64,0.00,1.28,1.28,0.0,0.64,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.000,0.000,0.000,0.085,0.170,0.000,2.466,18,111


In [13]:
# Naive Bayes variants on raw features, with MinMaxScaler replacing the
# default StandardScaler.
models = [GaussianNB(), ComplementNB(), MultinomialNB()]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(scaler=MinMaxScaler),
)

GaussianNB(): 0.8215532559380249
ComplementNB(): 0.843226881715689
MultinomialNB(): 0.891335135206635


In [14]:
# Same Naive Bayes comparison, with the best_transformers steps applied
# before min-max scaling.
models = [GaussianNB(), ComplementNB(), MultinomialNB()]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(scaler=MinMaxScaler, transformers=best_transformers),
)

GaussianNB(): 0.8402559953439358
ComplementNB(): 0.869653132040864
MultinomialNB(): 0.9165665054386885


In [42]:
# Sweep the smoothing parameter alpha over 0.1 .. 0.9 for both count-based NB models.
alpha_range = np.arange(0.1, 1, 0.1)
# round() strips floating-point noise from arange so the printed reprs stay clean.
models = [
    nb_class(alpha=round(alpha, 1))
    for nb_class in (ComplementNB, MultinomialNB)
    for alpha in alpha_range
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(scaler=MinMaxScaler, transformers=best_transformers),
)

ComplementNB(alpha=0.1): 0.8702452811054625
ComplementNB(alpha=0.2): 0.869355073390329
ComplementNB(alpha=0.3): 0.869355073390329
ComplementNB(alpha=0.4): 0.8696526911257006
ComplementNB(alpha=0.5): 0.8696526911257006
ComplementNB(alpha=0.6): 0.8699498679459087
ComplementNB(alpha=0.7): 0.8699498679459087
ComplementNB(alpha=0.8): 0.8699498679459087
ComplementNB(alpha=0.9): 0.869653132040864
MultinomialNB(alpha=0.1): 0.916567828184179
MultinomialNB(alpha=0.2): 0.9159739154589266
MultinomialNB(alpha=0.3): 0.915973474543763
MultinomialNB(alpha=0.4): 0.915973474543763
MultinomialNB(alpha=0.5): 0.9162702104488074
MultinomialNB(alpha=0.6): 0.9162702104488074
MultinomialNB(alpha=0.7): 0.9165665054386885
MultinomialNB(alpha=0.8): 0.9165665054386885
MultinomialNB(alpha=0.9): 0.9165665054386885


In [67]:
# Sweep alpha over 1e-1 .. 1e-6; alpha_range holds the exponents, not the alphas.
alpha_range = np.arange(1, 7)
# round(..., 8) removes floating-point noise from 0.1**k.
# force_alpha=True is passed so the tiny alphas are used as given.
models = [
    nb_class(alpha=round(0.1**exponent, 8), force_alpha=True)
    for nb_class in (ComplementNB, MultinomialNB)
    for exponent in alpha_range
]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(scaler=MinMaxScaler, transformers=best_transformers),
)

ComplementNB(alpha=0.1, force_alpha=True): 0.8702452811054625
ComplementNB(alpha=0.01, force_alpha=True): 0.8705420170105072
ComplementNB(alpha=0.001, force_alpha=True): 0.8702452811054625
ComplementNB(alpha=0.0001, force_alpha=True): 0.8702452811054625
ComplementNB(alpha=1e-05, force_alpha=True): 0.8702452811054625
ComplementNB(alpha=1e-06, force_alpha=True): 0.8702452811054625
MultinomialNB(alpha=0.1, force_alpha=True): 0.916567828184179
MultinomialNB(alpha=0.01, force_alpha=True): 0.9168645640892235
MultinomialNB(alpha=0.001, force_alpha=True): 0.9168645640892235
MultinomialNB(alpha=0.0001, force_alpha=True): 0.9168645640892235
MultinomialNB(alpha=1e-05, force_alpha=True): 0.9168645640892235
MultinomialNB(alpha=1e-06, force_alpha=True): 0.9168645640892235


In [38]:
# Naive Bayes comparison using a log transform (instead of sqrt) on the freq columns.
models = [GaussianNB(), ComplementNB(), MultinomialNB()]

log_transformers = [CapitalLogTransformer, FreqLogTransformer]
test_models(
    models,
    X,
    y,
    preprocess=wrap_model(scaler=MinMaxScaler, transformers=log_transformers),
) if False else test_models(models, X, y, preprocess=wrap_model(scaler=MinMaxScaler,
                                                                 transformers=[CapitalLogTransformer,
                                                                               FreqLogTransformer]))
del log_transformers

GaussianNB(): 0.851535927972099
ComplementNB(): 0.8815287410549336
MultinomialNB(): 0.9026119814286533


In [40]:
# Naive Bayes comparison with extra zero/non-zero indicator columns.
# FreqZeroTransformer runs before FreqSqrtTransformer; the indicator columns
# it adds have no 'freq' in their names, so the sqrt step leaves them alone.
models = [GaussianNB(), ComplementNB(), MultinomialNB()]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        scaler=MinMaxScaler,
        transformers=[CapitalLogTransformer, FreqZeroTransformer, FreqSqrtTransformer],
    ),
)

GaussianNB(): 0.8411466439742329
ComplementNB(): 0.880934828329681
MultinomialNB(): 0.8972645623255631


In [41]:
# Naive Bayes comparison with degree-2 interaction features added after scaling.
models = [GaussianNB(), ComplementNB(), MultinomialNB()]

test_models(
    models,
    X,
    y,
    preprocess=wrap_model(
        scaler=MinMaxScaler,
        transformers=best_transformers,
        poly=True,
    ),
)

GaussianNB(): 0.7995776032733541
ComplementNB(): 0.8966697677699834
MultinomialNB(): 0.9079527868042909
