In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt


from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin

from IPython.display import display

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [4]:
# Sourced from the UCI website
from ucimlrepo import fetch_ucirepo 
  
spambase = fetch_ucirepo(id=94) 
  
full_X = spambase.data.features
full_y = spambase.data.targets 

full = full_X.join(full_y).drop_duplicates()
train, test = train_test_split(full, test_size=0.20, stratify=full['Class'], random_state=123)

y = train.pop('Class').reset_index(drop=True)
X = train.reset_index(drop=True)

test_y = test.pop('Class').reset_index(drop=True)
test_X = test.reset_index(drop=True)

In [5]:
outliers = pd.Series(0, index=np.arange(len(X)))
for col in X.columns:
    q = X[X[col]>0][col].quantile(0.99)
    outliers |= X[col] > q

X2 = X[~outliers]
y2 = y[~outliers]

outliers.sum()

377

In [6]:
# Log transform capital columns
def capital_log_transform(X):
    X = X.copy()
    rest_cols = X.filter(like='capital').columns
    X[rest_cols] = np.log(X[rest_cols])
    return X
CapitalLogTransformer = FunctionTransformer(capital_log_transform)

# Convert percentages to [0,1] range
def freq_percent_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] /= 100
    return X
FreqPercentTransformer = FunctionTransformer(capital_log_transform)

# Sqrt transform freq cols
def freq_sqrt_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] = np.sqrt(X[freq_cols])
    return X
FreqSqrtTransformer = FunctionTransformer(freq_sqrt_transform)

# log transform freq cols
def freq_log_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    X[freq_cols] = np.log(X[freq_cols]+0.001)
    return X
FreqLogTransformer = FunctionTransformer(freq_log_transform)

# Mark zero freq values
def freq_zero_transform(X):
    X = X.copy()
    freq_cols = X.filter(like='freq').columns
    # Name them zero_i so later transform steps dont mess it up
    zero_mark_cols =  'zero_i' + pd.Index(np.arange(len(freq_cols))).astype(str)
    X[zero_mark_cols] = X[freq_cols] > 0
    return X
FreqZeroTransformer = FunctionTransformer(freq_zero_transform)

In [24]:
def wrap_model(scaler=StandardScaler, transformers=[], poly=False):
    def inner(model):
        return make_pipeline(*transformers,
                             *([scaler()] if scaler else []), 
                             *([PolynomialFeatures(2, interaction_only=True)] if poly else []), 
                             model)
    return inner

default = wrap_model()

def test_model(model, X, y, preprocess=default, metric='accuracy', splits=5):
    # If no preprocessing, just use model
    pipeline = model
    if preprocess:
        pipeline = preprocess(model)
        
    kf = KFold(n_splits=splits, shuffle=True, random_state=0)
    cv_results = cross_val_score(
        pipeline,
        X,
        y,
        cv=kf,
        scoring=metric,
        n_jobs=-1)
    
    return cv_results

def test_models(models, X, y, metric='accuracy', splits=5, preprocess=default):
    for model in models:
        result = test_model(model, X, y, metric=metric, preprocess=preprocess).mean()
        print(f'{model}: {result}')

def baseline_models():
    return [
        # Dummy
        DummyClassifier(strategy='uniform'),
        # Linear
        LogisticRegression(max_iter=500),
        # Ensambles
        RandomForestClassifier(random_state=42),
        XGBClassifier(random_state=42),
        ExtraTreesClassifier(random_state=42),
        HistGradientBoostingClassifier(random_state=42),
        # SVM
        SVC(),
        # KNN
        KNeighborsClassifier(),
    ]

In [30]:
# Baseline scores
print('---- Standard')
test_models(baseline_models(), X, y)                                  

---- Standard
DummyClassifier(strategy='uniform'): 0.492276489080736
LogisticRegression(max_iter=500): 0.9242908981882797
RandomForestClassifier(random_state=42): 0.9450677025233576
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.944180140299205
ExtraTreesClassifier(random_state=42): 0.94

In [84]:
# Check MinMax Scaler
print('---- MinMax')
test_models(baseline_models(), X, y, preprocess=wrap_model(scaler=MinMaxScaler))   

---- MinMax
DummyClassifier(strategy='uniform'): 0.521076626646267
LogisticRegression(max_iter=500): 0.8883655715803723
RandomForestClassifier(random_state=42): 0.9453648793435656
GradientBoostingClassifier(random_state=42): 0.9400223103072738
ExtraTreesClassifier(random_state=42): 0.9495257957416413
HistGradientBoostingClassifier(random_state=42): 0.9483397339517904
SVC(): 0.9269597576730261
KNeighborsClassifier(): 0.8895538379460408


In [157]:
# Custom input transformer test 
print('---- CapitalLog')
test_models(baseline_models(), X, y, preprocess=wrap_model(transformers=[CapitalLogTransformer]))
print()
print('---- FreqSqrt')
test_models(baseline_models(), X, y, preprocess=wrap_model(transformers=[FreqSqrtTransformer]))
print()
print('---- CapitalLog + FreqSqrt')
test_models(baseline_models(), X, y, preprocess=wrap_model(transformers=[CapitalLogTransformer, 
                                                                         FreqSqrtTransformer]))
print()
print('---- CapitalLog + FreqLog (with adjust for 0)')
test_models(models, X, y, preprocess=wrap_model(transformers=[CapitalLogTransformer, 
                                                              FreqLogTransformer]))

---- CapitalLog
DummyClassifier(strategy='uniform'): 0.5035678855031503
LogisticRegression(max_iter=500): 0.9302287026953145
RandomForestClassifier(random_state=42): 0.9465518229637435
GradientBoostingClassifier(random_state=42): 0.9400223103072738
ExtraTreesClassifier(random_state=42): 0.95249359570725
HistGradientBoostingClassifier(random_state=42): 0.9483397339517904
SVC(): 0.930524556770032
KNeighborsClassifier(): 0.9130083200691355

---- FreqSqrt
DummyClassifier(strategy='uniform'): 0.4949488758867906
LogisticRegression(max_iter=500): 0.9364628021922302
RandomForestClassifier(random_state=42): 0.94566161524861
GradientBoostingClassifier(random_state=42): 0.9400223103072738
ExtraTreesClassifier(random_state=42): 0.952196418887042
HistGradientBoostingClassifier(random_state=42): 0.9483397339517904
SVC(): 0.9373516871618731
KNeighborsClassifier(): 0.9254778418084577

---- CapitalLog + FreqSqrt
DummyClassifier(strategy='uniform'): 0.4848642642669124
LogisticRegression(max_iter=500): 0

In [158]:
# Zero marker test
print('---- Freq: Sqrt + ZeroMark')
test_models(baseline_models(), X, y, preprocess=wrap_model(transformers=[CapitalLogTransformer, 
                                                                         FreqZeroTransformer, 
                                                                         FreqSqrtTransformer]))
print()
print('---- Freq: Log + ZeroMark')
test_models(baseline_models(), X, y, preprocess=wrap_model(transformers=[CapitalLogTransformer, 
                                                                         FreqZeroTransformer, 
                                                                         FreqLogTransformer]))

---- Freq: Sqrt + ZeroMark
DummyClassifier(strategy='uniform'): 0.4979237304950154
LogisticRegression(max_iter=500): 0.9432934599053796
RandomForestClassifier(random_state=42): 0.9474446761698582
GradientBoostingClassifier(random_state=42): 0.9409125180224074
ExtraTreesClassifier(random_state=42): 0.9456651425699182
HistGradientBoostingClassifier(random_state=42): 0.9483397339517904
SVC(): 0.9486369107719984
KNeighborsClassifier(): 0.9364636840225572

---- Freq: Log + ZeroMark
DummyClassifier(strategy='uniform'): 0.49168301727064695
LogisticRegression(max_iter=500): 0.943293018990216
RandomForestClassifier(random_state=42): 0.9474451170850218
GradientBoostingClassifier(random_state=42): 0.9409125180224074
ExtraTreesClassifier(random_state=42): 0.9456647016547546
HistGradientBoostingClassifier(random_state=42): 0.9483397339517904
SVC(): 0.9438842862244876
KNeighborsClassifier(): 0.9367573335214571


In [13]:
best_transformers = [CapitalLogTransformer, FreqSqrtTransformer]

## Outliers

In [222]:
# Filetered outliers
test_models(baseline_models(), X2, y2)

DummyClassifier(strategy='uniform'): 0.4908079798549421
LogisticRegression(max_iter=500): 0.9297943618405258
RandomForestClassifier(random_state=42): 0.9468400511443263
GradientBoostingClassifier(random_state=42): 0.9428283482504286
ExtraTreesClassifier(random_state=42): 0.9488472984517117
HistGradientBoostingClassifier(random_state=42): 0.9485139669795254
SVC(): 0.9334682665088415
KNeighborsClassifier(): 0.908731386201082


In [213]:
# Filetered outliers and transformation applied
test_models(baseline_models(), X2, y2, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.4857834406284722
LogisticRegression(max_iter=500): 0.9394894500868224
RandomForestClassifier(random_state=42): 0.9458372650068956
GradientBoostingClassifier(random_state=42): 0.9424939000898934
ExtraTreesClassifier(random_state=42): 0.9515223253918181
HistGradientBoostingClassifier(random_state=42): 0.9488484151400606
SVC(): 0.9468445178977227
KNeighborsClassifier(): 0.9378133008749254


## Polynomial terms

In [220]:
# Polynomial
models = baseline_models()

test_models(models, X, y, preprocess=wrap_model(poly=True))

DummyClassifier(strategy='uniform'): 0.5124818673639004
LogisticRegression(max_iter=500): 0.9094444028024569
RandomForestClassifier(random_state=42): 0.942398843038611
GradientBoostingClassifier(random_state=42): 0.9453679657497102
ExtraTreesClassifier(random_state=42): 0.9498216498163587
HistGradientBoostingClassifier(random_state=42): 0.950418648947756
SVC(): 0.8563008981441881
KNeighborsClassifier(): 0.9002372123579703


In [221]:
# Poly outlierless
models = baseline_models()

test_models(models, X2, y2, preprocess=wrap_model(poly=True))

DummyClassifier(strategy='uniform'): 0.5021753089039146
LogisticRegression(max_iter=500): 0.9150758510561079
RandomForestClassifier(random_state=42): 0.9444994723647551
GradientBoostingClassifier(random_state=42): 0.9498484095566189
ExtraTreesClassifier(random_state=42): 0.9478433956259318
HistGradientBoostingClassifier(random_state=42): 0.953194007850319
SVC(): 0.8980279283756092
KNeighborsClassifier(): 0.8933540292907354


In [224]:
# Poly outlierless with transforms
models = baseline_models()

test_models(models, X2, y2, preprocess=wrap_model(poly=True, transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.5025147821620204
LogisticRegression(max_iter=500): 0.9247737310232775
RandomForestClassifier(random_state=42): 0.9455039335347093
GradientBoostingClassifier(random_state=42): 0.9461717131674308
ExtraTreesClassifier(random_state=42): 0.9515223253918179
HistGradientBoostingClassifier(random_state=42): 0.9535273393225052
SVC(): 0.9341377211740861
KNeighborsClassifier(): 0.9043835601141256


## Filtering columns

In [230]:
# Analysis in other notebook
low_corr_cols = [
    'word_freq_direct', 'char_freq_(', 'word_freq_table', 'word_freq_parts',
]
X_corr = X2.copy()
X_corr = X_corr.drop(low_corr_cols, axis=1)

In [179]:
# Columns with |correlation| > 0.1
test_models(baseline_models(), X_corr, y2, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.5121998202131758
LogisticRegression(max_iter=500): 0.9394900084309971
RandomForestClassifier(random_state=42): 0.9461717131674305
GradientBoostingClassifier(random_state=42): 0.9421594519293583
ExtraTreesClassifier(random_state=42): 0.9548656903088201
HistGradientBoostingClassifier(random_state=42): 0.9498523179658406
SVC(): 0.9485156420120491
KNeighborsClassifier(): 0.9374794110585647


In [180]:
# Columns with |correlation| > 0.1 and poly terms
models = baseline_models()

test_models(models, X_corr, y2, preprocess=wrap_model(poly=True, transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.4901357334688249
LogisticRegression(max_iter=500): 0.9277832061239188
RandomForestClassifier(random_state=42): 0.947509505809571
GradientBoostingClassifier(random_state=42): 0.9465061613279658
ExtraTreesClassifier(random_state=42): 0.9511884355754574
HistGradientBoostingClassifier(random_state=42): 0.9495167531169564
SVC(): 0.9384844305726936
KNeighborsClassifier(): 0.9077263666869531


## Custom columns

In [198]:
pos_corr = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove',
    'word_freq_internet', 'word_freq_order', 'word_freq_mail',
    'word_freq_receive', 'word_freq_will', 'word_freq_people',
    'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you',
    'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
    'word_freq_money', 'char_freq_!', 'char_freq_$', 'char_freq_#'
]

neg_corr = [
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85',
    'word_freq_technology', 'word_freq_1999', 'word_freq_parts',
    'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
    'word_freq_original', 'word_freq_project', 'word_freq_re',
    'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_['
]

In [206]:
# Test with correlated columns and two sum columns
X3 = X2.copy()
X3['sum_char_freq'] = (X3.filter(like='char_freq')/100).sum(axis=1)

test_models(baseline_models(), X3, y2, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.49380182131869726
LogisticRegression(max_iter=500): 0.9391550019262873
RandomForestClassifier(random_state=42): 0.9448344788694648
GradientBoostingClassifier(random_state=42): 0.9431627964109637
ExtraTreesClassifier(random_state=42): 0.952526228217598
HistGradientBoostingClassifier(random_state=42): 0.9488495318284098
SVC(): 0.9471772910257341
KNeighborsClassifier(): 0.9388183203890541


In [205]:
# Test with correlated columns and two sum columns
X3 = X2.copy()
X3['pos_sum_freq'] = (X3[pos_corr].filter(like='freq')/100).sum(axis=1)
X3['neg_sum_freq'] = (X3[neg_corr].filter(like='freq')/100).sum(axis=1)

test_models(baseline_models(), X3, y2, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.497826366128609
LogisticRegression(max_iter=500): 0.9378172092841469
RandomForestClassifier(random_state=42): 0.9488472984517115
GradientBoostingClassifier(random_state=42): 0.9408227759755668
ExtraTreesClassifier(random_state=42): 0.9552012551577043
HistGradientBoostingClassifier(random_state=42): 0.9481795188189904
SVC(): 0.9471789660582577
KNeighborsClassifier(): 0.9391516518612404


In [207]:
# Test with correlated columns and two sum columns
X3 = X2.copy()
X3['sum_char_freq'] = (X3.filter(like='char_freq')/100).sum(axis=1)
X3['pos_sum_freq'] = (X3[pos_corr].filter(like='freq')/100).sum(axis=1)
X3['neg_sum_freq'] = (X3[neg_corr].filter(like='freq')/100).sum(axis=1)

test_models(baseline_models(), X3, y2, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.4894685121802781
LogisticRegression(max_iter=500): 0.9368144231467161
RandomForestClassifier(random_state=42): 0.9485117336028275
GradientBoostingClassifier(random_state=42): 0.9414911139524627
ExtraTreesClassifier(random_state=42): 0.9521912217128883
HistGradientBoostingClassifier(random_state=42): 0.9508545457590968
SVC(): 0.9468445178977225
KNeighborsClassifier(): 0.9368116314258434


In [209]:
# Test with correlated columns and two sum columns
X3 = X2.copy()
X3['sum_char_freq'] = (X3.filter(like='char_freq')/100).sum(axis=1)
X3['pos_sum_freq'] = (X3[pos_corr].filter(like='freq')/100).sum(axis=1)
X3['neg_sum_freq'] = (X3[neg_corr].filter(like='freq')/100).sum(axis=1)

test_models(baseline_models(), X3, y2, preprocess=wrap_model(poly=True, transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.49683195515379597
LogisticRegression(max_iter=500): 0.9294560052707691
RandomForestClassifier(random_state=42): 0.9455039335347095
GradientBoostingClassifier(random_state=42): 0.9451700437183488
ExtraTreesClassifier(random_state=42): 0.9508545457590969
HistGradientBoostingClassifier(random_state=42): 0.9538629041713893
SVC(): 0.9334699415413649
KNeighborsClassifier(): 0.9073885684613708


## PCA

In [39]:
# Baseline reference
test_models(baseline_models(), Xpca, y, preprocess=wrap_model(transformers=best_transformers))

DummyClassifier(strategy='uniform'): 0.5083183054748435
LogisticRegression(max_iter=500): 0.9394319249033293
RandomForestClassifier(random_state=42): 0.9462572916345167
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9447753757699482
ExtraTreesClassifier(random_state=42): 0.95308927209315

In [18]:
freq_cols = X.filter(like='freq')
res = pd.DataFrame({
    'regular': freq_cols.corrwith(y),
    'sqrt': np.sqrt(freq_cols).corrwith(y),
    'log': np.log(freq_cols + 0.001).corrwith(y),
})

In [36]:
neg_low_corr_cols = res[(res['sqrt'] < 0) & (abs(res['sqrt'])<0.1)].index
display(neg_low_corr_cols)

pca = wrap_model(transformers=[CapitalLogTransformer, FreqSqrtTransformer])(PCA(n_components=2)).fit(X[neg_low_corr_cols])
pca_out = pd.DataFrame(pca.transform(X[neg_low_corr_cols]))
Xpca = X.drop(neg_low_corr_cols, axis='columns')
Xpca['pca0'] = pca_out[0]
Xpca['pca1'] = pca_out[1]

test_models(baseline_models(), Xpca, y, preprocess=wrap_model(transformers=best_transformers))

Index(['word_freq_parts', 'word_freq_direct', 'word_freq_table',
       'char_freq_;'],
      dtype='object')

DummyClassifier(strategy='uniform'): 0.5207882681293292
LogisticRegression(max_iter=500): 0.9394319249033293
RandomForestClassifier(random_state=42): 0.9429918739335366
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9471527903316124
ExtraTreesClassifier(random_state=42): 0.95278856795164

In [40]:
neg_low_corr_cols = res[(res['sqrt'] < 0) & (abs(res['sqrt'])<0.1)].index
display(neg_low_corr_cols)

pca = wrap_model(transformers=[CapitalLogTransformer, FreqSqrtTransformer])(PCA(n_components=1)).fit(X[neg_low_corr_cols])
pca_out = pd.DataFrame(pca.transform(X[neg_low_corr_cols]))
Xpca = X.drop(neg_low_corr_cols, axis='columns')
Xpca['pca0'] = pca_out[0]

test_models(baseline_models(), Xpca, y, preprocess=wrap_model(transformers=best_transformers))

Index(['word_freq_parts', 'word_freq_direct', 'word_freq_table',
       'char_freq_;'],
      dtype='object')

DummyClassifier(strategy='uniform'): 0.49524517087667164
LogisticRegression(max_iter=500): 0.9394319249033293
RandomForestClassifier(random_state=42): 0.9462572916345167
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9447753757699482
ExtraTreesClassifier(random_state=42): 0.9530892720931

In [41]:
neg_low_corr_cols = res[(res['sqrt'] < 0) & (abs(res['sqrt'])<0.1)].index
display(neg_low_corr_cols)
X3 = X.drop(neg_low_corr_cols, axis='columns')

test_models(baseline_models(), X3, y, preprocess=wrap_model(transformers=best_transformers))

Index(['word_freq_parts', 'word_freq_direct', 'word_freq_table',
       'char_freq_;'],
      dtype='object')

DummyClassifier(strategy='uniform'): 0.5062318949210982
LogisticRegression(max_iter=500): 0.9376528322185529
RandomForestClassifier(random_state=42): 0.9429901102728824
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9450712298446657
ExtraTreesClassifier(random_state=42): 0.95130621117190

In [43]:
pos_low_corr_cols = res[(res['sqrt'] > 0) & (abs(res['sqrt'])<0.15)].index
display(pos_low_corr_cols)

pca = wrap_model(transformers=[CapitalLogTransformer, FreqSqrtTransformer])(PCA(n_components=1)).fit(X[pos_low_corr_cols])
pca_out = pd.DataFrame(pca.transform(X[pos_low_corr_cols]))
Xpca = X.drop(neg_low_corr_cols, axis='columns')
Xpca['pca0'] = pca_out[0]

test_models(baseline_models(), Xpca, y, preprocess=wrap_model(transformers=best_transformers))

Index(['word_freq_3d', 'word_freq_will', 'word_freq_report', 'word_freq_font'], dtype='object')

DummyClassifier(strategy='uniform'): 0.4964409328001199
LogisticRegression(max_iter=500): 0.9376528322185529
RandomForestClassifier(random_state=42): 0.9462555279738625
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9444768762042497
ExtraTreesClassifier(random_state=42): 0.95130621117190

In [46]:
pos_low_corr_cols = res[(res['sqrt'] > 0) & (abs(res['sqrt'])<0.15)].index
display(pos_low_corr_cols)
X3 = X.drop(neg_low_corr_cols, axis='columns')

test_models(baseline_models(), X3, y, preprocess=wrap_model(transformers=best_transformers))


Index(['word_freq_3d', 'word_freq_will', 'word_freq_report', 'word_freq_font'], dtype='object')

DummyClassifier(strategy='uniform'): 0.5109942196022063
LogisticRegression(max_iter=500): 0.9376528322185529
RandomForestClassifier(random_state=42): 0.9429901102728824
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...): 0.9450712298446657
ExtraTreesClassifier(random_state=42): 0.95130621117190