In [1]:
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
import __ini__ as ini

In [2]:
# Load the OpenML "titanic" dataset (version 1): X as a DataFrame of features,
# y as the target Series.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

## Selecting columns via ColumnTransformer

In [3]:
# Categorical columns to run through the MLP-based encoder.
cols = ['sex', 'embarked']

# Project-local encoder. NOTE(review): the recorded output has 5 columns, which
# matches the last entry of `arch` -- presumably the activations of the final
# hidden layer; confirm against the MLPEncoder implementation.
encoder = ini.MLPEncoder(
    task='classification',
    arch=[10, 5],
    activation='tanh',
)

# Fill missing values with a literal 'missing' category, map categories to
# integer codes (unseen categories become 99999), then hand the codes to the
# MLP encoder.
mlp_encoder = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999)),
    ('generate_columns', encoder),
])

# Only `cols` are listed, so with ColumnTransformer's default remainder='drop'
# every other column of X is discarded.
pipe = ColumnTransformer(transformers=[
    ('mlp_encoder', mlp_encoder, cols),
])

test = pipe.fit_transform(X, y)
# transformers_[0] is the fitted (name, transformer, columns) triple.
print(f"Encoded Columns: {pipe.transformers_[0][2]}")
test

Encoded Columns: ['sex', 'embarked']


array([[ 2.0240379 ,  1.23431511,  0.06468371, -1.13983344, -0.00626586],
       [ 2.24580408,  1.61994526, -0.23022366, -1.33050143,  0.1438937 ],
       [ 2.0240379 ,  1.23431511,  0.06468371, -1.13983344, -0.00626586],
       ...,
       [ 1.18246175,  0.45495156, -0.81848907, -1.02293795,  0.15076759],
       [ 1.18246175,  0.45495156, -0.81848907, -1.02293795,  0.15076759],
       [ 2.24580408,  1.61994526, -0.23022366, -1.33050143,  0.1438937 ]])

# Test Model Performance

In [4]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

## Baseline: standard one-hot encoding


In [5]:
# Fetch a fresh copy of the OpenML "titanic" data (version 1) for this experiment.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [6]:
# Baseline experiment: one-hot encode every categorical/object column, scale
# every other column, and fit a logistic regression on the result.

# Categorical branch: missing values become their own 'missing' category.
# NOTE: `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4; switch the keyword when upgrading scikit-learn.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# NOTE(review): the dtype-based selectors feed *every* column of X to the model.
# The recorded feature count (2833) suggests high-cardinality text columns are
# being one-hot encoded, and if X contains post-outcome columns (OpenML's
# titanic is documented with `boat`/`body` -- confirm) the score below is
# inflated by target leakage.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipe, make_column_selector(dtype_include=['category', 'object'])),
    ('num', num_pipe, make_column_selector(dtype_exclude=['category', 'object'])),
])

# Use the same iteration budget as the MLPEncoder experiment. The default
# (max_iter=100) may stop before convergence on a few thousand one-hot features,
# which would make the two cross-validated scores incomparable.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=10000)),
])

# Fit the preprocessor once on the full data purely to report the feature count.
test = preprocessor.fit_transform(X, y.astype('int'))
print(f'Number of Features: {test[0].shape}')

# Mean F1 over 5 folds; the target is cast to int so the positive class is 1.
cross_val_score(pipe, X, y.astype('int'), cv=5, scoring='f1').mean()

Number of Features: (2833,)


0.9542173735788906

## MLPEncoder

In [7]:
# Fetch a fresh copy of the OpenML "titanic" data (version 1) for this experiment.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [8]:
# MLPEncoder experiment: the baseline preprocessing plus an extra branch that
# runs selected categorical columns through the project-local MLPEncoder.

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'missing' category.
# NOTE: `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4; switch the keyword when upgrading scikit-learn.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# New: columns that additionally go through the MLP-based encoder.
mlp_columns = ['sex', 'embarked']
encoder = ini.MLPEncoder(
    task='classification',
    arch=[500, 250, 100, 25],
    activation='tanh',
)

# Impute -> integer codes (unseen categories become 9999) -> MLPEncoder.
mlp_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=9999)),
    ('mlp_encoder', encoder),
])

# NOTE(review): `mlp_columns` presumably also match the 'cat' dtype selector, so
# they appear to be one-hot encoded *and* MLP-encoded. The recorded feature
# count (2858) versus the baseline's 2833 is consistent with 25 added columns.
# Confirm whether adding (rather than replacing) features is intended.
preprocessor = ColumnTransformer(transformers=[
    ('mlp_pipe', mlp_pipe, mlp_columns),
    ('num', num_pipe, make_column_selector(dtype_exclude=['category', 'object'])),
    ('cat', cat_pipe, make_column_selector(dtype_include=['category', 'object'])),
])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=10000)),
])

# Fit the preprocessor once on the full data purely to report the feature count.
test = preprocessor.fit_transform(X, y.astype('int'))
print(f'Number of Features: {test[0].shape}')

# Mean F1 over 5 folds; the target is cast to int so the positive class is 1.
cross_val_score(pipe, X, y.astype('int'), cv=5, scoring='f1').mean()

Number of Features: (2858,)


0.9560314736718631