In [6]:
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from __init__ import ColumnsConcatenation

In [49]:
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [8]:
pipe = Pipeline([("generate_columns", ColumnsConcatenation(columns='auto', level=2, max_cardinality=500))])
test = pipe.fit_transform(X)

In [10]:
test.columns

Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
       'cabin', 'embarked', 'boat', 'body', 'home.dest', 'sex_cabin',
       'sex_embarked', 'sex_boat', 'sex_home.dest', 'cabin_embarked',
       'cabin_boat', 'cabin_home.dest', 'embarked_boat', 'embarked_home.dest',
       'boat_home.dest'],
      dtype='object')

In [11]:
pipe = Pipeline([("generate_columns", ColumnsConcatenation(columns='auto', level=3, max_cardinality=500))])
test2 = pipe.fit_transform(X)

  X['_'.join(comb)] = X[comb[0]].str.cat(X[list(comb[1:])].astype(str), sep="_").astype("category")


In [16]:
test2.nunique().sort_values()

sex                                               2
pclass                                            3
embarked                                          3
embarked_sex_embarked                             6
sex_sex_embarked                                  7
                                               ... 
sex_home.dest_cabin_embarked_embarked_boat      693
sex_boat_cabin_home.dest_embarked_home.dest     693
sex_cabin_embarked_home.dest_boat_home.dest     693
ticket                                          929
name                                           1307
Length: 518, dtype: int64

# Test Model Performance

In [102]:
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

In [93]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_pipe, make_column_selector(dtype_include=['category', 'object'])),
    ('num', num_pipe, make_column_selector(dtype_exclude=['category', 'object'])) 
])

In [110]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])
cross_val_score(pipe, X, y.astype('int'), cv=5, scoring='f1').mean()

0.9542173735788906

In [111]:
pipe = Pipeline([
    ("generate_columns", ColumnsConcatenation(columns='auto', level=2, max_cardinality=500)),
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])
cross_val_score(pipe, X, y.astype('int'), cv=5, scoring='f1').mean()

0.9612918306403909