### Simple Pipeline
Pipeline that can process **only one type of feature** at a time, numeric or categorical (the pipeline must be redefined to handle the other type)

In [2]:
def show_models_cv_pipeline(models, cv_type, x_train, y_train, metrics, is_aggregated=True):
    """
    Cross-validate every candidate model inside an impute -> scale -> model pipeline.

    models: list
        List of model candidates

    cv_type: cross-validation type
        Passed to ``cross_validate`` as ``cv``

    x_train, y_train:
        Training features and target, passed to ``cross_validate`` as ``X`` and ``y``

    metrics: list
        Metrics to calculate (passed to ``cross_validate`` as ``scoring``)

    is_aggregated: bool
        Whether to return results averaged per model or the scores on each fold

    Returns
    -------
    pandas.DataFrame
        If ``is_aggregated`` is True: mean of every score column, indexed by model name.
        Otherwise: one row per fold with ``Model`` as the first column.
        An empty DataFrame is returned when ``models`` is empty.
    """
    models = list(models)
    if not models:
        # Nothing to evaluate; the column selection below would fail on an empty frame.
        return pd.DataFrame()

    per_model_frames = []

    for model in log_progress(models):
        model_pipeline = Pipeline([
            ('data_imputing', KNNImputer()),
            ('data_scaling', StandardScaler()),
            ('model', model)
        ])

        cv_results = cross_validate(model_pipeline, X=x_train, y=y_train, cv=cv_type, scoring=metrics)
        cv_results['Model'] = str(model).split('(')[0]  # extract model name
        per_model_frames.append(pd.DataFrame(cv_results))

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    res_df = pd.concat(per_model_frames)

    # Put the model name first; skip the first two columns (the timing columns
    # produced by cross_validate) and the trailing 'Model' column itself.
    new_columns_order = ['Model'] + list(res_df.columns[2:-1])

    # Returning results either on each fold or aggregated
    if is_aggregated:
        return res_df[new_columns_order].groupby(by='Model').mean()
    return res_df[new_columns_order]

### Pipeline All Features

In [None]:
# Redefine make_pipe, adding a final_imputing step (needed because inf values appear, which we replace with 0)
def make_pipe(cat_bin_columns,
              num_columns,
              model=None,
              cat_bin_imputer=SimpleImputer(strategy='constant', fill_value='unknown'),
              cat_bin_encoder=OneHotEncoder(handle_unknown='ignore'),
              num_imputer=SimpleImputer(strategy='mean'),
              num_scaler=StandardScaler()):
    """
    Build a preprocessing pipeline (optionally followed by a model).

    cat_bin_columns:
        Columns routed through the categorical/binary branch
        (impute -> encode -> final constant imputation).

    num_columns:
        Columns routed through the numerical branch (impute -> scale).

    model:
        Optional estimator appended as the last step, named 'model'.
        When None, the returned pipeline only contains the feature transformations.

    cat_bin_imputer, cat_bin_encoder, num_imputer, num_scaler:
        Transformers used in the corresponding pipeline steps.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with a 'feature_transformations' step and, if given, a 'model' step.

    Notes
    -----
    The default encoder no longer passes ``sparse=True``: that keyword was renamed
    to ``sparse_output`` in scikit-learn 1.2 and later removed, while sparse output
    is the default under both names, so omitting it keeps the same behaviour on
    old and new versions.
    """
    # Categorical/Binary Features Processing
    cat_bin_pipeline = Pipeline([
        ('cat_bin_imputing', cat_bin_imputer),
        ('cat_bin_encoding', cat_bin_encoder),
        # Original author's note: the resulting inf values are replaced with 0.
        # NOTE(review): SimpleImputer fills its `missing_values` (NaN by default);
        # confirm this step really covers the inf case it was added for.
        ('final_imputing', SimpleImputer(strategy='constant', fill_value=0))
    ])

    # Numerical Features Processing
    num_pipeline = Pipeline([
        ('num_imputing', num_imputer),
        ('num_scaling', num_scaler)
    ])

    # Main Transformations
    transformations = [
        ('cat_bin_transformations', cat_bin_pipeline, cat_bin_columns),
        ('num_transformations', num_pipeline, num_columns)
    ]

    feature_transformations = ColumnTransformer(transformers=transformations, n_jobs=-1)

    # Assemble the step list up front instead of mutating Pipeline.steps afterwards.
    steps = [('feature_transformations', feature_transformations)]
    if model is not None:
        steps.append(('model', model))

    return Pipeline(steps)

### Pipeline + ColumnTransformer (All Features)
Cross-validation for different models, imputers, encoders and scalers. Each feature group (numerical, binary, categorical) is processed independently, and the cross-validation scores are returned.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, cross_validate

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from tqdm.notebook import tqdm as log_progress

def show_model_cvs_pipeline_all_features(models, x_train, y_train, 
                                         cv_type, metrics,
                                         num_columns, cat_columns, bin_columns, 
                                         bin_imputer=SimpleImputer(strategy='most_frequent'), bin_encoder=OrdinalEncoder(),
                                         cat_imputer=SimpleImputer(strategy='most_frequent'), cat_encoder=OneHotEncoder(handle_unknown='ignore'),
                                         num_imputer=KNNImputer(), scaler=StandardScaler(),
                                         is_aggregated=True):
    """
    Cross-validate each model behind a ColumnTransformer that preprocesses
    numerical, binary and categorical columns separately.

    models - list of tested models
    x_train, y_train - training features and target passed to cross_validate
    cv_type - type of cross-validation (e.g. StratifiedKFold(shuffle=True, random_state=SEED))
    metrics - list of metrics to calculate (e.g. ['precision', 'recall', 'f1', 'roc_auc'])
    num_columns, cat_columns, bin_columns - columns routed to the numerical,
        categorical and binary sub-pipelines respectively
    bin_imputer, bin_encoder - steps of the binary sub-pipeline
    cat_imputer, cat_encoder - steps of the categorical sub-pipeline
    num_imputer, scaler - steps of the numerical sub-pipeline
    is_aggregated - return the info either for each fold or average estimation

    Returns a DataFrame: per-model mean scores indexed by 'Model' when
    is_aggregated is True, otherwise one row per fold with 'Model' first.
    An empty DataFrame is returned when `models` is empty.

    Note: the default cat_encoder no longer passes `sparse=True`; that keyword
    was renamed to `sparse_output` in scikit-learn 1.2 and later removed, and
    sparse output is the default under both names.
    """
    models = list(models)
    if not models:
        # Nothing to evaluate; the column selection below would fail on an empty frame.
        return pd.DataFrame()

    # Pipeline Definition

    # Binary Features Processing
    binary_pipeline = Pipeline([
        ('binary_imputing', bin_imputer),  # Any Imputer Here
        ('binary_encoding', bin_encoder)  # Any Encoder Here
    ])

    # Categorical Features Processing
    cat_pipeline = Pipeline([
        ('cat_imputing', cat_imputer),
        ('cat_encoding', cat_encoder)
    ])

    # Numerical Features Processing
    num_pipeline = Pipeline([
        ('data_imputing', num_imputer),
        ('data_scaling', scaler)
    ])

    transformations = [
        ('num_transformations', num_pipeline, num_columns),
        ('bin_transformations', binary_pipeline, bin_columns),
        ('cat_transformations', cat_pipeline, cat_columns)
    ]

    feature_transformations = ColumnTransformer(transformers=transformations)

    per_model_frames = []  # CV results of each model are collected here

    # CV part for provided models
    for model in log_progress(models):
        model_pipeline = Pipeline([
            ('feature_transformations', feature_transformations),
            ('model', model)
        ])

        # error_score='raise' surfaces fitting errors instead of recording NaN scores
        cv_results = cross_validate(model_pipeline, X=x_train, y=y_train, cv=cv_type,
                                    scoring=metrics, error_score='raise', n_jobs=-1)

        cv_results['Model'] = str(model).split('(')[0]  # extract the name of the current model
        per_model_frames.append(pd.DataFrame(cv_results))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    res_df = pd.concat(per_model_frames)

    # Make the first column store the name of the model + drop unnecessary columns
    # (the first two columns and the trailing 'Model' column are skipped)
    new_columns_order = ['Model'] + list(res_df.columns[2:-1])

    if is_aggregated:
        return res_df[new_columns_order].groupby(by='Model').mean()
    return res_df[new_columns_order]

### Test Predictions Using Pipeline 

In [5]:
# Function for saving the test predictions
def to_submission(data, f_name):
    """
    Write a submission CSV to `f_name`.

    data - 2-D array-like supporting `data.shape` and `data[:, 1]`;
           its second column becomes the 'result' column
    f_name - destination passed to DataFrame.to_csv

    The file has two columns: 'Id' (0 .. number of rows - 1) and 'result'.
    The DataFrame index is not written.
    """
    row_count = data.shape[0]
    second_column = data[:, 1]

    submission = pd.DataFrame({'Id': range(row_count), 'result': second_column})
    submission.to_csv(f_name, index=False)

# Makes test predictions and saves them if needed
def make_test_preds_pipeline(models, x_train, y_train, x_test, f_names=None, save_results=True):
    """
    Fit every model on the training data and predict probabilities for x_test.

    models - list of models
    x_train, y_train - data each pipeline is fitted on
    x_test - data passed to predict_proba
    f_names - list of file names, one per model (required when save_results is True)
    save_results - if True, write each prediction with to_submission and return None;
                   otherwise return the list of predict_proba outputs

    Raises ValueError when save_results is True and f_names is missing or has
    fewer entries than there are models. This is checked before any fitting so
    the mistake is not discovered only after all models have been trained.

    NOTE: this function uses a `feature_transformations` object that is not
    defined inside it; it must exist at notebook/module level before calling.
    """
    models = list(models)

    if save_results and models:
        if f_names is None:
            raise ValueError("f_names must be provided when save_results=True")
        if len(f_names) < len(models):
            raise ValueError("f_names must contain one file name per model")

    predictions = []

    for model in log_progress(models):
        model_pipeline = Pipeline([
            ('feature_transformations', feature_transformations),
            ('model', model)
        ])
        model_pipeline.fit(x_train, y_train)
        predictions.append(model_pipeline.predict_proba(x_test))

    if save_results:
        # zip stops at the shorter sequence, so extra file names are ignored as before
        for pred, f_name in zip(predictions, f_names or []):
            to_submission(pred, f_name=f_name)
    else:
        return predictions