https://towardsdatascience.com/simple-way-to-find-a-suitable-algorithm-for-your-data-in-scikit-learn-python-9a9710c7c0fe

In [1]:
# Set seed
seed = 8

# Data manipulation
import numpy as np
import pandas as pd
from seaborn import load_dataset

# Machine learning pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

# Import data
exclude = ['pclass', 'embarked', 'who', 'adult_male', 'alive', 'alone']
df = load_dataset('titanic').drop(columns=exclude)

# Inspect shape of the data and top rows
print(f"{df.shape[0]} rows, {df.shape[1]} columns")
df.head()

891 rows, 9 columns


Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,deck,embark_town
0,0,male,22.0,1,0,7.25,Third,,Southampton
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,,Southampton
3,1,female,35.0,1,0,53.1,First,C,Southampton
4,0,male,35.0,0,0,8.05,Third,,Southampton


In [2]:
# Set target
target = 'survived'
features = df.drop(columns=target).columns

# Split data into train & test
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=.2, random_state=seed, 
                                                    stratify=df[target])

# Inspect data
print(f"Training data ({X_train.shape[0]} rows): Target distribution")
print(y_train.value_counts(normalize=True))
print(f"\nTest data ({X_test.shape[0]} rows): Target distribution")
print(y_train.value_counts(normalize=True))

# Define feature groups
numerical = X_train.select_dtypes(['number']).columns
print(f'\nNumerical: {numerical}')
categorical = X_train.columns.difference(numerical)
X_train[categorical] = X_train[categorical].astype('object')
print(f'Categorical: {categorical}')

Training data (712 rows): Target distribution
0    0.616573
1    0.383427
Name: survived, dtype: float64

Test data (179 rows): Target distribution
0    0.616573
1    0.383427
Name: survived, dtype: float64

Numerical: Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')
Categorical: Index(['class', 'deck', 'embark_town', 'sex'], dtype='object')


In [3]:
class Imputer(BaseEstimator, TransformerMixin):
    """A custom transformer that imputes with a constant value in place.
    
    Parameters
    ----------
    value: (optional) A value to impute with
    """
    def __init__(self, value='missing'):
        self.value = value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.fillna(self.value, inplace=True)
        return X
    
class CardinalityReducer(BaseEstimator, TransformerMixin):
    """A custom transformer that encodes infrequent labels into 'other' in place.
    
    Parameters
    ----------
    threshold: (optional) An integer for minimum threshold frequency count or 
    a float for threshold of frequency proportion to keep the category. If 
    category frequency doesn't surpass the threshold, its value will be 
    overwritten with 'other'.  
    """
    def __init__(self, threshold=.01):
        self.threshold = threshold

    def fit(self, X, y=None):
        self.top_categories = {}
        for feature in X.columns:
            frequencies = pd.Series(X[feature].value_counts(normalize=True))
            if isinstance(self.threshold, int):
                top_categories = frequencies.head(self.threshold).index
            elif isinstance(self.threshold, float):   
                top_categories = frequencies[frequencies>self.threshold].index
            self.top_categories[feature] = list(top_categories)
        return self

    def transform(self, X):
        for feature in X.columns:
            X[feature] = np.where(X[feature].isin(self.top_categories[feature]), 
                                  X[feature], 'other')
        return X

In [4]:
# Build preprocessing pipeline
categorical_pipe = Pipeline([('imputer', Imputer()),
                             ('cardinality_reducer', CardinalityReducer()),
                             ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                           ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(transformers=[('cat', categorical_pipe, categorical),
                                               ('num', numerical_pipe, numerical)])
# Fit and transform training data
preprocessor.fit(X_train)
cat = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(categorical)
columns = np.append(cat, numerical)
X_train_transformed = pd.DataFrame(preprocessor.transform(X_train), columns=columns)
X_train_transformed.head()

Unnamed: 0,class_First,class_Second,class_Third,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_missing,...,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_other,sex_female,sex_male,age,sibsp,parch,fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.031425
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.013565
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.016461
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.015835
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.22091,0.125,0.0,0.034743


In [5]:
def create_baseline_classifiers(seed=8):
    """Create a list of baseline classifiers.
    
    Parameters
    ----------
    seed: (optional) An integer to set seed for reproducibility
    Returns
    -------
    A list containing tuple of name, model object for each of these algortihms:
    DummyClassifier, LogisticRegression, SGDClassifier, ExtraTreesClassifier, 
    GradientBoostingClassifier, RandomForestClassifier, MultinomialNB, SVC, 
    XGBClassifier.
    
    """
    models = []
    models.append(('dum', DummyClassifier(random_state=seed, strategy='most_frequent')))
    models.append(('log', LogisticRegression(random_state=seed)))
    models.append(('sgd', SGDClassifier(random_state=seed)))
    models.append(('etc', ExtraTreesClassifier(random_state=seed)))
    models.append(('gbm', GradientBoostingClassifier(random_state=seed)))
    models.append(('rfc', RandomForestClassifier(random_state=seed)))
    models.append(('mnb', MultinomialNB()))
    models.append(('svc', SVC(random_state=seed, probability=True)))
    models.append(('xgb', XGBClassifier(seed=seed)))
    return models

def assess_models(X, y, models, cv=5, metrics=['roc_auc', 'f1']):
    """Provide summary of cross validation results for models.
    
    Parameters
    ----------
    X: A pandas DataFrame containing feature matrix
    y: A pandas Series containing target vector
    models: A list of models to train
    cv: (optional) An integer to set number of folds in cross-validation
    metrics: (optional) A list of scoring metrics or a string for a metric
    Returns
    -------
    A pandas DataFrame containing summary of baseline models' performance.
    
    """
    summary = pd.DataFrame()
    for name, model in models:
        result = pd.DataFrame(cross_validate(model, X, y, cv=cv, scoring=metrics))
        mean = result.mean().rename('{}_mean'.format)
        std = result.std().rename('{}_std'.format)
        summary[name] = pd.concat([mean, std], axis=0)
    return summary.sort_index()

def extract_metric(summary, metric):
    """Provide summary of baseline models' performance for a metric.
    
    Parameters
    ----------
    summary: A pandas DataFrame containing the summary of baseline models
    metric: A string specifying the name of the metric to extract info
    
    Returns
    -------
    A pandas DataFrame containing mean, standard deviation, lower and upper
    bound of the baseline models' performance in cross validation according to
    the metric specified.
    
    """
    output = summary[summary.index.str.contains(metric)].T
    output.columns = output.columns.str.replace(f'test_{metric}_', '')
    output.sort_values(by='mean', ascending=False, inplace=True)
    output['lower'] = output['mean'] - 2*output['std']
    output['upper'] = output['mean'] + 2*output['std']
    return output

In [6]:
models = create_baseline_classifiers()
summary = assess_models(X_train_transformed, y_train, models)
summary

Unnamed: 0,dum,log,sgd,etc,gbm,rfc,mnb,svc,xgb
fit_time_mean,0.00071,0.020045,0.003595,0.080918,0.063208,0.104862,0.001591,0.027985,0.091715
fit_time_std,0.000677,0.003446,0.00022,0.003421,0.000749,0.006024,0.000514,0.000795,0.109854
score_time_mean,0.001582,0.003099,0.0027,0.016798,0.003898,0.016497,0.002992,0.004598,0.008835
score_time_std,0.000617,0.000546,0.000446,0.000454,0.000546,0.001087,0.000349,0.000653,0.000742
test_f1_mean,0.0,0.724117,0.630882,0.696769,0.72845,0.717753,0.678351,0.680479,0.75555
test_f1_std,0.0,0.045849,0.150847,0.036271,0.056353,0.030834,0.076662,0.053154,0.049842
test_roc_auc_mean,0.5,0.838609,0.837111,0.808926,0.841248,0.831918,0.820045,0.836848,0.834282
test_roc_auc_std,0.0,0.040763,0.03876,0.037078,0.041538,0.034072,0.058406,0.039433,0.027477


In [7]:
extract_metric(summary, 'roc_auc')

Unnamed: 0,mean,std,lower,upper
gbm,0.841248,0.041538,0.758172,0.924325
log,0.838609,0.040763,0.757082,0.920136
sgd,0.837111,0.03876,0.759591,0.91463
svc,0.836848,0.039433,0.757982,0.915715
xgb,0.834282,0.027477,0.779327,0.889236
rfc,0.831918,0.034072,0.763774,0.900062
mnb,0.820045,0.058406,0.703234,0.936856
etc,0.808926,0.037078,0.73477,0.883082
dum,0.5,0.0,0.5,0.5


In [8]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor

def create_baseline_regressors(seed=8):
    """Create a list of of baseline regressors.
    
    Parameters
    ----------
    seed: (optional) An integer to set seed for reproducibility
    Returns
    -------
    A list containing tuple of name, model object for each of these algortihms:
    DummyRegressor, LinearRegression, SGDRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, RandomForestRegressor, SVR, XGBRegressor.
    
    """
    models = []
    models.append(('dum', DummyRegressor(strategy='mean')))
    models.append(('ols', LinearRegression()))
    models.append(('sgd', SGDRegressor(random_state=seed)))
    models.append(('etr', ExtraTreesRegressor(random_state=seed)))
    models.append(('gbm', GradientBoostingRegressor(random_state=seed)))
    models.append(('rfr', RandomForestRegressor(random_state=seed)))
    models.append(('svc', SVR()))
    models.append(('xgb', XGBRegressor(seed=seed)))
    return models