In [None]:
Some good feature-engineering examples

> ### Building pipeline

https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC

import numpy as np
import pandas as pd

import pmlb

import random

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

# Apply seaborn's default theme and make 12x8 the default figure size for later plots.
sns.set(rc={"figure.figsize": (12, 8)})


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of a DataFrame.

    Parameters
    ----------
    columns : list
        Column labels used to index the DataFrame in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If the column lookup fails; the message lists the requested
            columns that are absent from ``X``.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            selected = X[self.columns]
        except KeyError:
            # Report only the labels that are actually missing from X.
            missing = list(set(self.columns).difference(set(X.columns)))
            raise KeyError("The DataFrame does not include the columns: %s" % missing)
        return selected


class TypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype : object
        Forwarded to ``DataFrame.select_dtypes(include=[dtype])``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by ``self.dtype``.

        Raises ``AssertionError`` if ``X`` is not a pandas DataFrame.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)



# Load the "churn" dataset through pmlb as one frame; the label lives in the
# "target" column (it is dropped from X and read back as y below).
df = pmlb.fetch_data('churn', return_X_y=False)

# Columns the model is allowed to see: everything except the target and the phone number.
# X itself still contains "phone number"; ColumnSelector(columns=x_cols) removes it later.
x_cols = [c for c in df if c not in ["target", "phone number"]]

binary_features = ["international plan", "voice mail plan"]
categorical_features = ["state", "area code"]

# Cast every feature column to float first; selected columns are re-typed below
# so that TypeSelector can route them by dtype.
X = (
    df
    .drop(["target"], axis=1)
    .astype(float)
)
X[binary_features] = X[binary_features].astype("bool")

# Categorical features can't be set all at once
for f in categorical_features:
    X[f] = X[f].astype("category")

y = df.target

# Randomly set 500 cells to missing values. The RNG is seeded and the candidate
# list is built in row-major order, so the same cells are hit on every run.
# NOTE(review): cells are drawn from every column of X, including the bool columns.
# Writing NaN into a bool column presumably changes its dtype, in which case
# TypeSelector("bool") would no longer select it -- confirm with X.dtypes.
random.seed(42)
num_missing = 500
indices = [(row, col) for row in range(X.shape[0]) for col in range(X.shape[1])]
for row, col in random.sample(indices, num_missing):
    X.iat[row, col] = np.nan

# Partition data set into training/test split (2 to 1 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=42)


# One branch per dtype family: select the columns, impute, then scale/encode.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    Imputer(strategy="median"),
    StandardScaler(),
)
categorical_branch = make_pipeline(
    TypeSelector("category"),
    Imputer(strategy="most_frequent"),
    OneHotEncoder(),
)
boolean_branch = make_pipeline(
    TypeSelector("bool"),
    Imputer(strategy="most_frequent"),
)

# The branch outputs are concatenated side by side into one feature matrix.
feature_union = FeatureUnion(transformer_list=[
    ("numeric_features", numeric_branch),
    ("categorical_features", categorical_branch),
    ("boolean_features", boolean_branch),
])

preprocess_pipeline = make_pipeline(ColumnSelector(columns=x_cols), feature_union)

classifier_pipeline = make_pipeline(
    preprocess_pipeline,
    SVC(kernel="rbf", random_state=42),
)

# make_pipeline names each step after its lowercased class name, hence "svc__gamma".
param_grid = {"svc__gamma": [0.1 * step for step in range(1, 6)]}

# 10-fold cross-validated search over gamma, fitted on the training split only.
classifier_model = GridSearchCV(classifier_pipeline, param_grid, cv=10)
classifier_model.fit(X_train, y_train)

# ROC analysis needs continuous scores rather than hard class labels.
y_score = classifier_model.decision_function(X_test)

roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Plot ROC curve
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal for reference
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate (1 - Specificity)', size=16)
ax.set_ylabel('True Positive Rate (Sensitivity)', size=16)
ax.set_title('ROC Curve', size=20)
ax.legend(fontsize=14);


- Dummy one-hot encoder: drop a column to remove the collinearity introduced by one-hot encoding

https://stackoverflow.com/questions/44864408/removing-columns-with-sklearns-onehotencoder

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
    
class DummyEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``X`` and drop the last column of the encoded matrix.

    The underlying ``OneHotEncoder`` is fitted once in ``fit`` and reused in
    ``transform``, so training and test data are encoded with the same column
    layout. (Re-fitting inside ``transform`` would derive the layout from
    whatever data is passed in, which can differ between splits.)

    Only the final column of the whole encoded matrix is removed
    (``[:, :-1]``). When ``X`` holds several features, the indicator groups of
    the other features are left complete.

    Parameters
    ----------
    n_values : 'auto', int or array of ints, default 'auto'
        Forwarded unchanged to ``OneHotEncoder(n_values=...)``.

    Attributes
    ----------
    encoder_ : OneHotEncoder
        The fitted encoder; set by ``fit``.
    """

    def __init__(self, n_values='auto'):
        self.n_values = n_values

    def _new_encoder(self):
        """Build an unfitted dense OneHotEncoder from the constructor parameters."""
        return OneHotEncoder(sparse=False, n_values=self.n_values)

    def fit(self, X, y=None, **fit_params):
        """Fit the one-hot encoder on ``X`` and return ``self``.

        ``y`` and ``fit_params`` are accepted for pipeline compatibility and ignored.
        """
        self.encoder_ = self._new_encoder().fit(X)
        return self

    def transform(self, X):
        """Return the dense one-hot encoding of ``X`` without its last column."""
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Backward compatibility: earlier versions were stateless and could be
            # used without calling fit(); in that case encode from X alone.
            encoder = self._new_encoder().fit(X)
        return encoder.transform(X)[:, :-1]


- one-hot encoder to get columns directly: https://medium.com/hugo-ferreiras-blog/dealing-with-categorical-features-in-machine-learning-1bb70f07262d



In [None]:
import category_encoders as ce
# One-hot encoder from the category_encoders package (not sklearn's OneHotEncoder).
# use_cat_names=True presumably names each indicator column after its category
# value rather than a positional index -- confirm against the category_encoders docs.
# NOTE(review): handle_unknown='ignore' may not be accepted by every
# category_encoders release; verify against the installed version.
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)

- ColumnTransformer
https://medium.com/dunder-data/from-pandas-to-scikit-learn-a-new-exciting-workflow-e88e2271ef62

In [None]:
# Illustrative REPL transcript copied from the linked article: cat_pipe, num_pipe,
# cat_cols, num_cols, train and ColumnTransformer are not defined or imported
# anywhere in the visible notebook, so this cell is not runnable as-is.
# Each entry of `transformers` is a (name, transformer, columns) triple.
>>> transformers = [('cat', cat_pipe, cat_cols),
                    ('num', num_pipe, num_cols)]
>>> ct = ColumnTransformer(transformers=transformers)
>>> X = ct.fit_transform(train)
>>> X.shape

- imbalanced-learn pipeline for SMOTE
https://bsolomon1124.github.io/oversamp/

In [None]:

from imblearn.over_sampling import SMOTE  # or: import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV,
                                     train_test_split,
                                     StratifiedKFold)

# Single seed shared by the data generator, the split, the sampler and the model.
RANDOM_STATE = 444

# Synthetic binary classification data with an 8-to-2 class imbalance.
X, y = make_classification(
    n_samples=75,
    n_features=5,
    weights=[0.8, 0.2],
    random_state=RANDOM_STATE,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=RANDOM_STATE
)

# imblearn's Pipeline is used instead of sklearn.pipeline.Pipeline because
# samplers such as SMOTE / RandomOverSampler have no .transform() method.
# (Older imbalanced-learn releases expose .fit_sample() / .sample(); newer
# ones call it .fit_resample().)
steps = [
    ('oversample', SMOTE(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)),
]
pipe = imbPipeline(steps)

skf = StratifiedKFold()
param_grid = {
    'clf__max_depth': [25, 40],
    'clf__max_features': ['sqrt', 'log2'],
}
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    return_train_score=False,
)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
# 0.9500000000000001

- Custom transformer: labelEncoder to multiple columns

  - https://blog.pursuitofzen.com/pipelines-featureunions-gridsearchcv-and-custom-transformers/
  - https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn


- there is a link with many custom transformers (not all correct though)
   https://stackoverflow.com/questions/47924363/labelencoder-in-sklearn-pandas-mapper-with-pipeline-after-cross-val-score-return

In [None]:
from collections import defaultdict
from sklearn.base import TransformerMixin #gives fit_transform method for free
from sklearn.base import BaseEstimator, TransformerMixin

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame, one ``LabelEncoder`` per column.

    Missing values are replaced by the string ``'NaN'`` before fitting and
    transforming, so they are encoded as a class of their own.

    Inherits from ``BaseEstimator`` as well as ``TransformerMixin`` so that the
    transformer supports ``get_params`` / ``set_params`` and can be cloned by
    model-selection tools.

    Attributes:
        d (defaultdict): Maps a column name to the ``LabelEncoder`` for that
            column. Rebuilt from scratch on every call to ``fit``.
    """

    def __init__(self):
        self.d = self._new_encoders()

    @staticmethod
    def _new_encoders():
        """Return an empty column-name -> LabelEncoder mapping."""
        # Imported locally: no cell of this notebook imports LabelEncoder at
        # module level, so referencing the bare name would raise NameError.
        from sklearn.preprocessing import LabelEncoder
        return defaultdict(LabelEncoder)

    def transform(self, X, **transform_params):
        """Transforms X to have columns label encoded.

        Args:
            X (pandas.DataFrame): The dataset to transform. Must be a
                DataFrame (``fillna`` and per-column names are required).
            transform_params (kwargs, optional): Additional params (ignored).

        Returns:
            A DataFrame with the same index and columns as X in which every
            column holds the integer codes of its fitted encoder.
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        transformed = X.apply(lambda x: self.d[x.name].transform(x))
        return transformed

    def fit(self, X, y=None, **fit_params):
        """Fits one label encoder per column of X.

        The encoders are stored in ``self.d`` so that the label classes are
        retained when transforming. Any encoders left over from a previous
        ``fit`` call are discarded first.

        Args:
            X (pandas.DataFrame): The dataset to fit on.
            y: Ignored; present for pipeline compatibility.
            fit_params (kwargs, optional): Additional params (ignored).

        Returns:
            self
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        self.d = self._new_encoders()
        for name, column in X.items():
            self.d[name].fit(column)
        return self


feature importance
- explaining feature importances and feature names: library eli5


https://github.com/TeamHG-Memex/eli5/blob/master/notebooks/xgboost-titanic.ipynb
