# Selecting non-numerical columns with ColumnSelector

From: 
* https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

## Create data

In [2]:
# Toy dataset: one text column ("label") and one integer column ("score").
data = dict(
    label=['dog', 'cat', 'catdog', 'dog', 'catdog'],
    score=list(range(1, 6)),
)
# Pass the column order explicitly so the frame layout is spelled out.
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,label,score
0,dog,1
1,cat,2
2,catdog,3
3,dog,4
4,catdog,5


## Define numerical and non-numerical columns

In [19]:
def get_non_numerical_columns(df):
    """Return the names of the columns of ``df`` that are not numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column labels, in their original order, whose dtype is not selected
        by ``df.select_dtypes('number')``.
    """
    # The previous filter (`if x`) only tested the truthiness of each label,
    # so numeric columns were never excluded (and falsy labels such as 0 or ''
    # were dropped by accident). Let pandas compute the complement directly.
    return list(df.select_dtypes(exclude='number').columns)

# `numerical` was displayed here without ever being assigned in the notebook,
# so a fresh Restart & Run All raised NameError. Define it explicitly as the
# names of the numeric-dtype columns; it is passed to ColumnSelector below.
numerical = list(df.select_dtypes('number').columns)
numerical

['score']

## Create custom transformer (fit and transform methods)

In [20]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen subset of columns.

    Parameters
    ----------
    columns
        Key used to index the input in ``transform`` (for example a list of
        column labels).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X`` or ``y``; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``."""
        selected = X[self.columns]
        return selected

## Create numerical pipeline

In [21]:
# Single-step pipeline whose only stage selects the columns listed in `numerical`.
num_pipeline = Pipeline(
    steps=[
        ('num_selector', ColumnSelector(numerical)),
    ]
)

## Fit pipeline

In [22]:
# Fit the pipeline on df. ColumnSelector.fit only returns self, so nothing is
# learned from the data in this step.
num_pipeline.fit(df)

Pipeline(memory=None,
         steps=[('num_selector', ColumnSelector(columns=['score']))],
         verbose=False)

## Transform pipeline

In [23]:
# Run df through the pipeline; the selector step indexes df by the configured columns.
num_pipeline.transform(df)

Unnamed: 0,score
0,1
1,2
2,3
3,4
4,5
