In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import train_test_split
from sklearn.base import TransformerMixin
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
import numpy as np

In [2]:
clf = make_pipeline(CountVectorizer(), LogisticRegressionCV(cv=2))

data = {
    'Bruce Lee': 'Male',
    'Bruce Banner': 'Male',
    'Peter Parker': 'Male',
    'Peter Poker': 'Male',
    'Peter Springsteen': 'Male',
    'Bruce Willis': 'Male',
    'Sarah McLaughlin': 'Female',
    'Sarah Silverman': 'Female',
    'Sarah Palin': 'Female',
    'Sarah Hyland': 'Female',
    'Bruce Li': 'Male',
    'Bruce Milk': 'Male',
    'Bruce Springsteen': 'Male',
    'Bruce Willis': 'Male',
    'Sally Juice': 'Female',
    'Sarah Silverwoman': 'Female',
    'Sarah Palin': 'Female',
    'Sarah Hyland': 'Female',
    'Bruce Paul': 'Male',
    'Bruce Lame': 'Male',
    'Bruce Springsteen': 'Male',
    'Bruce Willis': 'Male',
    'Sarah Willis': 'Female',
    'Sarah Goldman': 'Female',
    'Sarah Palin': 'Female',
    'Sally Hyland': 'Female',
    'Bruce McDonald': 'Male',
    'Bruce Lane': 'Male',
    'Peter Springsteen': 'Male',
    'Bruce Willis': 'Male',
    'Sarah McLaughlin': 'Female',
    'Sarah Goldwoman': 'Female',
    'Sarah Palin': 'Female',
    'Sarah Hylie': 'Female'
    }

df = pd.DataFrame.from_dict(data, orient='index').reset_index()
df.columns = ['name', 'gender']
df['randomInt'] = np.random.choice(range(1, 6), df.shape[0])
df

Unnamed: 0,name,gender,randomInt
0,Sarah Hyland,Female,2
1,Bruce Lee,Male,2
2,Sarah McLaughlin,Female,4
3,Bruce Banner,Male,3
4,Bruce Li,Male,5
5,Bruce Milk,Male,3
6,Bruce Springsteen,Male,1
7,Peter Poker,Male,3
8,Bruce Willis,Male,5
9,Sally Juice,Female,2


In [3]:
class ExtractNames(TransformerMixin):
    def transform(self, X, *args):
        return [{'first': name.split()[0],
                 'last': name.split()[-1]}
                for name in X]

    def fit(self, *args):
        return self

class ExtractRandInt(TransformerMixin):
    def transform(self, X2, *args):
        return [{'randInt': num} for num in X2]

    def fit(self, *args):
        return self

In [4]:
trans = ExtractNames()
trans2 = ExtractRandInt()
Combined = FeatureUnion([trans, trans2])
print trans
print trans2
print Combined

<__main__.ExtractNames object at 0x109f6b210>
<__main__.ExtractRandInt object at 0x109f6b1d0>
FeatureUnion(n_jobs=1,
       transformer_list=[<__main__.ExtractNames object at 0x109f6b210>, <__main__.ExtractRandInt object at 0x109f6b1d0>],
       transformer_weights=None)


In [5]:
clf = make_pipeline(Combined, DictVectorizer(), LogisticRegressionCV())
clf

Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[<__main__.ExtractNames object at 0x109f6b210>, <__main__.ExtractRandInt object at 0x109f6b1d0>],
       transformer_weights=None)), ('dictvectorizer', DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
      ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])