In [7]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union

In [None]:
# Columns pulled from the training CSV; any other column in the file is not loaded.
use_columns = [
    'url_raw',
    'url_clean',
    'url_domain',
    'ugly_text',
    'issue',
    'political lean',
    'title',
    'meta_description',
    'cleaned_text',
]
pld = pd.read_csv('training_data.csv', usecols=use_columns)

# Rows whose cleaned_text is present, kept under a separate name so that
# `pld` itself stays unfiltered for cells that do not need the text.
pld_text = pld.loc[pld['cleaned_text'].notna()]

In [None]:
class Url(TransformerMixin, BaseEstimator):
    """Token-count features computed from the ``url_raw`` attribute of ``X``.

    ``fit`` creates a new ``CountVectorizer`` and fits it on ``X.url_raw``;
    ``transform`` applies that fitted vectorizer to ``X.url_raw``.

    ``BaseEstimator`` is inherited (to the right of the mixin, as the
    scikit-learn developer guide recommends) so that the transformer provides
    ``get_params``/``set_params`` and can be cloned or introspected when it is
    nested inside a ``Pipeline`` or ``FeatureUnion``.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit a fresh ``CountVectorizer`` on ``X.url_raw``.

        Parameters
        ----------
        X : object exposing a ``url_raw`` attribute
            For example a DataFrame with a ``url_raw`` column.
        y : ignored
            Accepted only for compatibility with the estimator API.
        **fit_params
            Forwarded as keyword arguments to the ``CountVectorizer``
            constructor.

        Returns
        -------
        self
        """
        # The vectorizer is stored on the instance before fitting; `transform`
        # reads it back from `self.vect`.
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X.url_raw)
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's output for ``X.url_raw``.

        ``fit`` must have been called first; otherwise ``self.vect`` does not
        exist. ``transform_params`` is accepted but not used.
        """
        return self.vect.transform(X.url_raw)
    
class Domain(TransformerMixin, BaseEstimator):
    """Token-count features computed from the ``url_domain`` attribute of ``X``.

    ``fit`` creates a new ``CountVectorizer`` and fits it on ``X.url_domain``;
    ``transform`` applies that fitted vectorizer to ``X.url_domain``.

    ``BaseEstimator`` is inherited (to the right of the mixin, as the
    scikit-learn developer guide recommends) so that the transformer provides
    ``get_params``/``set_params`` and can be cloned or introspected when it is
    nested inside a ``Pipeline`` or ``FeatureUnion``.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit a fresh ``CountVectorizer`` on ``X.url_domain``.

        Parameters
        ----------
        X : object exposing a ``url_domain`` attribute
            For example a DataFrame with a ``url_domain`` column.
        y : ignored
            Accepted only for compatibility with the estimator API.
        **fit_params
            Forwarded as keyword arguments to the ``CountVectorizer``
            constructor.

        Returns
        -------
        self
        """
        # The vectorizer is stored on the instance before fitting; `transform`
        # reads it back from `self.vect`.
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X.url_domain)
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's output for ``X.url_domain``.

        ``fit`` must have been called first; otherwise ``self.vect`` does not
        exist. ``transform_params`` is accepted but not used.
        """
        return self.vect.transform(X.url_domain)
    
class GetText(TransformerMixin, BaseEstimator):
    """Stateless selector that hands ``X.cleaned_text`` to the next pipeline step.

    ``BaseEstimator`` is inherited (to the right of the mixin, as the
    scikit-learn developer guide recommends) so that the transformer provides
    ``get_params``/``set_params`` and can be cloned or introspected when it is
    nested inside a ``Pipeline`` or ``FeatureUnion``.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return the ``cleaned_text`` attribute of ``X`` unchanged.

        ``transform_params`` is accepted but not used.
        """
        return X.cleaned_text

In [None]:
# Model A: logistic regression over the union of three feature blocks:
#   - 'word':   unigram + bigram counts of cleaned_text (min_df=4)
#   - 'domain': token counts of url_domain
#   - 'url':    token counts of url_raw
modelA = Pipeline([
  ('features', FeatureUnion([
    ('word', Pipeline([
      ('gettext', GetText()),
      ('counts', CountVectorizer(ngram_range=(1,2), min_df=4))
    ])),
    ('domain', Domain()),
    ('url', Url())
  ])),
  ('logreg', LogisticRegression())
])

# Use `fit`, not `fit_transform`: Pipeline.fit_transform is only valid when the
# final step implements `transform`, and LogisticRegression is a classifier
# without one in current scikit-learn, so `fit_transform` raises AttributeError.
# Fitted on pld_text because the 'word' branch reads cleaned_text.
modelA.fit(pld_text, pld_text['political lean'])

In [None]:
# Model B: logistic regression on url_domain token counts only.
modelB = Pipeline([
  ('features', FeatureUnion([('domain', Domain())])),
  ('logreg', LogisticRegression())
])

# Use `fit`, not `fit_transform`: Pipeline.fit_transform is only valid when the
# final step implements `transform`, and LogisticRegression is a classifier
# without one in current scikit-learn, so `fit_transform` raises AttributeError.
# NOTE(review): this model is fitted on the unfiltered `pld`; confirm that
# url_domain and 'political lean' contain no nulls, since neither
# CountVectorizer nor LogisticRegression accepts NaN input.
modelB.fit(pld, pld['political lean'])