In [1]:
from goose import Goose
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from textblob import TextBlob, Word
from sklearn.base import TransformerMixin
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV
import datetime
%matplotlib inline

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
DATA_CSV = '0_16000.csv'
pld = pd.read_csv(DATA_CSV)

In [5]:
# Drop the leftover 'Unnamed: 0' column only if it is still present, so this
# cell can be re-run (or run on a fresh kernel) without the positional rename
# below failing on a column-count mismatch.
# NOTE(review): presumably this column is an index written by an earlier
# to_csv call -- confirm against the CSV.
if 'Unnamed: 0' in pld.columns:
    pld = pld.drop('Unnamed: 0', axis=1)

# Positional rename of the nine remaining columns.
pld.columns = [
    'url_raw',
    'url_clean',
    'url_domain',
    'ugly_text',
    'issue',
    'political_lean',
    'title',
    'meta_description',
    'cleaned_text',
]
pld.head(3)

Unnamed: 0,url_raw,url_clean,url_domain,ugly_text,issue,political_lean,title,meta_description,cleaned_text
0,https://www.washingtonpost.com/news/post-natio...,washingtonpost.com/news/post-nation/wp/2016/05...,washingtonpost.com,2 Desktop notifications are ...,abortion,Lean Left,‘A target on Roe v. Wade ’: Oklahoma bill maki...,Gov. Mary Fallin (R) has not said if she plans...,UPDATE: Gov. Fallin vetoed the bill on Friday....
1,http://www.salon.com/2016/04/07/camille_paglia...,salon.com/2016/04/07/camille_paglia_feminists_...,salon.com,\n\n\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t...,abortion,Left,"Camille Paglia: Feminists have abortion wrong,...",Reproductive rights have become ideological to...,"While the Hillary flap was merely a blip, give..."
2,http://www.vox.com/2016/3/20/11269226/texas-ab...,vox.com/2016/3/20/11269226/texas-abortion-wome...,vox.com,"\n \n \n\n(function(w,d,s,l,i){w[l]=w[l]...",abortion,Lean Left,Study: women had to drive 4 times farther afte...,Here's exactly how Texas anti-abortion laws bu...,Ever since Texas laws closed about half of the...


In [6]:
# Keep only the rows that have article text. The explicit .copy() makes
# pld_text an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
pld_text = pld[pld['cleaned_text'].notnull()].copy()

# Non-null counts for the columns used in modelling.
pld_text[['cleaned_text', 'url_domain', 'political_lean', 'issue', 'title']].count()

cleaned_text      12501
url_domain        12501
political_lean    12501
issue             12501
title             12462
dtype: int64

In [32]:
# PIPELINE CAN ALSO BE USED WITH GRIDSEARCHCV, BUT VERY SLOW
# pipe = Pipeline([
#   ('features', FeatureUnion([
#         ('counts', CountVectorizer()),
#         ('tf_idf', TfidfVectorizer())
#   ])),
#   ('classifier', MultinomialNB())
# ])

# SEARCH FOR AN OPTIMAL N_GRAM VALUE USING GRIDSEARCHCV
# gram_range = [(1, n) for n in range(1, 3)]
# param_grid = {
#     'features__counts__ngram_range': gram_range,
#     'features__tf_idf__ngram_range': gram_range,
# }

# grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
# grid.fit(df.msg, df.label)
# print grid.best_score_, grid.best_params_

In [9]:
# Baseline: token counts of the article text -> logistic regression,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([('counts', CountVectorizer())])
pipe = Pipeline([
    ('features', text_features),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text.cleaned_text, pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.581698121969
0:05:05.378078


In [10]:
# Baseline: token counts of the article text -> multinomial naive Bayes,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([('counts', CountVectorizer())])
pipe = Pipeline([
    ('features', text_features),
    ('classifier', MultinomialNB()),
])

fold_scores = cross_val_score(
    pipe, pld_text.cleaned_text, pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.463778334447
0:00:42.478829


In [204]:
# Raw counts and TF-IDF weights side by side -> logistic regression,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([
    ('counts', CountVectorizer()),
    ('tf_idf', TfidfVectorizer()),
])
pipe = Pipeline([
    ('features', text_features),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text.cleaned_text, pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.587583794081
0:08:32.436998


In [206]:
# Raw counts and TF-IDF weights side by side -> multinomial naive Bayes,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([
    ('counts', CountVectorizer()),
    ('tf_idf', TfidfVectorizer()),
])
pipe = Pipeline([
    ('features', text_features),
    ('classifier', MultinomialNB()),
])

fold_scores = cross_val_score(
    pipe, pld_text.cleaned_text, pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.496657158128
0:01:25.248623


In [34]:
class Domain(TransformerMixin):
    """Token-count encoding of the ``url_domain`` column of X.

    Holds its own ``CountVectorizer`` in ``self.vect`` so the transformer can
    be handed a whole frame and pick out the one column it needs.
    """

    def fit(self, X, y=None, **fit_params):
        """Create a new ``CountVectorizer`` and fit it on ``X['url_domain']``.

        ``y`` is ignored. Any ``fit_params`` are forwarded as keyword
        arguments to the ``CountVectorizer`` constructor. Returns ``self``.
        """
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['url_domain'])
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's encoding of ``X.url_domain``.

        ``transform_params`` are accepted but unused.
        """
        return self.vect.transform(X.url_domain)

In [35]:
class WordVect(TransformerMixin):
    """Token-count encoding of the ``cleaned_text`` column of X.

    Holds its own ``CountVectorizer`` in ``self.vect`` so the transformer can
    be handed a whole frame and pick out the one column it needs.
    """

    def fit(self, X, y=None, **fit_params):
        """Create a new ``CountVectorizer`` and fit it on ``X['cleaned_text']``.

        ``y`` is ignored. Any ``fit_params`` are forwarded as keyword
        arguments to the ``CountVectorizer`` constructor. Returns ``self``.
        """
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['cleaned_text'])
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's encoding of ``X.cleaned_text``.

        ``transform_params`` are accepted but unused.
        """
        return self.vect.transform(X.cleaned_text)

In [13]:
# Article-text counts plus source-domain counts -> logistic regression.
# The whole frame is passed in; each transformer selects its own column.
time = datetime.datetime.now()

frame_features = FeatureUnion([
    ('words', WordVect()),
    ('domain', Domain()),
])
pipe = Pipeline([
    ('features', frame_features),
    ('classifier', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

WordVectorizer:
(9999, 57965)
DomainVectorizer:
(9999, 165)
WordVectorizer:
(10000, 58137)
DomainVectorizer:
(10000, 153)
WordVectorizer:
(10000, 58492)
DomainVectorizer:
(10000, 170)
WordVectorizer:
(10002, 57278)
DomainVectorizer:
(10002, 157)
WordVectorizer:
(2502, 57965)
DomainVectorizer:
(2502, 165)
WordVectorizer:
(2499, 57278)
DomainVectorizer:
(2499, 157)
WordVectorizer:
(2501, 58137)
DomainVectorizer:
(2501, 153)
WordVectorizer:
(2501, 58492)
DomainVectorizer:
(2501, 170)
WordVectorizer:
(10003, 57929)
DomainVectorizer:
(10003, 167)
WordVectorizer:
(2498, 57929)
DomainVectorizer:
(2498, 167)
0.853034443257
0:03:12.789926


In [16]:
# Token counts with English stop words removed -> logistic regression,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([
    ('counts', CountVectorizer(stop_words='english')),
])
pipe = Pipeline([
    ('features', text_features),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text['cleaned_text'], pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.569940582322
0:03:00.713157


In [18]:
# Token counts with English stop words removed -> multinomial naive Bayes,
# scored by mean 5-fold cross-validated accuracy, with wall-clock timing.
time = datetime.datetime.now()

text_features = FeatureUnion([
    ('counts', CountVectorizer(stop_words='english')),
])
pipe = Pipeline([
    ('features', text_features),
    ('classifier', MultinomialNB()),
])

fold_scores = cross_val_score(
    pipe, pld_text['cleaned_text'], pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.461698717961
0:00:44.510973


In [19]:
# Unigram + bigram counts (terms seen in at least 3 documents) -> logistic
# regression, scored by mean 5-fold cross-validated accuracy, with timing.
time = datetime.datetime.now()

text_features = FeatureUnion([
    ('counts', CountVectorizer(ngram_range=(1, 2), min_df=3)),
])
pipe = Pipeline([
    ('features', text_features),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text['cleaned_text'], pld_text.political_lean,
    cv=5, scoring='accuracy',
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.62386386678
0:21:44.069018


In [30]:
def pipe(col, min_docs=5, cv=5):
    """Cross-validate a token-count logistic regression on one issue's articles.

    Filters the module-level ``pld_text`` frame to the rows whose ``issue``
    equals ``col``. If that subset has more than ``min_docs`` non-null
    ``cleaned_text`` values, prints a blank separator line, the issue name,
    the mean ``cv``-fold accuracy for predicting ``political_lean`` from
    ``cleaned_text``, and the elapsed wall-clock time.

    Parameters
    ----------
    col : value compared against ``pld_text.issue``
    min_docs : int, default 5
        The issue is skipped unless it has strictly more than this many texts.
    cv : int, default 5
        Number of cross-validation folds passed to ``cross_val_score``.

    Returns
    -------
    float or None
        The mean accuracy, or ``None`` when the issue was skipped.
    """
    started = datetime.datetime.now()
    issue_rows = pld_text[pld_text.issue == col]
    if issue_rows['cleaned_text'].count() <= min_docs:
        return None

    # Named `model` rather than `pipe` so the local does not shadow this function.
    model = Pipeline([
        ('features', FeatureUnion([
            ('counts', CountVectorizer())
        ])),
        ('logreg', LogisticRegression())
    ])
    # Header is printed before scoring so a failure still shows which issue broke.
    print(" ")
    print("issue: " + str(col))
    mean_accuracy = cross_val_score(
        model, issue_rows['cleaned_text'], issue_rows.political_lean,
        cv=cv, scoring='accuracy'
    ).mean()
    print(mean_accuracy)
    print(datetime.datetime.now() - started)
    return mean_accuracy

In [31]:
# Run the per-issue evaluation for every distinct issue in the data.
# A shorter hand-picked list is kept here for reference:
# ['election-2012', 'healthcare-0', 'immigration', 'economic-policy-debt-deficit', 'economy-jobs', 'gun-legislation']
top_six_cols = pld_text.issue.unique()
for issue_name in top_six_cols:
    pipe(issue_name)

 
issue: abortion
0.460374396135
0:00:01.158694
 
issue: asia
0.269761904762
0:00:00.278807
 
issue: campaign-finance
0.548534798535
0:00:00.394141
 
issue: civil-rights
0.436969657875
0:00:02.179912
 
issue: cia
0.534017094017
0:00:00.557582
 
issue: foreign-policy




0.45219279201
0:00:03.083598
 
issue: gay-rights
0.393599336178
0:00:01.971592
 
issue: us-congress
0.494039294039
0:00:01.897374
 
issue: us-house-representatives
0.410714285714
0:00:01.366856
 
issue: criminal-justice
0.488528138528
0:00:00.401921
 
issue: defense
0.413758912656
0:00:00.986979
 
issue: democrat-party
0.490361812101
0:00:00.780160
 
issue: education
0.410294473377
0:00:02.571426
 
issue: domestic-policy
0.273333333333
0:00:00.108451
 
issue: economic-policy
0.406415954416
0:00:00.957803
 
issue: elections
0.574809067416
0:00:03.228309
 
issue: election-2012
0.591405090095
0:00:47.487605
 
issue: economy-jobs
0.560315790971
0:00:03.873658
 
issue: economic-policy-debt-deficit
0.572712600216
0:00:05.733343
 
issue: environment
0.445716345659
0:00:02.601595
 
issue: free-speech
0.406666666667
0:00:00.391962
 
issue: gun-legislation
0.469886876366
0:00:04.977387
 
issue: immigration
0.550232077636
0:00:05.082927
 
issue: energy
0.386260683761
0:00:00.553614
 
issue: justi



0.602989695589
0:00:07.744309
 
issue: israel
0.531944444444
0:00:00.497304
 
issue: labor
0.483888888889
0:00:00.221904
 
issue: media-watchmedia-bias
0.55697252733
0:00:03.642402
 
issue: medicare
0.508441558442
0:00:00.373096
 
issue: middle-east
0.452151990145
0:00:01.685291


In [53]:
class GetText(TransformerMixin):
    """Stateless selector that hands the ``cleaned_text`` column to the next step."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X.cleaned_text`` as-is; ``transform_params`` are unused."""
        return X.cleaned_text

In [62]:
# Article-text counts plus source-domain counts -> multinomial naive Bayes.
# The whole frame is passed in; each branch selects its own column.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer()),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('classifier', MultinomialNB()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.604970517794
0:00:38.221002


In [61]:
# Article-text counts plus source-domain counts -> logistic regression.
# The whole frame is passed in; each branch selects its own column.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer()),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.853034443257
0:03:16.615209


In [64]:
# Text counts restricted to terms seen in at least 4 documents, plus
# source-domain counts -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.853834539577
0:03:34.059663


In [57]:
# Unigram + bigram text counts (min_df=2) plus source-domain counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(ngram_range=(1, 2), min_df=2)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.825038082707
0:15:43.350393


In [58]:
# Unigram + bigram text counts (min_df=4) plus source-domain counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(ngram_range=(1, 2), min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.829117092128
0:11:23.004279


In [59]:
# Unigram + bigram text counts (min_df=3) plus source-domain counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(ngram_range=(1, 2), min_df=3)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.827037699142
0:14:29.761025


In [65]:
# Unigram-to-trigram text counts (min_df=4) plus source-domain counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(ngram_range=(1, 3), min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.819597408492
0:24:33.104491


In [66]:
class Issue(TransformerMixin):
    """Token-count encoding of the ``issue`` column of X.

    Holds its own ``CountVectorizer`` in ``self.vect`` so the transformer can
    be handed a whole frame and pick out the one column it needs.
    """

    def fit(self, X, y=None, **fit_params):
        """Create a new ``CountVectorizer`` and fit it on ``X['issue']``.

        ``y`` is ignored. Any ``fit_params`` are forwarded as keyword
        arguments to the ``CountVectorizer`` constructor. Returns ``self``.
        """
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['issue'])
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's encoding of ``X.issue``.

        ``transform_params`` are accepted but unused.
        """
        return self.vect.transform(X.issue)

In [72]:
# Text counts (min_df=4, vocabulary capped at 25000 terms) plus
# source-domain counts -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(min_df=4, max_features=25000)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.853674571513
0:03:12.073189


In [73]:
# Text counts (min_df=4) plus source-domain counts plus issue counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
        ('issue', Issue()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.852394251154
0:02:20.039432


In [113]:
class Url(TransformerMixin):
    """Token-count encoding of the ``url_raw`` column of X.

    Holds its own ``CountVectorizer`` in ``self.vect`` so the transformer can
    be handed a whole frame and pick out the one column it needs.
    """

    def fit(self, X, y=None, **fit_params):
        """Create a new ``CountVectorizer`` and fit it on ``X['url_raw']``.

        ``y`` is ignored. Any ``fit_params`` are forwarded as keyword
        arguments to the ``CountVectorizer`` constructor. Returns ``self``.
        """
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['url_raw'])
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's encoding of ``X.url_raw``.

        ``transform_params`` are accepted but unused.
        """
        return self.vect.transform(X.url_raw)

In [81]:
# Text counts (min_df=4) plus source-domain counts plus raw-URL token counts
# -> logistic regression.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
        ('url', Url()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

0.900953915603
0:03:05.730800


In [80]:
# Character length of each article's text. `assign` builds a new frame instead
# of writing a column into what may be a slice of `pld`, so this no longer
# triggers SettingWithCopyWarning; `len` is applied directly rather than
# through a lambda wrapper.
pld_text = pld_text.assign(
    cleaned_text_length=pld_text['cleaned_text'].apply(len)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [105]:
# Largest value in the cleaned_text_length column.
pld_text['cleaned_text_length'].max()

85488

In [122]:
class WordCount(TransformerMixin):
    """Stateless selector that returns the ``cleaned_text_length`` column of X.

    NOTE(review): the column is returned as-is, with no reshaping; confirm
    that downstream steps accept that shape before putting this transformer
    in a FeatureUnion.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X.cleaned_text_length``; ``transform_params`` are unused."""
        return X.cleaned_text_length

In [124]:
class Sentiment(TransformerMixin):
    """Token-count encoding of the ``sentiment`` column of X.

    Holds its own ``CountVectorizer`` in ``self.vect``.

    NOTE(review): no cell visible in this notebook creates a ``sentiment``
    column -- confirm it exists on the frame before fitting.
    """

    def fit(self, X, y=None, **fit_params):
        """Create a new ``CountVectorizer`` and fit it on ``X['sentiment']``.

        ``y`` is ignored. Any ``fit_params`` are forwarded as keyword
        arguments to the ``CountVectorizer`` constructor. Returns ``self``.
        """
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['sentiment'])
        return self

    def transform(self, X, **transform_params):
        """Return the fitted vectorizer's encoding of ``X.sentiment``.

        ``transform_params`` are accepted but unused.
        """
        return self.vect.transform(X.sentiment)

In [123]:
# Text counts (min_df=4) plus source-domain, raw-URL and sentiment branches
# -> logistic regression.
# NOTE(review): the recorded output of this cell is a JoblibValueError; the
# Sentiment branch reads a `sentiment` column -- confirm it exists first.
time = datetime.datetime.now()

word_branch = Pipeline([
    ('gettext', GetText()),
    ('counts', CountVectorizer(min_df=4)),
])
pipe = Pipeline([
    ('features', FeatureUnion([
        ('word', word_branch),
        ('domain', Domain()),
        ('url', Url()),
        ('sentiment', Sentiment()),
    ])),
    ('logreg', LogisticRegression()),
])

fold_scores = cross_val_score(
    pipe, pld_text, pld_text['political_lean'],
    cv=5, scoring='accuracy', n_jobs=-1,
)
print(fold_scores.mean())

print(datetime.datetime.now() - time)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    169     pkg_name = mod_name.rpartition('.')[0]
    170     main_globals = sys.modules["__main__"].__dict__
    171     if alter_argv:
    172         sys.argv[0] = fname
    173     return _run_code(code, main_globals, None,
--> 174                      "__main__", fname, loader, pkg_name)
        fname = '/Users/stanleystevensWhistle/miniconda2/envs/sta...lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    175 
    176 def run_module(mod_name, init_globals=None,
    177                run_name=None, alter_sys=False):
    178     """Execute a module's code without importing it

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x1020d9030, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/stanleystevensWhistle/miniconda2/envs/sta...lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/stanl...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/Users/stanleystevensWhistle/miniconda2/envs/sta...lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x1020d9030, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/stanleystevensWhistle/miniconda2/envs/sta...lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/stanl...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    648 
    649         If a global instance already exists, this reinitializes and starts it
    650         """
    651         app = cls.instance(**kwargs)
    652         app.initialize(argv)
--> 653         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    654 
    655 #-----------------------------------------------------------------------------
    656 # utility functions, for convenience
    657 #-----------------------------------------------------------------------------

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'time = datetime.datetime.now()\n\npipe = Pipeline(...=-1).mean()\n\nprint datetime.datetime.now() - time', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-11-30T17:04:58.511020', 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'session': '5CAEA0B28E1A449DA1D15902359839D5', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['5CAEA0B28E1A449DA1D15902359839D5']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'time = datetime.datetime.now()\n\npipe = Pipeline(...=-1).mean()\n\nprint datetime.datetime.now() - time', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-11-30T17:04:58.511020', 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'session': '5CAEA0B28E1A449DA1D15902359839D5', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['5CAEA0B28E1A449DA1D15902359839D5'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'time = datetime.datetime.now()\n\npipe = Pipeline(...=-1).mean()\n\nprint datetime.datetime.now() - time', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-11-30T17:04:58.511020', 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'session': '5CAEA0B28E1A449DA1D15902359839D5', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '7CB4AA7023F34B52ACA631F9812C0F01', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u"time = datetime.datetime.now()\n\npipe = Pipel...).mean()\n\nprint datetime.datetime.now() - time", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u"time = datetime.datetime.now()\n\npipe = Pipel...).mean()\n\nprint datetime.datetime.now() - time"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=(u"time = datetime.datetime.now()\n\npipe = Pipel...).mean()\n\nprint datetime.datetime.now() - time",), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = (u"time = datetime.datetime.now()\n\npipe = Pipel...).mean()\n\nprint datetime.datetime.now() - time",)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u"time = datetime.datetime.now()\n\npipe = Pipel...).mean()\n\nprint datetime.datetime.now() - time", store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Print object>, <_ast.Print object>], cell_name='<ipython-input-123-9484b6344e1d>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<ExecutionResult object at 1382f9d50, execution_..._before_exec=None error_in_exec=None result=None>)
   2816 
   2817         try:
   2818             for i, node in enumerate(to_run_exec):
   2819                 mod = ast.Module([node])
   2820                 code = compiler(mod, cell_name, "exec")
-> 2821                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1383277b0, file "<ipython-input-123-9484b6344e1d>", line 16>
        result = <ExecutionResult object at 1382f9d50, execution_..._before_exec=None error_in_exec=None result=None>
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1383277b0, file "<ipython-input-123-9484b6344e1d>", line 16>, result=<ExecutionResult object at 1382f9d50, execution_..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1383277b0, file "<ipython-input-123-9484b6344e1d>", line 16>
        self.user_global_ns = {'BeautifulSoup': <class 'bs4.BeautifulSoup'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Domain': <class '__main__.Domain'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'GetText': <class '__main__.GetText'>, 'Goose': <class 'goose.Goose'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', u"from goose import Goose\nimport pandas as pd\n...etime\nget_ipython().magic(u'matplotlib inline')", u"pld = pd.read_csv('0_16000.csv')", u"# del pld['Unnamed: 0']\n# pld.head()\n# cols ...'meta_description', 'cleaned_text']\npld.head(3)", u"del pld['Unnamed: 0']\n# pld.head()\n# cols = ...'meta_description', 'cleaned_text']\npld.head(3)", u"# del pld['Unnamed: 0']\npld.columns = ['url_r...'meta_description', 'cleaned_text']\npld.head(3)", u"pld_text = pld[pld['cleaned_text'].notnull()]\...n', 'political_lean', 'issue', 'title']].count()", u"top_six_cols = pld_text.columns  #['election-2... in top_six_cols:\n#     pipe(col)\ntop_six_cols", u"top_six_cols = pld_text.columns  #['election-2...p_six_cols:\n#     pipe(col)\npld_text['issues']", u"top_six_cols = pld_text.columns  #['election-2...op_six_cols:\n#     pipe(col)\npld_text['issue']", u"top_six_cols = pld_text.columns  #['election-2...   pipe(col)\npld_text['issue'].groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...ols:\n#     pipe(col)\npld_text.groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...:\n#     pipe(col)\npld_text.groupby('issue')[0]", u"top_six_cols = pld_text.columns  #['election-2...ols:\n#     pipe(col)\npld_text.groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", u"top_six_cols = pld_text.columns  #['election-2... pipe(col)\npld_text.groupby('issue').count()[0]", u"top_six_cols = pld_text.columns  #['election-2... 
pipe(col)\npld_text.groupby('issue').count()[1]", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", u"top_six_cols = pld_text.columns  #['election-2...col)\npld_text.groupby('issue').count()['issue']", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", ...], 'Issue': <class '__main__.Issue'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, ...}
        self.user_ns = {'BeautifulSoup': <class 'bs4.BeautifulSoup'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Domain': <class '__main__.Domain'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'GetText': <class '__main__.GetText'>, 'Goose': <class 'goose.Goose'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', u"from goose import Goose\nimport pandas as pd\n...etime\nget_ipython().magic(u'matplotlib inline')", u"pld = pd.read_csv('0_16000.csv')", u"# del pld['Unnamed: 0']\n# pld.head()\n# cols ...'meta_description', 'cleaned_text']\npld.head(3)", u"del pld['Unnamed: 0']\n# pld.head()\n# cols = ...'meta_description', 'cleaned_text']\npld.head(3)", u"# del pld['Unnamed: 0']\npld.columns = ['url_r...'meta_description', 'cleaned_text']\npld.head(3)", u"pld_text = pld[pld['cleaned_text'].notnull()]\...n', 'political_lean', 'issue', 'title']].count()", u"top_six_cols = pld_text.columns  #['election-2... in top_six_cols:\n#     pipe(col)\ntop_six_cols", u"top_six_cols = pld_text.columns  #['election-2...p_six_cols:\n#     pipe(col)\npld_text['issues']", u"top_six_cols = pld_text.columns  #['election-2...op_six_cols:\n#     pipe(col)\npld_text['issue']", u"top_six_cols = pld_text.columns  #['election-2...   pipe(col)\npld_text['issue'].groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...ols:\n#     pipe(col)\npld_text.groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...:\n#     pipe(col)\npld_text.groupby('issue')[0]", u"top_six_cols = pld_text.columns  #['election-2...ols:\n#     pipe(col)\npld_text.groupby('issue')", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", u"top_six_cols = pld_text.columns  #['election-2... pipe(col)\npld_text.groupby('issue').count()[0]", u"top_six_cols = pld_text.columns  #['election-2... 
pipe(col)\npld_text.groupby('issue').count()[1]", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", u"top_six_cols = pld_text.columns  #['election-2...col)\npld_text.groupby('issue').count()['issue']", u"top_six_cols = pld_text.columns  #['election-2...    pipe(col)\npld_text.groupby('issue').count()", ...], 'Issue': <class '__main__.Issue'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
/Users/stanleystevensWhistle/Develop/data_science/sfdat28-stevens/projects/political_lean/<ipython-input-123-9484b6344e1d> in <module>()
     11     ('wordcount', WordCount())
     12   ])),
     13   ('logreg', LogisticRegression())
     14 ])
     15 
---> 16 print cross_val_score(pipe, pld_text, pld_text['political_lean'], cv=5, scoring='accuracy', n_jobs=-1).mean()
     17 
     18 print datetime.datetime.now() - time
     19 
     20 

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/cross_validation.py in cross_val_score(estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]), X=                                                ...                 711  

[12501 rows x 10 columns], y=0        Lean Left
1             Left
2        L...       Center
Name: political_lean, dtype: object, scoring='accuracy', cv=sklearn.cross_validation.StratifiedKFold(labels=...r'], n_folds=5, shuffle=False, random_state=None), n_jobs=-1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')
   1428     parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
   1429                         pre_dispatch=pre_dispatch)
   1430     scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
   1431                                               train, test, verbose, None,
   1432                                               fit_params)
-> 1433                       for train, test in cv)
        cv = sklearn.cross_validation.StratifiedKFold(labels=...r'], n_folds=5, shuffle=False, random_state=None)
   1434     return np.array(scores)[:, 0]
   1435 
   1436 
   1437 class FitFailedWarning(RuntimeWarning):

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    805             if pre_dispatch == "all" or n_jobs == 1:
    806                 # The iterable was consumed all at once by the above for loop.
    807                 # No need to wait for async callbacks to trigger to
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time
    813             self._print('Done %3i out of %3i | elapsed: %s finished',
    814                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Wed Nov 30 17:05:16 2016
PID: 12366Python 2.7.12: /Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/bin/python
...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]),                                                 ...                 711  

[12501 rows x 10 columns], 0        Lean Left
1             Left
2        L...       Center
Name: political_lean, dtype: object, make_scorer(accuracy_score), array([ 1259,  1260,  1597, ..., 12498, 12499, 12500]), array([   0,    1,    2, ..., 2570, 2579, 2821]), 0, None, None)
        kwargs = {}
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]),                                                 ...                 711  

[12501 rows x 10 columns], 0        Lean Left
1             Left
2        L...       Center
Name: political_lean, dtype: object, make_scorer(accuracy_score), array([ 1259,  1260,  1597, ..., 12498, 12499, 12500]), array([   0,    1,    2, ..., 2570, 2579, 2821]), 0, None, None), {})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]), X=                                                ...                 711  

[12501 rows x 10 columns], y=0        Lean Left
1             Left
2        L...       Center
Name: political_lean, dtype: object, scorer=make_scorer(accuracy_score), train=array([ 1259,  1260,  1597, ..., 12498, 12499, 12500]), test=array([   0,    1,    2, ..., 2570, 2579, 2821]), verbose=0, parameters=None, fit_params={}, return_train_score=False, return_parameters=False, error_score='raise')
   1526 
   1527     try:
   1528         if y_train is None:
   1529             estimator.fit(X_train, **fit_params)
   1530         else:
-> 1531             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(steps=[('....0001,
          verbose=0, warm_start=False))])>
        X_train =                                                 ...                  711  

[9999 rows x 10 columns]
        y_train = 1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object
        fit_params = {}
   1532 
   1533     except Exception as e:
   1534         if error_score == 'raise':
   1535             raise

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]), X=                                                ...                  711  

[9999 rows x 10 columns], y=1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object, **fit_params={})
    159             pipeline.
    160         y : iterable, default=None
    161             Training targets. Must fulfill label requirements for all steps of
    162             the pipeline.
    163         """
--> 164         Xt, fit_params = self._pre_transform(X, y, **fit_params)
        Xt = undefined
        fit_params = {}
        self._pre_transform = <bound method Pipeline._pre_transform of Pipelin....0001,
          verbose=0, warm_start=False))])>
        X =                                                 ...                  711  

[9999 rows x 10 columns]
        y = 1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object
    165         self.steps[-1][-1].fit(Xt, y, **fit_params)
    166         return self
    167 
    168     def fit_transform(self, X, y=None, **fit_params):

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/pipeline.py in _pre_transform(self=Pipeline(steps=[('features', FeatureUnion(n_jobs...0.0001,
          verbose=0, warm_start=False))]), X=                                                ...                  711  

[9999 rows x 10 columns], y=1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object, **fit_params={})
    140             step, param = pname.split('__', 1)
    141             fit_params_steps[step][param] = pval
    142         Xt = X
    143         for name, transform in self.steps[:-1]:
    144             if hasattr(transform, "fit_transform"):
--> 145                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        Xt =                                                 ...                  711  

[9999 rows x 10 columns]
        transform.fit_transform = <bound method FeatureUnion.fit_transform of Feat...0x1382539d0>)],
       transformer_weights=None)>
        y = 1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object
        fit_params_steps = {'features': {}, 'logreg': {}}
        name = 'features'
    146             else:
    147                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    148                               .transform(Xt)
    149         return Xt, fit_params_steps[self.steps[-1][0]]

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/sklearn/pipeline.py in fit_transform(self=FeatureUnion(n_jobs=1,
       transformer_list=[... 0x1382539d0>)],
       transformer_weights=None), X=                                                ...                  711  

[9999 rows x 10 columns], y=1686          Mixed
1687          Mixed
2153    ...       Center
Name: political_lean, dtype: object, **fit_params={})
    497             for name, trans in self.transformer_list)
    498 
    499         Xs, transformers = zip(*result)
    500         self._update_transformer_list(transformers)
    501         if any(sparse.issparse(f) for f in Xs):
--> 502             Xs = sparse.hstack(Xs).tocsr()
        Xs = (<9999x26818 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, <9999x165 sparse matrix of type '<type 'numpy.in... stored elements in Compressed Sparse Row format>, <9999x16532 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, 1686      2940
1687      1938
2153       248
291...      711
Name: cleaned_text_length, dtype: int64)
        Xs.tocsr = undefined
    503         else:
    504             Xs = np.hstack(Xs)
    505         return Xs
    506 

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/scipy/sparse/construct.py in hstack(blocks=(<9999x26818 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, <9999x165 sparse matrix of type '<type 'numpy.in... stored elements in Compressed Sparse Row format>, <9999x16532 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, 1686      2940
1687      1938
2153       248
291...      711
Name: cleaned_text_length, dtype: int64), format=None, dtype=None)
    459     >>> hstack([A,B]).toarray()
    460     array([[1, 2, 5],
    461            [3, 4, 6]])
    462 
    463     """
--> 464     return bmat([blocks], format=format, dtype=dtype)
        blocks = (<9999x26818 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, <9999x165 sparse matrix of type '<type 'numpy.in... stored elements in Compressed Sparse Row format>, <9999x16532 sparse matrix of type '<type 'numpy.... stored elements in Compressed Sparse Row format>, 1686      2940
1687      1938
2153       248
291...      711
Name: cleaned_text_length, dtype: int64)
        format = None
        dtype = None
    465 
    466 
    467 def vstack(blocks, format=None, dtype=None):
    468     """

...........................................................................
/Users/stanleystevensWhistle/miniconda2/envs/stanleyyork/lib/python2.7/site-packages/scipy/sparse/construct.py in bmat(blocks=array([[ <9999x26818 sparse matrix of type '<typ...d elements in COOrdinate format>]], dtype=object), format=None, dtype=None)
    576 
    577                 if brow_lengths[i] == 0:
    578                     brow_lengths[i] = A.shape[0]
    579                 elif brow_lengths[i] != A.shape[0]:
    580                     raise ValueError('blocks[%d,:] has incompatible '
--> 581                                      'row dimensions' % i)
        i = 0
    582 
    583                 if bcol_lengths[j] == 0:
    584                     bcol_lengths[j] = A.shape[1]
    585                 elif bcol_lengths[j] != A.shape[1]:

ValueError: blocks[0,:] has incompatible row dimensions
___________________________________________________________________________