In [44]:
from goose import Goose
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from textblob import TextBlob, Word
from sklearn.base import TransformerMixin
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV
import datetime
%matplotlib inline

In [4]:
pld = pd.read_csv('0_12700.csv')

In [30]:
# del pld[0]
# cols = pld.columns.tolist()
# cols = cols[-1:] + cols[:-1]
# cols
# pld = pld[cols]
pld.columns = ['cleaned_text', 'url_raw', 'url_clean', 'url_domain', 'ugly_text', 'issue', 'political_lean', 'title', 'meta_description']
pld.head(3)

Unnamed: 0,cleaned_text,url_raw,url_clean,url_domain,ugly_text,issue,political_lean,title,meta_description
0,UPDATE: Gov. Fallin vetoed the bill on Friday....,https://www.washingtonpost.com/news/post-natio...,washingtonpost.com/news/post-nation/wp/2016/05...,washingtonpost.com,2 Desktop notifications are ...,abortion,Lean Left,‘A target on Roe v. Wade ’: Oklahoma bill maki...,Gov. Mary Fallin (R) has not said if she plans...
1,"While the Hillary flap was merely a blip, give...",http://www.salon.com/2016/04/07/camille_paglia...,salon.com/2016/04/07/camille_paglia_feminists_...,salon.com,\n\n\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t...,abortion,Left,"Camille Paglia: Feminists have abortion wrong,...",Reproductive rights have become ideological to...
2,Ever since Texas laws closed about half of the...,http://www.vox.com/2016/3/20/11269226/texas-ab...,vox.com/2016/3/20/11269226/texas-abortion-wome...,vox.com,"\n \n \n\n(function(w,d,s,l,i){w[l]=w[l]...",abortion,Lean Left,Study: women had to drive 4 times farther afte...,Here's exactly how Texas anti-abortion laws bu...


In [119]:
pld_text = pld[pld['cleaned_text'].notnull()]
pld_text[['cleaned_text', 'url_domain', 'political_lean']].count()

cleaned_text      9830
url_domain        9830
political_lean    9830
dtype: int64

In [32]:
# PIPELINE CAN ALSO BE USED WITH GRIDSEARCHCV, BUT VERY SLOW
# pipe = Pipeline([
#   ('features', FeatureUnion([
#         ('counts', CountVectorizer()),
#         ('tf_idf', TfidfVectorizer())
#   ])),
#   ('classifier', MultinomialNB())
# ])

# SEARCH FOR AN OPTIMAL N_GRAM VALUE USING GRADSEARCHCV
# gram_range = [(1, n) for n in range(1, 3)]
# param_grid = {
#     'features__counts__ngram_range': gram_range,
#     'features__tf_idf__ngram_range': gram_range,
# }

# grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
# grid.fit(df.msg, df.label)
# print grid.best_score_, grid.best_params_

In [33]:
pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer())
  ])),
  ('logreg', LogisticRegression())
])

cross_val_score(pipe, pld_text.cleaned_text, pld_text.political_lean, cv=5, scoring='accuracy').mean()

0.58514052559347773

In [34]:
pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer())
  ])),
  ('classifier', MultinomialNB())
])

cross_val_score(pipe, pld_text.cleaned_text, pld_text.political_lean, cv=5, scoring='accuracy').mean()

0.49756282608223162

In [204]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer())
  ])),
  ('logreg', LogisticRegression())
])

print cross_val_score(pipe, pld_text.cleaned_text, pld_text.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.587583794081
0:08:32.436998


In [206]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer())
  ])),
  ('classifier', MultinomialNB())
])

print cross_val_score(pipe, pld_text.cleaned_text, pld_text.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.496657158128
0:01:25.248623


In [192]:
class Domain(TransformerMixin):

    def transform(self, X, **transform_params):
        domain_vect = CountVectorizer(max_features=60)
        domains = pd.DataFrame(domain_vect.fit_transform(X.url_domain).toarray(), columns=domain_vect.get_feature_names())
        print "DomainVectorizer:"
        print domains.shape
        return domains

    def fit(self, X, y=None, **fit_params):
        return self

In [201]:
class WordVect(TransformerMixin):

    def transform(self, X, **transform_params):
        word_vect = CountVectorizer(max_features=5000)
        words = pd.DataFrame(word_vect.fit_transform(X.cleaned_text).toarray(), columns=word_vect.get_feature_names())
        print "WordVectorizer:"
        print words.shape
        return words

    def fit(self, X, y=None, **fit_params):
        return self

In [202]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('words', WordVect()),
        ('domain', Domain())
  ])),
  ('classifier', MultinomialNB())
])

print cross_val_score(pipe, pld_text, pld_text['political_lean'], cv=5, scoring='accuracy', n_jobs=-1).mean()

print datetime.datetime.now() - time

WordVectorizer:
(7862, 5000)
DomainVectorizer:
(7862, 60)
WordVectorizer:
(7863, 5000)
DomainVectorizer:
(7863, 60)
WordVectorizer:
(1968, 5000)
DomainVectorizer:
(1968, 60)
WordVectorizer:
(7864, 5000)
DomainVectorizer:
(7864, 60)
WordVectorizer:
(1967, 5000)
DomainVectorizer:
(1967, 60)
WordVectorizer:
(7864, 5000)
DomainVectorizer:
(7864, 60)
WordVectorizer:
(1966, 5000)
DomainVectorizer:
(1966, 60)
WordVectorizer:
(1966, 5000)
DomainVectorizer:
(1966, 60)
WordVectorizer:
(7867, 5000)
DomainVectorizer:
(7867, 60)
WordVectorizer:
(1963, 5000)
DomainVectorizer:
(1963, 60)
0.0245073801886
0:00:31.749927


In [215]:
pld_text['combined_feature'] = pld_text['cleaned_text'] + pld_text['url_domain']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [216]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer())
  ])),
  ('logreg', LogisticRegression())
])

print cross_val_score(pipe, pld_text['combined_feature'], pld_text.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.810270758743
0:03:02.051172


In [234]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer())
  ])),
  ('logreg', LogisticRegression())
])

print cross_val_score(pipe, pld_text['url_domain'], pld_text.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.973550495457
0:00:00.725872


In [235]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer())
  ])),
  ('logreg', LogisticRegression())
])

print cross_val_score(pipe, pld['url_domain'], pld.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.977842377062
0:00:01.987797


In [236]:
time = datetime.datetime.now()

pipe = Pipeline([
  ('features', FeatureUnion([
        ('counts', CountVectorizer(ngram_range=(1,3)))
  ])),
  ('logreg', LogisticRegression())
])

print cross_val_score(pipe, pld_text['combined_feature'], pld_text.political_lean, cv=5, scoring='accuracy').mean()

print datetime.datetime.now() - time

0.833877222986
1:08:38.473914
