In [1]:
from goose import Goose
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from textblob import TextBlob, Word
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV
import datetime
%matplotlib inline

In [2]:
# Load the scraped-article dataset; the path is relative to the notebook's working directory.
pld = pd.read_csv(filepath_or_buffer='0_16000.csv')

In [7]:
# NOTE(review): the commented-out lines below look like one-off clean-up steps
# (drop the 'Unnamed: 0' column, move the last column to the front) that were
# run in an earlier session and then disabled. The rename further down assigns
# nine names purely by position, so it presumably relies on that clean-up
# having already happened -- TODO confirm this cell still works (and labels the
# right columns) after Restart Kernel -> Run All.
# del pld['Unnamed: 0']
# pld.head()
# cols = pld.columns.tolist()
# cols = cols[-1:] + cols[:-1]
# cols
# pld = pld[cols]
# Positional rename: the list must contain exactly one name per existing column.
pld.columns = ['url_raw', 'url_clean', 'url_domain', 'ugly_text', 'issue', 'political_lean', 'title', 'meta_description', 'cleaned_text']
# Show the first three rows as the cell output to eyeball the new column names.
pld.head(3)

Unnamed: 0,url_raw,url_clean,url_domain,ugly_text,issue,political_lean,title,meta_description,cleaned_text
0,https://www.washingtonpost.com/news/post-natio...,washingtonpost.com/news/post-nation/wp/2016/05...,washingtonpost.com,2 Desktop notifications are ...,abortion,Lean Left,‘A target on Roe v. Wade ’: Oklahoma bill maki...,Gov. Mary Fallin (R) has not said if she plans...,UPDATE: Gov. Fallin vetoed the bill on Friday....
1,http://www.salon.com/2016/04/07/camille_paglia...,salon.com/2016/04/07/camille_paglia_feminists_...,salon.com,\n\n\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t...,abortion,Left,"Camille Paglia: Feminists have abortion wrong,...",Reproductive rights have become ideological to...,"While the Hillary flap was merely a blip, give..."
2,http://www.vox.com/2016/3/20/11269226/texas-ab...,vox.com/2016/3/20/11269226/texas-abortion-wome...,vox.com,"\n \n \n\n(function(w,d,s,l,i){w[l]=w[l]...",abortion,Lean Left,Study: women had to drive 4 times farther afte...,Here's exactly how Texas anti-abortion laws bu...,Ever since Texas laws closed about half of the...


In [8]:
# Keep only the rows that have extracted article text; the modelling cells read `cleaned_text`.
pld_text = pld.loc[pld['cleaned_text'].notnull()]
# Non-null count per column of interest (displayed as the cell output).
pld_text.loc[:, ['cleaned_text', 'url_domain', 'political_lean', 'issue', 'title']].count()

cleaned_text      12501
url_domain        12501
political_lean    12501
issue             12501
title             12462
dtype: int64

In [32]:
# PIPELINE CAN ALSO BE USED WITH GRIDSEARCHCV, BUT VERY SLOW
# pipe = Pipeline([
#   ('features', FeatureUnion([
#         ('counts', CountVectorizer()),
#         ('tf_idf', TfidfVectorizer())
#   ])),
#   ('classifier', MultinomialNB())
# ])

# SEARCH FOR AN OPTIMAL N_GRAM VALUE USING GRIDSEARCHCV
# gram_range = [(1, n) for n in range(1, 3)]
# param_grid = {
#     'features__counts__ngram_range': gram_range,
#     'features__tf_idf__ngram_range': gram_range,
# }

# grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
# grid.fit(df.msg, df.label)
# print grid.best_score_, grid.best_params_

In [9]:
# Baseline experiment: raw token counts -> logistic regression,
# scored as mean accuracy over 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # A FeatureUnion with a single member simply passes that member's matrix through.
    ('features', FeatureUnion([
        ('counts', CountVectorizer()),
    ])),
    ('logreg', LogisticRegression()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.581698121969
0:05:05.378078


In [10]:
# Same count features as the logistic-regression baseline, but classified
# with multinomial naive Bayes; scored as mean 5-fold CV accuracy.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # A FeatureUnion with a single member simply passes that member's matrix through.
    ('features', FeatureUnion([
        ('counts', CountVectorizer()),
    ])),
    ('classifier', MultinomialNB()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.463778334447
0:00:42.478829


In [204]:
# Raw counts and TF-IDF weights concatenated side by side -> logistic regression,
# scored as mean accuracy over 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # FeatureUnion stacks the two vectorizers' outputs column-wise.
    ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer()),
    ])),
    ('logreg', LogisticRegression()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.587583794081
0:08:32.436998


In [206]:
# Raw counts and TF-IDF weights concatenated side by side -> multinomial naive Bayes,
# scored as mean accuracy over 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # FeatureUnion stacks the two vectorizers' outputs column-wise.
    ('features', FeatureUnion([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfVectorizer()),
    ])),
    ('classifier', MultinomialNB()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.496657158128
0:01:25.248623


In [11]:
class Domain(TransformerMixin, BaseEstimator):
    """Token-count features built from the ``url_domain`` column.

    Wraps a ``CountVectorizer`` so that the transformer can be placed in a
    ``FeatureUnion`` that receives a whole table (e.g. a pandas DataFrame)
    instead of a single text column.

    ``BaseEstimator`` is inherited to provide ``get_params``/``set_params``,
    which scikit-learn uses when cloning estimators and running parameter
    searches.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn the vocabulary of ``X['url_domain']``.

        Parameters
        ----------
        X : table indexable by ``'url_domain'`` (e.g. a pandas DataFrame).
        y : ignored; accepted for scikit-learn API compatibility.
        **fit_params : forwarded unchanged to the ``CountVectorizer``
            constructor.

        Returns
        -------
        self
        """
        # A fresh vectorizer is created on every fit, so refitting discards
        # any previously learned vocabulary.
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['url_domain'])
        return self

    def transform(self, X, **transform_params):
        """Vectorize ``X['url_domain']`` with the vocabulary learned in ``fit``.

        ``transform_params`` is accepted but not used. Calling this before
        ``fit`` raises ``AttributeError`` because ``self.vect`` is only
        created there.
        """
        return self.vect.transform(X['url_domain'])

In [12]:
class WordVect(TransformerMixin, BaseEstimator):
    """Token-count features built from the ``cleaned_text`` column.

    Wraps a ``CountVectorizer`` so that the transformer can be placed in a
    ``FeatureUnion`` that receives a whole table (e.g. a pandas DataFrame)
    instead of a single text column.

    ``BaseEstimator`` is inherited to provide ``get_params``/``set_params``,
    which scikit-learn uses when cloning estimators and running parameter
    searches.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn the vocabulary of ``X['cleaned_text']``.

        Parameters
        ----------
        X : table indexable by ``'cleaned_text'`` (e.g. a pandas DataFrame).
        y : ignored; accepted for scikit-learn API compatibility.
        **fit_params : forwarded unchanged to the ``CountVectorizer``
            constructor.

        Returns
        -------
        self
        """
        # A fresh vectorizer is created on every fit, so refitting discards
        # any previously learned vocabulary.
        self.vect = CountVectorizer(**fit_params)
        self.vect.fit(X['cleaned_text'])
        return self

    def transform(self, X, **transform_params):
        """Vectorize ``X['cleaned_text']`` with the vocabulary learned in ``fit``.

        ``transform_params`` is accepted but not used. Calling this before
        ``fit`` raises ``AttributeError`` because ``self.vect`` is only
        created there.
        """
        return self.vect.transform(X['cleaned_text'])

In [13]:
# Article-text counts plus source-domain counts -> logistic regression,
# scored as mean accuracy over 5-fold cross-validation.
# NOTE(review): adding `url_domain` lifts accuracy far above the text-only
# runs; if `political_lean` is assigned per news source, the domain is close
# to a copy of the label and this score reflects leakage -- TODO confirm
# before reporting it as model quality.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    ('features', FeatureUnion([
        ('words', WordVect()),
        ('domain', Domain()),
    ])),
    ('classifier', LogisticRegression()),
])

# The whole DataFrame is passed as X because WordVect and Domain each select
# their own column. Single-argument print(...) produces the same output on
# Python 2 and also parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text, pld_text['political_lean'],
                      cv=5, scoring='accuracy', n_jobs=-1).mean())

print(datetime.datetime.now() - time)

WordVectorizer:
(9999, 57965)
DomainVectorizer:
(9999, 165)
WordVectorizer:
(10000, 58137)
DomainVectorizer:
(10000, 153)
WordVectorizer:
(10000, 58492)
DomainVectorizer:
(10000, 170)
WordVectorizer:
(10002, 57278)
DomainVectorizer:
(10002, 157)
WordVectorizer:
(2502, 57965)
DomainVectorizer:
(2502, 165)
WordVectorizer:
(2499, 57278)
DomainVectorizer:
(2499, 157)
WordVectorizer:
(2501, 58137)
DomainVectorizer:
(2501, 153)
WordVectorizer:
(2501, 58492)
DomainVectorizer:
(2501, 170)
WordVectorizer:
(10003, 57929)
DomainVectorizer:
(10003, 167)
WordVectorizer:
(2498, 57929)
DomainVectorizer:
(2498, 167)
0.853034443257
0:03:12.789926


In [16]:
# Token counts with scikit-learn's built-in English stop-word list removed
# -> logistic regression, scored as mean accuracy over 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # A FeatureUnion with a single member simply passes that member's matrix through.
    ('features', FeatureUnion([
        ('counts', CountVectorizer(stop_words='english')),
    ])),
    ('logreg', LogisticRegression()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.569940582322
0:03:00.713157


In [18]:
# Token counts with scikit-learn's built-in English stop-word list removed
# -> multinomial naive Bayes, scored as mean accuracy over 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # A FeatureUnion with a single member simply passes that member's matrix through.
    ('features', FeatureUnion([
        ('counts', CountVectorizer(stop_words='english')),
    ])),
    ('classifier', MultinomialNB()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.461698717961
0:00:44.510973


In [19]:
# Unigram + bigram counts, keeping only terms that occur in at least three
# documents (min_df=3) -> logistic regression, scored as mean accuracy over
# 5-fold cross-validation.
time = datetime.datetime.now()  # start timestamp; elapsed wall-clock time is printed at the end

pipe = Pipeline([
    # A FeatureUnion with a single member simply passes that member's matrix through.
    ('features', FeatureUnion([
        ('counts', CountVectorizer(ngram_range=(1, 2), min_df=3)),
    ])),
    ('logreg', LogisticRegression()),
])

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
print(cross_val_score(pipe, pld_text['cleaned_text'], pld_text['political_lean'],
                      cv=5, scoring='accuracy').mean())

print(datetime.datetime.now() - time)

0.62386386678
0:21:44.069018


In [29]:
def pipe(col):
    """Cross-validate a bag-of-words logistic regression on one issue's articles.

    Filters the notebook-level ``pld_text`` frame to the rows whose ``issue``
    equals ``col``, scores a CountVectorizer -> LogisticRegression pipeline
    with 5-fold cross-validation, and prints a blank line, the issue, the mean
    accuracy and the elapsed wall-clock time.

    Note: other cells in this notebook bind the name ``pipe`` to a Pipeline
    object; after running one of them this function must be re-defined before
    it can be called again.

    Parameters
    ----------
    col : value compared against ``pld_text['issue']``. Despite the name it is
        an issue label, not a column name.

    Returns
    -------
    The mean cross-validated accuracy, so callers can collect the scores
    (earlier versions returned ``None``).

    Raises
    ------
    ValueError
        If no row of ``pld_text`` has the requested issue.
    """
    started = datetime.datetime.now()
    pld_issue = pld_text[pld_text['issue'] == col]
    if len(pld_issue) == 0:
        # Fail early with a readable message instead of an error from deep
        # inside the vectorizer / cross-validation code.
        raise ValueError("no articles found for issue: " + str(col))

    # Bound to `model` rather than `pipe` so the local does not shadow this function.
    model = Pipeline([
        ('features', FeatureUnion([
            ('counts', CountVectorizer()),
        ])),
        ('logreg', LogisticRegression()),
    ])

    # The header is printed before the (slow) scoring so progress is visible.
    # Single-argument print(...) gives identical output on Python 2 and also
    # parses on Python 3.
    print(" ")
    print("issue: " + str(col))
    mean_accuracy = cross_val_score(model, pld_issue['cleaned_text'], pld_issue['political_lean'],
                                    cv=5, scoring='accuracy').mean()
    print(mean_accuracy)
    print(datetime.datetime.now() - started)
    return mean_accuracy

In [30]:
# Issue labels to evaluate one by one (the name suggests these are the six
# most frequent issues -- TODO confirm against the data).
top_six_cols = [
    'election-2012',
    'healthcare-0',
    'immigration',
    'economic-policy-debt-deficit',
    'economy-jobs',
    'gun-legislation',
]
# Each call prints the issue, its mean CV accuracy and the time taken.
for col in top_six_cols:
    pipe(col)

 
issue: election-2012
0.591405090095
0:00:35.115171
 
issue: healthcare-0
0.602989695589
0:00:06.761739
 
issue: immigration
0.550232077636
0:00:04.739664
 
issue: economic-policy-debt-deficit
0.572712600216
0:00:04.506347
 
issue: economy-jobs
0.560315790971
0:00:03.312702
 
issue: gun-legislation
0.469886876366
0:00:03.816199
