In [45]:
%load_ext autoreload
%autoreload 2

# For debugging turn on logging to console
import logging
# Configure the root logger at DEBUG level so the log.debug() calls used
# throughout this notebook are emitted
logging.basicConfig(level=logging.DEBUG)
# Logger named after the current module; used by the helper functions below
log = logging.getLogger(__name__)
# Raise the 'urllib3' logger's threshold to WARNING so its DEBUG/INFO
# messages do not appear alongside the notebook's own log lines
logging.getLogger('urllib3').setLevel(logging.WARNING)


# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz
from datetime import datetime, timedelta

import requests

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Name of the database every scraper in this notebook writes to
# (alternative used previously: 'scrape')
DB_NAME = 'FINALP'

# Open a PyMongo client against the local MongoDB server
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# Handle to the target database, passed to the helper functions below
db = client.get_database(DB_NAME)

def saveToDB(db, collection, url, html, meta=None):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value: an existing document with the same 'url' is overwritten,
    otherwise a new one is inserted (upsert)
    
    Parameters:
    --------
    db          : mongo database handle (indexed by collection name)
    collection  : name of the collection to write to
    url         : value used as the unique key of the document
    html        : value stored under 'html'
    meta        : value stored under 'meta'; defaults to an empty dict
    
    Returns:
    --------
    None. The document is written to the database as a side effect
    """
    # A fresh dict per call avoids sharing one mutable default object
    # between every invocation of the function
    if meta is None:
        meta = {}

    document = {
        'url' : url,
        'html' : html,
        'meta' : meta
    }
    # Matching on 'url' together with upsert=True is what prevents duplicates
    db[collection].update_one({'url' : url}, {'$set': document}, upsert=True)
    log.debug('Saved to DB')

def scrape(url, db, collection):
    '''
    Downloads and parses the article at 'url' and stores its html and
    extracted features in mongoDB
    
    Parameters:
    --------
    url         : url of the article to scrape
    db          : mongo database handle (forwarded to saveToDB)
    collection  : name of the mongodb collection to store the article in
    
    Returns:
    --------
    The value of the article's 'publish_date' after parsing.
    If downloading or parsing fails, nothing is saved and the current
    local time is returned instead
    
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        article = Article(url)
        article.download()
        # the below method may only extract a snippet... 
        # check the database for results of text extraction
        # and apply additional processing if needed after 
        # article has been stored in the DB
        # see code below Newrepublic for example
        article.parse()
    except Exception as e:
        log.critical(f'Data not saved: {e}')
        # 'datetime' is the class imported via 'from datetime import datetime',
        # so the call is datetime.now(); datetime.datetime.now() would raise
        # AttributeError from inside this handler
        return datetime.now()
    
    saveToDB(db, collection, article.url, article.html, meta={
        'date'    :article.publish_date,
        'title'   :article.title,
        'text'    :article.text,
        'authors' :article.authors
    })
    
    return article.publish_date

def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command-line tool
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file; it is concatenated directly
              with FILE, so it must already end with a path separator
    FILE    : Filename
    
    Returns:
    --------
    None. Prints the output of mongoimport followed by a message telling
    whether the import succeeded
    '''
    # Local import so the function does not depend on the notebook's import cell
    import subprocess

    # Arguments are passed as a list, so no shell is involved: names and
    # paths containing spaces or shell metacharacters reach mongoimport verbatim
    command = [
        'mongoimport',
        '--db', str(DB_NAME),
        '--collection', str(COL_NAME),
        '--file', str(PATH+FILE),
        '--batchSize', '1',
    ]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except FileNotFoundError:
        print('mongoimport executable not found; nothing was imported')
        return

    # Echo the tool's own output so it still shows up in the notebook cell
    if result.stdout:
        print(result.stdout, end='')

    # Only claim success when mongoimport actually exited cleanly
    if result.returncode == 0:
        print(f'Collection {COL_NAME} in {DB_NAME} database created')
    else:
        print(f'mongoimport exited with code {result.returncode}; '
              f'collection {COL_NAME} in {DB_NAME} database was not created')

In [None]:
# Scrape the HuffPost daily archive pages, walking backwards one day at a time
collection = 'huffpost'
source = 'https://www.huffingtonpost.com/archive/'
earliest_date = date_parser('2017-01-01')
latest_date = date_parser('2017-05-10')
# Category labels whose articles are skipped (compared against the text of
# the card's first <span>)
stopWords = ['HEALTHY LIVING',
            'ENTERTAINMENT',
            'STYLE',
            'COMEDY',
            'WEDDINGS',
            'SPORTS',
            'ARTS & CULTURE',
            'TASTE',
            'PARENTS',
            'BOOKS',
            'HUFFPOST PERSONAL',
            'QUEER VOICES']

base_url = 'https://www.huffingtonpost.com/'
# Number of days to step back from latest_date. Starting at 1 means the first
# page scraped is the day before latest_date; the loop stops before reaching
# earliest_date, so both bounds themselves are excluded
date = 1

while True:
    date_to_scrape = str(latest_date-timedelta(date))[:10]
    log.debug(f"==================\n Processing DATE {date_to_scrape}\n URL: {source+date_to_scrape}")
    s = Source(source+date_to_scrape)
    # From here on 'date' is the offset of the NEXT day to process
    date += 1
    s.download()

    if not s.html:
        # Nothing to parse for this day; move on instead of failing inside
        # BeautifulSoup
        log.warning(f"No archive page downloaded for {date_to_scrape}, skipping")
    else:
        soup = bs.BeautifulSoup(s.html,'lxml')
        for div in soup.findAll('div', {'class':'card__details'}):
            span = div.span
            if span is None:
                # Cards without a category label are skipped, stated
                # explicitly rather than via a swallowed AttributeError
                log.debug('Skipping card without a category label')
                continue

            category = span.text
            if category in stopWords:
                log.debug(f'Skipping category {category}')
                continue

            for a in div.findAll('a',{'class':'card__link yr-card-headline'}):
                try:
                    url = urljoin(base_url, a['href'])
                    log.debug(f"Processing url: {url}")
                    article_date = scrape(url, db, collection)
                except Exception as e:
                    # Keep going with the remaining links, but make the
                    # failure visible above debug level
                    log.warning(f"Failed to process link {a.get('href')!r}: {e}")
    
    if (latest_date-timedelta(date)) <= earliest_date:
        # Report the day that was actually processed last, not the next offset
        log.debug(f"Finished. Last date scraped: {date_to_scrape}")
        break

 Processing DATE 2017-05-09
 URL: https://www.huffingtonpost.com/archive/2017-05-09
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/san-francisco-receives-record-100-million-to-fight-homelessness_us_5911400ee4b0d5d9049f6aad
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/san-francisco-receives-record-100-million-to-fight-homelessness_us_5911400ee4b0d5d9049f6aad
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/dear-white-people-dear-white-people-being-ethnic-is-cool-being-ethnic-is-cool_us_59121da4e4b0a58297e02b95
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/dear-white-people-dear-white-people-being-ethnic-is-cool-being-ethnic-is-cool_us_59121da4e4b0a58297e02b95
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category PARENTS
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/john-legend-on-10-y

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/emmanuel-macron-macarons_us_5911b5ede4b0e7021e9b1a27
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/paul-ryan-school-visit-harlem-success-academy-protests_us_59123326e4b05e1ca202cc78
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/paul-ryan-school-visit-harlem-success-academy-protests_us_59123326e4b05e1ca202cc78
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/anti-muslim-hate-crimes-2016-council-on-american-islamic-relations_us_5910acf4e4b0d5d9049e96d5
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/anti-muslim-hate-crimes-2016-council-on-american-islamic-relations_us_5910acf4e4b0d5d9049e96d5
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/latinx-show-their-pride-and-resilience-with-spectacular-grad-caps_us_591

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/9-mothers-day-gifts-to-boost-your-moms-tech-cessory-game_us_590a2819e4b02655f843b503
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/4-business-mistakes-ill-never-make-again_us_5911ec18e4b05e1ca202299a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/4-business-mistakes-ill-never-make-again_us_5911ec18e4b05e1ca202299a
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/gloria-steinem-sva-commencement_us_5911f393e4b05e1ca2023d4b
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/gloria-steinem-sva-commencement_us_5911f393e4b05e1ca2023d4b
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category SPORTS
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Skipping category WEDDINGS
DEBUG:__main__:Skipping category PAR

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/all-you-wanna-do-is-ride-around-sally-and-expose-malfeasance_us_5910d9fbe4b0d5d9049f205b
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jordan-peeles-get-out-make-history_us_5910988ce4b0104c7350e65e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jordan-peeles-get-out-make-history_us_5910988ce4b0104c7350e65e
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/obama-profile-in-courage-speech_us_590fcef1e4b0104c73500640
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/obama-profile-in-courage-speech_us_590fcef1e4b0104c73500640
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/warren-buffett-health-care_us_5910790fe4b0e7021e994ade
DEBU

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/a-victory-for-macron-and-for-the-european-union-now_us_590fd1e7e4b046ea176aecc9
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/a-victory-for-macron-and-for-the-european-union-now_us_590fd1e7e4b046ea176aecc9
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/in-france-neoliberalism-defeats-neofascismfor-now_us_590fe000e4b056aa2363d69c
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/in-france-neoliberalism-defeats-neofascismfor-now_us_590fe000e4b056aa2363d69c
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-vicious-knot-of-syria-the-untangling-process-contains_us_590ff9efe4b0f7118072462a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-vicious-knot-of-syria-the-untangling-process-contains_us_590ff9efe4b0f71180

DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/alligator-attack-10-year-old-girl-jaws_us_59106340e4b0d5d9049de24d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/alligator-attack-10-year-old-girl-jaws_us_59106340e4b0d5d9049de24d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-billionaire-journalist_us_591078ede4b046ea176aed43
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-billionaire-journalist_us_591078ede4b046ea176aed43
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/as-fentanyl-spreads-states-step-up-responses_us_5910849fe4b046ea176aed5e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/as-fentanyl-spreads-states-step-up-responses_us_5910849fe

DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/france-presidential-election-runoff_us_5907778ee4b0bb2d087017c0
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/france-presidential-election-runoff_us_5907778ee4b0bb2d087017c0
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/why-you-should-care-about-this-weekends-french-presidential_us_590e63dfe4b056aa2363d5c4
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/why-you-should-care-about-this-weekends-french-presidential_us_590e63dfe4b056aa2363d5c4
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-westphalian-model-to-resolve-conflicts-in-the-middle_us_590eab19e4b046ea176aec29
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-westphalian-model-to-r

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-public-school-system-is-rigged-against-poor-and_us_590e660be4b0f71180724558
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-public-school-system-is-rigged-against-poor-and_us_590e660be4b0f71180724558
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/wikileaks-macron-france_us_590e904de4b0104c734f82aa
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/wikileaks-macron-france_us_590e904de4b0104c734f82aa
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/boston-doctors-found-slain_us_590f4134e4b0e7021e98623d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/boston-doctors-found-slain_us_590f4134e4b0e7021e98623d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jared-kushne

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jeff-sessions-department-of-injustice_us_590dd80ee4b0f711807244f1
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/in-life-we-have-choices_us_590df3f1e4b0f71180724500
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/in-life-we-have-choices_us_590df3f1e4b0f71180724500
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/911-memorial-brats-never-forget_us_590e02d4e4b0f7118072450e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/911-memorial-brats-never-forget_us_590e02d4e4b0f7118072450e
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/a-eulogy-for-your-healthcare_us_590e183ce4b056aa2363d587
DEBUG:__main__:Exctracting features fr

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-photoshopped-picture_us_590d9366e4b0d5d9049ccbc9
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-photoshopped-picture_us_590d9366e4b0d5d9049ccbc9
DEBUG:__main__:Saved to DB
 Processing DATE 2017-05-05
 URL: https://www.huffingtonpost.com/archive/2017-05-05
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/xeni-jardin-health-care-gop_us_590c685ee4b0d5d9049b9216
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/xeni-jardin-health-care-gop_us_590c685ee4b0d5d9049b9216
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/mark-green-withdraws_us_590cda69e4b0e7021e97bc3b
DEBUG:__main__:Exctracting features from https://www.huf

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/late-shift-in-key-states-helped-elect-trump-report-finds_us_590c69bfe4b0e7021e96d651
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/late-shift-in-key-states-helped-elect-trump-report-finds_us_590c69bfe4b0e7021e96d651
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/gop-read-the-bill_us_590bd1a7e4b0d5d9049b1d38
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/gop-read-the-bill_us_590bd1a7e4b0d5d9049b1d38
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-voter-fraud_us_590cc4e8e4b0d5d9049c4155
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-voter-fraud_us_590cc4e8e4b0d5d9049c4155
DEBUG:__main__:Sav

DEBUG:__main__:Skipping category PARENTS
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/why-we-cant-let-trump-and-congress-tax-public-infrastructure_us_590cb5d3e4b0f7118072442a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/why-we-cant-let-trump-and-congress-tax-public-infrastructure_us_590cb5d3e4b0f7118072442a
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/get-away-jordan-an-eulogy-for-jordan-edwards_us_590ca9fce4b0f7118072440f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/get-away-jordan-an-eulogy-for-jordan-edwards_us_590ca9fce4b0f7118072440f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/a-monumental-cave-in_us_590cbfd9e4b056aa2363d489
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/a-monumental-cave-in_us_590cbfd9e4b056aa2363d489

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/this-subscription-box-helps-support-businesses-ran-by-women-of-color_us_590b5b33e4b0e7021e955c20
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/this-subscription-box-helps-support-businesses-ran-by-women-of-color_us_590b5b33e4b0e7021e955c20
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category PARENTS
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/obama-stand-against-populism-endorses-macron_us_590b2535e4b0bb2d0875c54e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/obama-stand-against-populism-endorses-macron_us_590b2535e4b0bb2d0875c54e
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/e

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/bernie-sanders-republican-health-care-bill_us_590b9d4ce4b0104c734d5fa9
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/campaign-cash-targets-republican-health-care-repeal-votes_us_590b9a97e4b0d5d9049ad1fc
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/campaign-cash-targets-republican-health-care-repeal-votes_us_590b9a97e4b0d5d9049ad1fc
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/nancy-pelosi-health-care-trump_us_590b43e9e4b0d5d90499e041
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/nancy-pelosi-health-care-trump_us_590b43e9e4b0d5d90499e041
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huf

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/james-comey-legal-moralist-or-self-serving-unconscionable_us_590ad7e8e4b03b105b44bf83
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/james-comey-legal-moralist-or-self-serving-unconscionable_us_590ad7e8e4b03b105b44bf83
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/texas-teens-senseless-death-leaves-a-community-waiting_us_590ae630e4b05279d4edc2ee
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/texas-teens-senseless-death-leaves-a-community-waiting_us_590ae630e4b05279d4edc2ee
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/do-republicans-really-think-the-ahca-is-a-good-idea_us_590ae2c8e4b03b105b44bf85
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/do-republicans-really-think-the-ahca-is-a-good-idea_us_590ae2c8e4b03b105b44bf85
DEBUG:__ma

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/an-open-letter-to-people-who-think-women-arent-funny_us_59075a96e4b02655f83ec973
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ethan-nadelmann-trump-drug-policy-alliance_us_5909efeae4b02655f842f58f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ethan-nadelmann-trump-drug-policy-alliance_us_5909efeae4b02655f842f58f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/border-patrol-asylum_us_590a450ce4b02655f843f159
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/border-patrol-asylum_us_590a450ce4b02655f843f159
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/baton-rouge-residents-demand-doj-wake-up-to-what-justice-should-loo

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/blasphemy-and-terrorism-catchall-phrases-to-repress_us_59095302e4b05279d4edc0ff
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-presidency-as-performance-art_us_59095ca5e4b084f59b49fe16
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-presidency-as-performance-art_us_59095ca5e4b084f59b49fe16
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/north-korea-russia_us_5909d7a7e4b05c39

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/what-these-13-may-day-activists-want-trump-to-know-about-immigrants_us_5907fceae4b02655f83fc510
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/what-these-13-may-day-activists-want-trump-to-know-about-immigrants_us_5907fceae4b02655f83fc510
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/arts-funding-trump-budget_us_59078df1e4b0bb2d08704af4
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/arts-funding-trump-budget_us_59078df1e4b0bb2d08704af4
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/gop-yelling-at-dying-people-strategy-not-paying-off-yet_us_5908fb60e4b0bb2d0872e1e4
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/gop-yelling-at-dying-people-strategy-not-paying-of

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ivanka-trump-access-hollywood-tape_us_5908b20ae4b0bb2d0872455c
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ivanka-trump-access-hollywood-tape_us_5908b20ae4b0bb2d0872455c
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/paul-ryan-preexisting-conditions-bogus_us_5908ceece4b05c3976838062
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/paul-ryan-preexisting-conditions-bogus_us_5908ceece4b05c3976838062
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ro-khanna-syria-white-helmets_us_5908fbcce4b02655f841d5b8
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ro-khanna-syria-white-helmets_us_5908fbcce4b02655f841d5b8
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/joe-walsh-

DEBUG:__main__:Saved to DB
 Processing DATE 2017-05-01
 URL: https://www.huffingtonpost.com/archive/2017-05-01
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/us-congressional-talks-yield-deal-to-fund-government-through-september_us_590698fde4b02655f83e698f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/us-congressional-talks-yield-deal-to-fund-government-through-september_us_590698fde4b02655f83e698f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/supergirl-flies-with-elizabeth-warren_us_59075e27e4b0bb2d086fe259
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/supergirl-flies-with-elizabeth-warren_us_59075e27e4b0bb2d086fe259
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: h

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/may-day-haymarket-affair_us_59075a44e4b02655f83ec8a6
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/may-day-haymarket-affair_us_59075a44e4b02655f83ec8a6
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/turbulence-on-aeroflot-moscow-bangkok-flight-leaves-at-least-27-hurt_us_5907246ee4b0bb2d086f92fe
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/turbulence-on-aeroflot-moscow-bangkok-flight-leaves-at-least-27-hurt_us_5907246ee4b0bb2d086f92fe
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/driver-crashes-into-tanker_us_590759fae4b0bb2d086fd5d0
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/driver-crashes-into-tanker_us_590759fae4b0bb2d086fd5d0
DEBUG

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-likens-himself-to-andrew-jackson-a-president_us_5907bf6be4b05279d4edbf0c
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-is-confused-about-why-the-civil-war-was_us_5907858fe4b03b105b44bae9
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-is-confused-about-why-the-civil-war-was_us_5907858fe4b03b105b44bae9
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/how-obama-could-end-criticism-of-his-wall-street-speech_us_5907c819e4b084f59b49fc15
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/how-obama-could-end-criticism-of-his-wall-street-speech_us_5907c819e4b084f59b49fc15
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/another-gop-tax-plan-for-captains_us_5905f986e4b084f59b49fa04
DEBUG:__m

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/mike-pence-donald-trump-nato_us_59060d00e4b0bb2d086f4332
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/climber-ueli-steck-killed-in-nepal_us_59060b62e4b0bb2d086f428e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/climber-ueli-steck-killed-in-nepal_us_59060b62e4b0bb2d086f428e
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-health-care-bill_us_5906391fe4b02655f83e4733
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-health-care-bill_us_5906391fe4b02655f83e4733
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/rob-quist-montana-house-race_us_590628f1e4b05c3976805500
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/rob-quist-montana-house-race_us_590628f1e4b05c3

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/why-so-many-still-adore-trumpdespite-everything_us_59062ceee4b05279d4edbd26
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/why-so-many-still-adore-trumpdespite-everything_us_59062ceee4b05279d4edbd26
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/the-democratic-party-continues-to-ignore-reality_us_5906049fe4b03b105b44b983
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-democratic-party-continues-to-ignore-reality_us_5906049fe4b03b105b44b983
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trumps-war-against-his-mandate_us_5905fd23e4b05279d4edbcf5
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trumps-war-against-his-mandate_us_5905fd23e4b05279d4edbcf5
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url:

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-democrats-hannah-risheq_us_59021e02e4b081a5c0fbbe0d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-democrats-hannah-risheq_us_59021e02e4b081a5c0fbbe0d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/obamacare-repeal-compromise-bill_us_5903aac7e4b02655f83d7fc5
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/obamacare-repeal-compromise-bill_us_5903aac7e4b02655f83d7fc5
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/wyoming-tutu-revolution-lgbtq_us_5903f2e5e4b02655f83da8de
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/wyoming-tutu-revolution-lgbtq_us_5903f2e5e4b02655f83da8de
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/elizabeth-warren-donald-trump-inauguration-bill-maher_us_59

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/texas-cop-accusing-of-staging-his-own-death-may-be-in-mexico_us_5904d0f7e4b0bb2d086ee6a1
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/texas-cop-accusing-of-staging-his-own-death-may-be-in-mexico_us_5904d0f7e4b0bb2d086ee6a1
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/women-dont-live-in-a-hellscape_us_590382e0e4b0bb2d086e30a3
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/women-dont-live-in-a-hellscape_us_590382e0e4b0bb2d086e30a3
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/china-deports-us-citizen-convicted-of-espionage_us_5904c852e4b0bb2d086ee63b
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/china-deports-us-citizen-convicted-of-espionage_us_5904c852e4b0bb2d086ee63b
DEBUG:__main__:Saved

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/brazil-cities-paralyzed-by-nationwide-strike-against-austerity_us_590355d7e4b0bb2d086d7ff8
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/taliban-spring-offensive_us_5902cc20e4b02655f83b5c59
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/taliban-spring-offensive_us_5902cc20e4b02655f83b5c59
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/chris-murphy-gun-violence-victims_us_59038c14e4b02655f83d39f2
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/chris-murphy-gun-violence-victims_us_59038c14e4b02655f83d39f2
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/bret-stephens-nyt_us_5903b95fe4b05c39767fa198
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/bret-stephens-nyt_us_5903b95fe4b05c3

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/charmaine-yoest-hhs_us_590370c4e4b02655f83d020d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/charmaine-yoest-hhs_us_590370c4e4b02655f83d020d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category PARENTS
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/what-its-like-getting-dressed-when-youre-blind_us_58f0f747e4b0da2ff8603b01
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/what-its-like-getting-dressed-when-youre-blind_us_58f0f747e4b0da2ff8603b01
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/california-sea-level-rise_us_59027f0fe4b0bb2d086c5f31
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/california-sea-level-rise_us_59027f0fe4b0bb2d

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/study-americans-more-likely-to-vote-for-highly-religious_us_59027b23e4b084f59b49f79c
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/study-americans-more-likely-to-vote-for-highly-religious_us_59027b23e4b084f59b49f79c
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/it-is-the-first-climate-lawsuit-to-make-it-this-far_us_5903396be4b084f59b49f81a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/it-is-the-first-climate-lawsuit-to-make-it-this-far_us_5903396be4b084f59b49f81a
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/3-lessons-learned-from-a-31-year-old-intern_us_5903486ce4b05279d4edbb31
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/3-lessons-learned-from-a-31-year-old-intern_us_5903486ce4b05279d4edbb31
DEBUG:_

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-100-days-private-prisons_us_590203d8e4b0026db1def8fb
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/rachel-maddow-trump-taxes_us_59023a9ce4b0026db1df9157
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/rachel-maddow-trump-taxes_us_59023a9ce4b0026db1df9157
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trumps-labor-secretary-confirmed_us_5901f3dee4b081a5c0fb5563
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trumps-labor-secretary-confirmed_us_5901f3dee4b081a5c0fb5563
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jerry-brown-trump-climate-change_us_59026ba4e4b02655f83b3b24
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jerry-brown-trump-climate-change_us_59026ba4e4b02655f

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/liberia-backstreet-abortions_us_59021196e4b0af6d718c63b8
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/liberia-backstreet-abortions_us_59021196e4b0af6d718c63b8
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/a-photo-shoot-fit-for-a-little-princess_us_59022883e4b0026db1df59a2
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/a-photo-shoot-fit-for-a-little-princess_us_59022883e4b0026db1df59a2
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jason-chaffetz-tmz_us_59020145e4b0026db1def43b
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jason-chaffetz-tmz_us_59020145e4b0026db1def43b
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/watch-cops-narrow-rescue-of-man-jumping-from-sixth-floor_us_5901c74fe

In [66]:
# Report the value of `date_to_scrape` at the point where scraping stopped.
stop_message = f"Stopped scraping at {date_to_scrape}"
log.debug(stop_message)

DEBUG:__main__:Stopped scraping at 2017-05-09


In [60]:
# Show the current value of `date` as this cell's output.
# NOTE(review): the recorded output for this cell is 54, not a calendar
# date — confirm what this variable is expected to hold at this point.
date

54