In [1]:
%load_ext autoreload
%autoreload 2

# Console logging for debugging: the root logger is configured at DEBUG,
# while urllib3's own logger is limited to WARNING and above.
import logging

logging.basicConfig(level=logging.DEBUG)
logging.getLogger('urllib3').setLevel(logging.WARNING)

# Logger used by the cells of this notebook
log = logging.getLogger(__name__)


# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz
from datetime import datetime, timedelta

import requests

In [2]:
# Database that receives the scraped documents
# (other database name seen in this notebook: 'scrape')
DB_NAME = 'FINALP'

# Connect PyMongo to the MongoDB server on localhost and open the database
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(host=conn, maxPoolSize=200)
db = client.get_database(DB_NAME)

def saveToDB(db, collection, url, html, meta=None):
    """
    Upserts one document into mongoDB, keyed by its 'url' value, so that
    saving the same url again overwrites the stored fields instead of
    creating a duplicate.

    Parameters:
    --------
    db          : database handle, indexed by collection name
                  (`db[collection]`)
    collection  : name of the collection to write to
    url         : value stored under 'url'; also the match filter of the
                  upsert
    html        : value stored under 'html'
    meta        : optional value stored under 'meta'; an empty dict is
                  stored when omitted

    Returns:
    --------
    None. The document is written as a side effect.
    """
    # A `{}` default would be a single dict object shared by every call;
    # create a fresh one per call instead.
    if meta is None:
        meta = {}

    db[collection].update_one(
        {'url': url},
        {'$set': {'url': url, 'html': html, 'meta': meta}},
        upsert=True,
    )
    log.debug('Saved to DB')

def scrape(url, db, collection):
    '''
    Downloads and parses the article at 'url' and stores its html and the
    extracted features in mongoDB via saveToDB.

    Parameters:
    --------
    url         : url of the article to scrape
    db          : database handle, passed through to saveToDB
    collection  : mongodb collection name, passed through to saveToDB

    Returns:
    --------
    article.publish_date when the article was downloaded, parsed and
    stored (NOTE(review): presumably this can be None when the parser
    finds no date - confirm against the Article class).
    The current local time (datetime.now()) when downloading or parsing
    failed; nothing is stored in that case.
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        article = Article(url)
        article.download()
        # parse() may only extract a snippet of the text...
        # check the database for results of text extraction
        # and apply additional processing if needed after
        # the article has been stored in the DB
        # (see the code below Newrepublic for an example)
        article.parse()
    except Exception as e:
        log.critical(f'Data not saved: {e}')
        # `datetime` is the class here (from datetime import datetime), so
        # the call is datetime.now(); datetime.datetime.now() would raise
        # AttributeError from inside this handler.
        return datetime.now()

    article_meta = {
        'date'    : article.publish_date,
        'title'   : article.title,
        'text'    : article.text,
        'authors' : article.authors,
    }
    saveToDB(db, collection, article.url, article.html, meta=article_meta)

    return article.publish_date

def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command-line
    tool with a batch size of 1.

    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file; it is concatenated with FILE
              as-is, so it must end with a path separator
    FILE    : Filename

    Returns:
    --------
    None. On success the collection COL_NAME exists in the DB_NAME
    database; the tool's output and a status line are printed.
    '''
    # Local import so the function does not depend on another cell.
    import subprocess

    # An argument list (no shell) keeps names and paths containing spaces
    # or shell metacharacters intact and prevents command injection.
    command = [
        'mongoimport',
        '--db', str(DB_NAME),
        '--collection', str(COL_NAME),
        '--file', PATH + FILE,
        '--batchSize', '1',
    ]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,   # show errors together with progress
            universal_newlines=True,
        )
    except FileNotFoundError:
        print('mongoimport executable not found; nothing was imported')
        return

    # Relay whatever the tool reported
    if result.stdout:
        print(result.stdout, end='')

    # Only announce success when the tool actually succeeded
    if result.returncode != 0:
        print(f'mongoimport exited with code {result.returncode}; '
              f'collection {COL_NAME} was not imported')
        return
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [4]:
# HuffPost archive scrape.
# Walks the daily archive pages backwards in time, one day per iteration,
# and stores every linked article whose card category is not in stopWords.
# NOTE(review): the window is exclusive on both ends - the first page
# visited is the day before latest_date and the loop stops before reaching
# earliest_date. Confirm whether the bounds should be inclusive.
collection = 'huffpost'
source = 'https://www.huffingtonpost.com/archive/'
earliest_date = date_parser('2017-01-01')
latest_date = date_parser('2017-03-15')

# Card categories whose articles are skipped
stopWords = ['HEALTHY LIVING',
            'ENTERTAINMENT',
            'STYLE',
            'COMEDY',
            'WEDDINGS',
            'SPORTS',
            'ARTS & CULTURE',
            'TASTE',
            'PARENTS',
            'BOOKS',
            'HUFFPOST PERSONAL',
            'QUEER VOICES']

base_url = 'https://www.huffingtonpost.com/'
date = 1               # offset in days back from latest_date
date_to_scrape = None  # archive date (YYYY-MM-DD) processed most recently

# Checking the window before each iteration means nothing is scraped when
# the window is empty.
while latest_date - timedelta(date) > earliest_date:
    date_to_scrape = str(latest_date-timedelta(date))[:10]
    log.debug(f"==================\n Processing DATE {date_to_scrape}\n URL: {source+date_to_scrape}")
    # Advance first, so that skipping a broken page below cannot stall
    # the loop on the same date.
    date += 1

    try:
        s = Source(source+date_to_scrape)
        s.download()
        soup = bs.BeautifulSoup(s.html,'lxml')
    except Exception as e:
        # One unavailable archive page should not abort the whole run
        log.warning(f"Could not load archive page for {date_to_scrape}: {e}")
        continue

    for div in soup.findAll('div', {'class':'card__details'}):
        category_tag = div.span
        if category_tag is None:
            log.debug('Skipping card without a category label')
            continue

        category = category_tag.text
        if category in stopWords:
            log.debug(f'Skipping category {category}')
            continue

        for a in div.findAll('a',{'class':'card__link yr-card-headline'}):
            try:
                url = urljoin(base_url, a['href'])
                log.debug(f"Processing url: {url}")
                article_date = scrape(url, db, collection)
            except Exception as e:
                # Keep going with the next link, but make the failure
                # visible and say which link caused it.
                log.warning(f"Failed to process link {a.get('href')!r}: {e}")

# Report the date that was actually processed last, not the next offset
log.debug(f"Finished. Last date scraped: {date_to_scrape}")

 Processing DATE 2017-03-20
 URL: https://www.huffingtonpost.com/archive/2017-03-20
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/house-intelligence-committee-gop-leaks-russia-trump_us_58d019b8e4b00705db518886
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/house-intelligence-committee-gop-leaks-russia-trump_us_58d019b8e4b00705db518886
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/african-visas-denied_us_58d04590e4b0be71dcf74bd6
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/african-visas-denied_us_58d04590e4b0be71dcf74bd6
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/peru-flooding_us_58d0309ee4b00705db51cae9
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/peru-flooding_us_58d0309ee4b00705db51cae9
DEBUG:__main__:Saved to DB
DEBUG:__

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jessica-chastain-shuts-down-bro-who-tried-to-mansplain-womens-health_us_58cfdfb2e4b0be71dcf629a9
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jessica-chastain-shuts-down-bro-who-tried-to-mansplain-womens-health_us_58cfdfb2e4b0be71dcf629a9
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/stephen-hawking-donald-trump_us_58d016cee4b00705db51828d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/stephen-hawking-donald-trump_us_58d016cee4b00705db51828d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/catchy-schoolhouse-rock-parody-reminds-us-that-plan-b-is-just-a-pill_us_58cfdda4e4b00705db50d69f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/catchy-schoolhouse-rock-parody-remind

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trumps-budget-will-harm-the-planet-and-the-economy_us_58cfcbd1e4b07112b6472f93
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trumps-budget-will-harm-the-planet-and-the-economy_us_58cfcbd1e4b07112b6472f93
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category PARENTS
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/meals-on-wheels-is-working-for-everyone_us_58cfc95be4b0be71dcf60dd5
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/meals-on-wheels-is-working-for-everyone_us_58cfc95be4b0be71dcf60dd5
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/how-trumps-budget-would-hurt-kids-and-us-workers_us_58cfd8d8e4b0537abd9572c8
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/how-trumps-budget-woul

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/germany-trump-nato-defense-money_us_58ce9691e4b00705db502d37
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/germany-trump-nato-defense-money_us_58ce9691e4b00705db502d37
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/will-hurd-trump-apologize_us_58ce92c3e4b0ec9d29dcc228
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/will-hurd-trump-apologize_us_58ce92c3e4b0ec9d29dcc228
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ex-police-chief-unlawfully-detained_us_58ceba82e4b00705db503b40
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ex-police-chief-unlawfully-detained_us_58ceba82e4b00705db503b40
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/why-trump-administration-is-

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/remembering-jimmy-breslin-and-the-newspaper-world_us_58cf02bee4b0e0d348b34501
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-work-mar-a-lago_us_58cde456e4b0be71dcf54620
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-work-mar-a-lago_us_58cde456e4b0be71dcf54620
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/in-sickness-and-for-the-wealthy-in-health_us_58ce8117e4b07112b6472ebd
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/in-sickness-and-for-the-wealthy-in-health_us_58ce8117e4b07112b6472ebd
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/mouth-swab-marijuana_us_58ce0c7ce4b0be71dcf55bc6
DEBUG:__main__:Exctracting features from https://www.huffingtonpos

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/tom-perez-dnc-transition-team-progressive-critics_us_58cd2cfce4b0be71dcf5295f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/tom-perez-dnc-transition-team-progressive-critics_us_58cd2cfce4b0be71dcf5295f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trumpcare-mental-health-paul-ryan_us_58cb9812e4b0be71dcf36721
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trumpcare-mental-health-paul-ryan_us_58cb9812e4b0be71dcf36721
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/g20-free-trade-pledge-dropped_us_58cd770de4b0be71dcf53953
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/g20-free-trade-pledge-dropped_us_58cd770de4b0be71dcf53953
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__mai

DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-merkel-twitter_us_58cc672fe4b0be71dcf50347
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-merkel-twitter_us_58cc672fe4b0be71dcf50347
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/berkeley-border-wall-contractors_us_58cc7f57e4b0be71dcf50d7b
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/berkeley-border-wall-contractors_us_58cc7f57e4b0be71dcf50d7b
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/k9-pit-bulls-rescue-dogs-clay-county_us_58cd7ee0e4b0ec9d29dc8464
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/k9-pit-bulls-rescue-dogs-clay-county_us_58cd7ee0e4b0e

DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/dems-trump-obamacare_us_58cb1bcfe4b00705db4dd7f3
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/dems-trump-obamacare_us_58cb1bcfe4b00705db4dd7f3
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ARTS & CULTURE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-merkel-news-conference_us_58cc1ec7e4b00705db4f3f91
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-merkel-news-conference_us_58cc1ec7e4b00705db4f3f91
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/arkansas-legislators-remove-robert-e-lee-martin-luther-king-day_us_58cc5dfce4b00705db4fac00
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/arkansas-legislators-remove-robert-e-lee-martin-luther-king-day_us_58

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/secret-service-laptop-stolen_us_58cc25e5e4b0ec9d29dbe23f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/banard-college-divestment_us_58ca3a55e4b0be71dcf17590
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/banard-college-divestment_us_58ca3a55e4b0be71dcf17590
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/van-jones-trump-budget-drops-a-bomb-financially-on-his-own-supporters_us_58cb421ee4b00705db4df819
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/van-jones-trump-budget-drops-a-bomb-financially-on-his-own-supporters_us_58cb421ee4b00705db4df819
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/us-appeals-ruling-against-trumps-revised-travel-ban-to-higher-court_us_58cc4ef1e4b0ec9d29dc27e5
DEBUG:__main__:Exctracti

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/hey-mick-mulvaney-let-me-help-you-justify-budgeting_us_58cbe67de4b07112b6472c94
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/hey-mick-mulvaney-let-me-help-you-justify-budgeting_us_58cbe67de4b07112b6472c94
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/first-african-american-head-prosecutor-in-florida-wrongfully_us_58cbecd1e4b0e0d348b34253
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/first-african-american-head-prosecutor-in-florida-wrongfully_us_58cbecd1e4b0e0d348b34253
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trumpcares-broken-promises_us_58cbf682e4b0537abd956fed
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trumpcares-broken-promises_us_58cbf682e4b0537abd956fed
DEBUG:__main_

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-budget-worker-protections_us_58caa983e4b00705db4cbe7f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-budget-worker-protections_us_58caa983e4b00705db4cbe7f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-budget_us_58ca12a7e4b0ec9d29d8a9ad
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-budget_us_58ca12a7e4b0ec9d29d8a9ad
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/jesse-williams-says-voter-fraud-is-a-myth_us_58ca9688e4b0ec9d29d9242a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/jesse-williams-says-voter-fraud-is-a-myth_us_58ca9688e4b0ec9d29d9242a
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: h

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-budget-food-clean-water-funding_us_58ca1297e4b0ec9d29d8a9a1
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category QUEER VOICES
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ivanka-trump-child-care-budget_us_58cafe45e4b0ec9d29da47eb
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ivanka-trump-child-care-budget_us_58cafe45e4b0ec9d29da47eb
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HEALTHY LIVING
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category TASTE
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/white-house-budget-sesame-street-pbs_us_58ca8cade4b0be71dcf1d3eb
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/white-house-budget-sesame-street-pbs_us_58ca8cade4b0be71dcf1d3eb
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpo

KeyboardInterrupt: 

In [66]:
# Log the archive date the scraping loop had reached; reads `date_to_scrape`
# as left in the kernel namespace by the scraping cell.
log.debug(f"Stopped scraping at {date_to_scrape}")

DEBUG:__main__:Stopped scraping at 2017-05-09


In [60]:
# Show the current value of the scraping loop's day-offset counter
date

54