In [45]:
%load_ext autoreload
%autoreload 2

# Debugging aid: send log records to the console at DEBUG level,
# but only let WARNING and above through from the 'urllib3' logger
import logging
logging.basicConfig(level=logging.DEBUG)
logging.getLogger('urllib3').setLevel(logging.WARNING)
log = logging.getLogger(__name__)


# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz
from datetime import datetime, timedelta

import requests

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# MongoDB settings for this notebook
# (alternative database name: 'scrape')
DB_NAME = 'FINALP'
conn = 'mongodb://localhost:27017'

# Single client object for the notebook, created with maxPoolSize=200
client = pm.MongoClient(conn, maxPoolSize=200)

# Handle to the working database; handed to the helper functions below
db = client.get_database(DB_NAME)

def saveToDB(db, collection, url, html, meta={}):
    """
    Upserts one scraped page into mongoDB, matching on its 'url' so that
    saving the same url again updates the stored document instead of
    adding a duplicate

    Parameters:
    --------
    db          : mongo db handle, indexed by collection name
    collection  : name of the collection to write to
    url         : page url, used as the match key
    html        : raw html to store
    meta        : extra values stored under the 'meta' field
                  (defaults to an empty dict)

    Returns:
    --------
    None
    """
    match_on_url = {'url': url}
    stored_fields = {'url': url, 'html': html, 'meta': meta}
    # upsert=True: insert when no document matches the url yet
    db[collection].update_one(match_on_url, {'$set': stored_fields}, upsert=True)
    log.debug('Saved to DB')

def scrape(url, db, collection):
    '''
    Downloads and parses the article at 'url' and stores its html and
    extracted features in mongoDB
    
    Parameters:
    --------
    url         : article url
    db          : mongo db handle (passed through to saveToDB)
    collection  : name of the mongodb collection to store into
    
    Returns:
    --------
    Article's publish date as extracted by the parser.
    If downloading or parsing raises, nothing is stored and the
    current local time is returned instead.
    
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        article = Article(url)
        article.download()
        # parse() may only extract a snippet of the text...
        # check the database for results of text extraction
        # and apply additional processing if needed after 
        # article has been stored in the DB
        # see code below Newrepublic for example
        article.parse()
    except Exception as e:
        log.critical(f'Data not saved: {e}')
        # `datetime` is the class (from datetime import datetime), so
        # now() is called on it directly; `datetime.datetime.now()`
        # would raise AttributeError here.
        return datetime.now()
    
    saveToDB(db, collection, article.url, article.html, meta={
        'date'    :article.publish_date,
        'title'   :article.title,
        'text'    :article.text,
        'authors' :article.authors
    })
    
    return article.publish_date

def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file; it is concatenated with
              FILE as-is, so it must end with a path separator
    FILE    : Filename
    
    Returns:
    --------
    None. The tool's output is printed, followed by a message saying
    whether the import succeeded.
    '''
    # Local import so the function does not depend on the notebook's import cell
    import subprocess

    # Passing an argument list (no shell) keeps names and paths that
    # contain spaces or shell metacharacters intact as single arguments.
    command = [
        'mongoimport',
        '--db', str(DB_NAME),
        '--collection', str(COL_NAME),
        '--file', f'{PATH}{FILE}',
        '--batchSize', '1',
    ]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge streams to keep message order
            universal_newlines=True,
        )
    except FileNotFoundError:
        print('mongoimport executable not found; nothing was imported')
        return

    # Show whatever the tool reported, as the shell escape used to do
    if result.stdout:
        print(result.stdout, end='')

    # Only announce success when the tool actually exited cleanly
    if result.returncode != 0:
        print(f'mongoimport failed with exit code {result.returncode}; '
              f'collection {COL_NAME} in {DB_NAME} database was not created')
        return
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [None]:
# Crawl the HuffPost daily archive pages and store every article whose
# card category is not in stopWords.
# The walk goes backwards one day at a time: the first page fetched is
# latest_date - 1 day, and the loop stops before reaching earliest_date.
collection = 'huffpost'
source = 'https://www.huffingtonpost.com/archive/'
earliest_date = date_parser('2017-01-01')
latest_date = date_parser('2018-07-29')
# Cards whose category label is in this list are skipped
stopWords = ['HEALTHY LIVING',
            'ENTERTAINMENT',
            'STYLE',
            'COMEDY',
            'WEDDINGS',
            'SPORTS',
            'ARTS & CULTURE',
            'TASTE',
            'PARENTS',
            'BOOKS',
            'HUFFPOST PERSONAL']

base_url = 'https://www.huffingtonpost.com/'
date = 1  # number of days to step back from latest_date

while True:
    # Archive page url is <source>YYYY-MM-DD
    archive_day = latest_date-timedelta(date)
    s = Source(source+str(archive_day)[:10])
    date += 1
    s.download()
    # `or ''` keeps the parser from failing if no html came back
    soup = bs.BeautifulSoup(s.html or '','lxml')
    for div in soup.findAll('div', {'class':'card__details'}):
        # A card without a <span> label must not abort the whole crawl;
        # it is treated as having an empty category (i.e. not skipped)
        label = div.span
        category = label.text if label is not None else ''
        if category not in stopWords:
            for a in div.findAll('a',{'class':'card__link yr-card-headline'}):
                try:
                    url = urljoin(base_url, a['href'])
                    log.debug(f"Processing url: {url}")
                    article_date = scrape(url, db, collection)
                except Exception as e:
                    # Keep going on a bad link, but make the failure visible
                    log.warning(f'Failed to process a link on {archive_day:%Y-%m-%d}: {e!r}')
        else:
            log.debug(f'Skipping category {category}')
    
    # `date` already points at the next day to fetch; stop once that day
    # would be on or before earliest_date
    if (latest_date-timedelta(date)) <= earliest_date:
        # Report the day that was actually fetched, not the next one
        log.debug(f"Finished. Last date scraped: {archive_day}")
        break

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-young-trump-corruption_us_5b5b5b55e4b0de86f496d9be
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-young-trump-corruption_us_5b5b5b55e4b0de86f496d9be
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HUFFPOST PERSONAL
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-jr-robert-mueller-airport-photograph_us_5b5c5a2ae4b0fd5c73cfcade
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-jr-robert-mueller-airport-photograph_us_5b5c5a2ae4b0fd5c73cfcade
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-covert-trump-poverty-america_us_5b5b4f23e4b0fd5c73cf1341
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-covert-trump-poverty-america_us_5b5b4f23e4b0fd5c73cf1341
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing 

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/greece-suspects-arson-forest-fire_us_5b5ae015e4b0b15aba977598
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/greece-suspects-arson-forest-fire_us_5b5ae015e4b0b15aba977598
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ways-bomb-job-interview-according-hiring-managers_us_5b58c2cbe4b0de86f492db80
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ways-bomb-job-interview-according-hiring-managers_us_5b58c2cbe4b0de86f492db80
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/putin-trump-second-meeting_us_5b5b19c1e4b0fd5c73cebf57
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/putin-trump-second-meeting_us_5b5b19c1e4b0fd5c73cebf57
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HUFFPOST PERSONAL
DEBUG:__main_

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/california-wildfires_us_5b5aaab7e4b0b15aba971263
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/california-wildfires_us_5b5aaab7e4b0b15aba971263
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/rhino-transport-deaths-lion-attack_us_5b5b1b13e4b0de86f4967318
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/rhino-transport-deaths-lion-attack_us_5b5b1b13e4b0de86f4967318
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/imagine-dragons-love-loud-music-festival_us_5b565ddbe4b0fd5c73c82457
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/imagine-dragons-love-loud-music-festival_us_5b565ddbe4b0fd5c73c82457
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/a-new-report-on-the-impa

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/support-from-coworkers-is-essential-for-breastfeeding_us_5b5b34e2e4b0eb29100e5960
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/support-from-coworkers-is-essential-for-breastfeeding_us_5b5b34e2e4b0eb29100e5960
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-changing-on-russia-meeting_us_5b5b0c0be4b0de86f4964cf4
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-changing-on-russia-meeting_us_5b5b0c0be4b0de86f4964cf4
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/democratic-congressional-campaign-committee-pay-interns-diversity_us_5b5a3065e4b0fd5c73cd5793
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/democratic-congressional-campaign-committee-pay-interns-diversity_us_5b5a3065e4b0fd5c73cd5793
DEBUG:_

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/amazon-face-recognition-tool-confused-28-lawmakers-with-arrestees_us_5b59c0c9e4b0de86f4941823
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/united-nations-us-tuberculosis-pharmaceutical-companies_us_5b594307e4b0fd5c73cb9369
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/united-nations-us-tuberculosis-pharmaceutical-companies_us_5b594307e4b0fd5c73cb9369
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-bardella-house-republicans-russia_us_5b5a1044e4b0b15aba96b025
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-bardella-house-republicans-russia_us_5b5a1044e4b0b15aba96b025
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HUFFPOST PERSONAL
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/post-interview-move-higher-s

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/david-wojnarowicz-whitney_us_5b4cd683e4b0bc69a78a14c8
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/david-wojnarowicz-whitney_us_5b4cd683e4b0bc69a78a14c8
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/fake-zebras-cairo-zoo_us_5b5a22bee4b0b15aba96c1f1
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/fake-zebras-cairo-zoo_us_5b5a22bee4b0b15aba96c1f1
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category WEDDINGS
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/michigan-gop-gerrymandering-emails_us_5b5971dbe4b0de86f4935f05
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/michigan-gop-gerrymandering-emails_us_5b5971dbe4b0de86f4935f05
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/3d-printed-guns-lawsuit_us_5b589e43e4

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/how-to-eat-hummus_us_5b573200e4b0b15aba927ea5
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/how-to-eat-hummus_us_5b573200e4b0b15aba927ea5
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Skipping category ENTERTAINMENT
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/money-diary-refinery29-problems_us_5b59d09be4b0b15aba960b7d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/money-diary-refinery29-problems_us_5b59d09be4b0b15aba960b7d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/lion-trophies-us-g

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/why-crackdown-fears-may-keep-legal-immigrants-from_us_5b57302be4b08c2f0a5e0fbe
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category HUFFPOST PERSONAL
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/dear-first-trimester_us_5b55f5f4e4b0eb29100e5908
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/dear-first-trimester_us_5b55f5f4e4b0eb29100e5908
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/gatsbying-instagram-social-media_us_5b5766c3e4b0fd5c73c97a23
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/gatsbying-instagram-social-media_us_5b5766c3e4b0fd5c73c97a23
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/doctors-policymakers-struggle-to-keep-up-as-opioid-use-surges-among-women_us_5b3b82fde4b07b827cbb0ca8
DEBUG:__main__:Exctracting fe

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/the-establishment-is-beating-back-the-progressive-revolution-bernie-sanders-and-alexandria-ocasio-cortez-want-to-turn-the-tide_us_5b53722fe4b0de86f48d95a3
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ivanka-trump-fashion-brand-twitter-reacts_us_5b581733e4b0de86f491b5ce
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ivanka-trump-fashion-brand-twitter-reacts_us_5b581733e4b0de86f491b5ce
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-poses-with-jeanine-pirro-and-the-book-shes-promoting-in-the-oval-office_us_5b57a692e4b0de86f4917eaf
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-poses-with-jeanine-pirro-and-the-book-shes-promoting-in-the-oval-office_us_5b57a692e4b0de86f4917eaf
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www

DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-corporate-penalties_us_5b589b72e4b0fd5c73caedac
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-corporate-penalties_us_5b589b72e4b0fd5c73caedac
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/donald-trump-get-me-a-coke-meme_us_5b585390e4b0de86f49213bd
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/donald-trump-get-me-a-coke-meme_us_5b585390e4b0de86f49213bd
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/photos-show-chinese-factory-workers-making-trump-2020-flags_us_5b589259e4b0b15aba94476a
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/photos-show-chinese-factory-workers-making-trump-2020-flags_us_5b589259e4b0b15aba94476a
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/e

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/iran-trump-tweet-threat_us_5b56c14ce4b0de86f49002d8
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/iran-trump-tweet-threat_us_5b56c14ce4b0de86f49002d8
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ritz-goldfish-salmonella-recall_us_5b56ba3de4b0fd5c73c84b22
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ritz-goldfish-salmonella-recall_us_5b56ba3de4b0fd5c73c84b22
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-butler-protests_us_5b564158e4b0fd5c73c8127f
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-butler-protests_us_5b564158e4b0fd5c73c8127f
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/attack-sanctuary-city-mayor-hate-crime-burien_us_5b5728d4e

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/mystery-shrouds-iowa-college-student-who-disappeared-while-jogging_us_5b57581ce4b0fd5c73c95e09
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/ice-domestic-violence-abuse_us_5b561740e4b0b15aba914404
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/ice-domestic-violence-abuse_us_5b561740e4b0b15aba914404
DEBUG:__main__:Saved to DB
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Skipping category COMEDY
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/bill-cosby-classified-as-sexually-violent-predator-by-criminal-assessment-board_us_5b578ef7e4b0de86f4916d35
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/bill-cosby-classified-as-sexually-violent-predator-by-criminal-assessment-board_us_5b578ef7e4b0de86f4916d35
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huf

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/environmental-activists-killed-every-week_us_5b55ed2be4b0de86f48f2fb6
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/environmental-activists-killed-every-week_us_5b55ed2be4b0de86f48f2fb6
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/justin-theroux-queer-eye_us_5b573d7ce4b0fd5c73c92f17
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/justin-theroux-queer-eye_us_5b573d7ce4b0fd5c73c92f17
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/sanders-defends-trump-tweet-russian-election-interference-hoax_us_5b56574de4b0b15aba91984d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/sanders-defends-trump-tweet-russian-election-interference-hoax_us_5b56574de4b0b15aba91984d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing 

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-nagel-birthright-israel-walkout_us_5b54f04fe4b0b15aba90107d
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-nagel-birthright-israel-walkout_us_5b54f04fe4b0b15aba90107d
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/27-funny-tweets-about-being-a-youngest-child_us_5b44dfa4e4b0c523e2636660
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/27-funny-tweets-about-being-a-youngest-child_us_5b44dfa4e4b0c523e2636660
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/opinion-kuttner-democrats-midterms_us_5b54fdbee4b0de86f48e4926
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/opinion-kuttner-democrats-midterms_us_5b54fdbee4b0de86f48e4926
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huf

DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-tapes-michael-avenatti_us_5b553e0be4b0b15aba902530
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/more-than-trump-recording-michael-cohen_us_5b562ac3e4b0b15aba917093
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/more-than-trump-recording-michael-cohen_us_5b562ac3e4b0b15aba917093
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-tweet-iran-hassan-rouhani_us_5b555051e4b0de86f48e5e33
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-tweet-iran-hassan-rouhani_us_5b555051e4b0de86f48e5e33
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/cuomo-pardon-immigrants_us_5b562bfde4b0b15aba9172c2
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/cuomo-pardon-immigrants_us_5b562bfde4b

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/radio-free-europe-posted-stealth-us-facebook-ads_us_5b5538e8e4b0fd5c73c6ae4e
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/radio-free-europe-posted-stealth-us-facebook-ads_us_5b5538e8e4b0fd5c73c6ae4e
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-revoking-security-clearances_us_5b562356e4b0de86f48f97bb
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-revoking-security-clearances_us_5b562356e4b0de86f48f97bb
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.huffingtonpost.com/entry/trump-epa-plans-to-revoke-californias-smog-fighting-power_us_5b5605ede4b0b15aba912e33
DEBUG:__main__:Exctracting features from https://www.huffingtonpost.com/entry/trump-epa-plans-to-revoke-californias-smog-fighting-power_us_5b5605ede4b0b15aba912e33
DEBUG:__main__:Saved to DB
DE