# Prep

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file
    FILE  : Filename
    
    Returns:
    --------
    Collection COL_NAME in DB_NAME database
    '''
    !mongoimport --db {DB_NAME} --collection {COL_NAME} --file {PATH+FILE} --batchSize 1
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [68]:
from bson.objectid import ObjectId

def show_doc(db, collection, id):
    '''
    Finds a document by 'id' and prints contents to the console
    
    Parameters:
    --------
    db         : database name
    collection : mongodb collection
    id         : mongodb document id
    
    Returns:
    --------
    Prints first 100 symbols of each document's key to console
    '''
    
    doc = db[collection].find_one({'_id':ObjectId(id)})
    for k in doc:
        print(f"{k} : {str(doc[k])}")

# Some comands to keep dbs clean

In [None]:
# deletes all 'meta' fields from all docs
# htmlCol.update({}, {$unset: {meta:1}}, false, true); # mongo shell comand
htmlCol.update({}, {'$unset': {'meta':1}}, multi=True) # pymongo way

In [None]:
# leaves only unique documents by 'url' field

htmlCol.create_index(
    "url",
    unique=True
)

In [None]:
# pymongo 'find' returns cursor that allows iterating through results
# calling first object [0] allows accessing the dictionary with results
# the ['html'] is the key in the dictionary
html = htmlCol.find({'url':'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},\
            projection={'html':True, '_id':False})[0]['html']

In [None]:
# find documents NOT containing a 'tag': regex expression
import re
tag = re.compile('dek___3AQpw.')
docs = htmlCol.find({"html" : {'$not': tag}})
for d in docs[:20]: print(d['url'])

# Multithreading

In [None]:
from multiprocessing import Process

# use multiprocessing to extract features
def func():
    DB_NAME = 'scrape'
    db = pm.MongoClient(host='localhost', port=27017, maxPoolSize=500)[DB_NAME]

    for collection in ['left','right']: docs_parser(db[collection])

proc = Process(target=func)
proc.start()

# **Production code**

In [2]:
# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# dependencies
import pymongo as pm

from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# define db 
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
db = client[DB_NAME]

In [None]:
SOURCES = {
    'left'  : [
        'https://newrepublic.com',
        'https://www.motherjones.com'
# 3. Slate
# 4. The Intercept
# 5. Daily Beast
# 6. The Atlantic
# 7. Washington Post
# 8. Politico
# 9. The Guardian
# 10. BBC
    ],
    'right' : [
        'https://www.breitbart.com'
# 2. Fox News
# 3. New York Post
# 4. The American Conservative
# 5. Washington Times
# 6. Daily Wire
# 7. The Fiscal Times
# 8. The Hill
# 9. The Daily Caller
# 10. Reason
    ]
}

In [3]:
def saveToDB(db, collection, url, html, meta={}):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value
    
    Parameters:
    --------
    db, collection  : mongo db connection
    url, html, meta : values to store
    
    Returns:
    --------
    Saved document
    """
    collection = db[collection]
    collection.update_one(
        {'url' : url},
        {
            '$set':
                {'url' : url,
                 'html' : html,
                 'meta' : meta
                }
        }
        ,
        upsert=True
    )

In [None]:
def scrape(url, db, collection):
    '''
    Scrapes an article from the 'url' up to the 'latest_date'
    
    Parameters:
    --------
    url         : main news website url
    date        : YYYY-MM-DD
    db          : database name
    collection  : mongodb collection
    
    Returns:
    --------
    Article's html and features stored to db, 
    Article's publish date
    
    '''
    article = Article(url)
    article.download()
    # the below method may only extract a snippet... 
    # check the database for results of text extraction
    # and apply additional processing if needed after 
    # article has been stored in the DB
    # see code below Newrepublic for example
    article.parse()
    
    saveToDB(db, collection, article.url, article.html, meta={
        'date'    :article.publish_date,
        'title'   :article.title,
        'text'    :article.text,
        'authors' :article.authors
    })
    
    return article.publish_date

# Newrepublic

In [62]:
import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser

# name of collection for this media
collection = 'newRep'
source = 'https://newrepublic.com/latest'
page   = 1

earliest_date = date_parser('2018-01-01')

while True:
    log.debug(f'PROCESSING PAGE: {page}')
    s = Source(source+'?page='+str(page))
    s.download()

    soup = bs.BeautifulSoup(s.html,'lxml')

    # line below needs to be updated per news source
    # to include the specific tags for article text 
    # defined differently for each site
    for section in soup.findAll('article'):
        url = urljoin(s.url, section.a['href'])
        log.debug(f'Processing url: {url}')
        
        article_date = scrape(url, db, collection)

    if article_date < earliest_date:
        log.debug(f'Reached earliest date requested: {article_date}')
        break
    page += 1

DEBUG:__main__:PROCESSING PAGE: 1
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=1 HTTP/1.1" 200 78494
DEBUG:__main__:Processing url: https://newrepublic.com/article/150202/trumps-farmer-bailout-americas-broken-food-system
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/150202/trumps-farmer-bailout-americas-broken-food-system HTTP/1.1" 200 20173
DEBUG:__main__:Processing url: https://newrepublic.com/article/150169/zadie-smiths-right-wrong
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/150169/zadie-smiths-right-wrong HTTP/1.1" 200 20603
DEBUG:__main__:Processing url: https://newrepublic.com/article/150159/doxx-racist
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1

DEBUG:__main__:Processing url: https://newrepublic.com/article/149947/white-house-pr-nightmare-never-came
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149947/white-house-pr-nightmare-never-came HTTP/1.1" 200 20967
DEBUG:__main__:Processing url: https://newrepublic.com/article/149955/trump-not-putins-puppet
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149955/trump-not-putins-puppet HTTP/1.1" 200 19697
DEBUG:__main__:Processing url: https://newrepublic.com/article/149950/russians-saw-helsinki-summit
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149950/russians-saw-helsinki-summit HTTP/1.1" 200 19777
DEBUG:__main__:Processing url: https://newrepublic.com/article/149939/american

DEBUG:__main__:PROCESSING PAGE: 6
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=6 HTTP/1.1" 200 60122
DEBUG:__main__:Processing url: https://newrepublic.com/article/149717/watershed-moment-american-history-trump-nominates-brett-kavanaugh
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149717/watershed-moment-american-history-trump-nominates-brett-kavanaugh HTTP/1.1" 200 20686
DEBUG:__main__:Processing url: https://newrepublic.com/article/149683/trump-might-lose-trade-war-china
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149683/trump-might-lose-trade-war-china HTTP/1.1" 200 19429
DEBUG:__main__:Processing url: https://newrepublic.com/article/149628/americas-enduring-failure

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149491/1990s-got-wrong HTTP/1.1" 200 22743
DEBUG:__main__:Processing url: https://newrepublic.com/article/149438/big-pharma-captured-one-percent
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149438/big-pharma-captured-one-percent HTTP/1.1" 200 42370
DEBUG:__main__:Processing url: https://newrepublic.com/article/149433/glow-gets-empowered
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149433/glow-gets-empowered HTTP/1.1" 200 20848
DEBUG:__main__:Processing url: https://newrepublic.com/article/149437/echoes-chinese-exclusion
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149437/echoes-chinese-exclusion HTTP/1.1

DEBUG:__main__:Processing url: https://newrepublic.com/article/149309/wilbur-rosss-financial-shenanigans-massive-scandal
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149309/wilbur-rosss-financial-shenanigans-massive-scandal HTTP/1.1" 200 19593
DEBUG:__main__:Processing url: https://newrepublic.com/article/148864/democrats-mayors-presidential-candidates
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148864/democrats-mayors-presidential-candidates HTTP/1.1" 200 23787
DEBUG:__main__:PROCESSING PAGE: 11
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=11 HTTP/1.1" 200 98360
DEBUG:__main__:Processing url: https://newrepublic.com/article/149305/atttime-warner-merger-already-governm

DEBUG:__main__:Processing url: https://newrepublic.com/article/149145/jesus-lock-up
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149145/jesus-lock-up HTTP/1.1" 200 18958
DEBUG:__main__:Processing url: https://newrepublic.com/article/149138/conservatives-conned-james-comey
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149138/conservatives-conned-james-comey HTTP/1.1" 200 21522
DEBUG:__main__:Processing url: https://newrepublic.com/article/149128/tea-tariffs-rural-yunnan
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149128/tea-tariffs-rural-yunnan HTTP/1.1" 200 20948
DEBUG:__main__:Processing url: https://newrepublic.com/article/149118/empty-space-rachel-cusk
DEBUG:urllib3.conn

DEBUG:__main__:Processing url: https://newrepublic.com/article/148762/united-states-v-trump-supreme-court-case
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148762/united-states-v-trump-supreme-court-case HTTP/1.1" 200 21477
DEBUG:__main__:Processing url: https://newrepublic.com/article/148293/democracy-isnt-enough-elections-free-markets-latin-america
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148293/democracy-isnt-enough-elections-free-markets-latin-america HTTP/1.1" 200 29971
DEBUG:__main__:Processing url: https://newrepublic.com/article/148744/howard-schultzs-third-way
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148744/howard-schultzs-third-way HTTP/1.1" 200 19027
DEBU

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148606/nuclear-industrys-winners-losers HTTP/1.1" 200 21518
DEBUG:__main__:Processing url: https://newrepublic.com/article/148603/roseanne-barr-science-ambien-tweeting
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148603/roseanne-barr-science-ambien-tweeting HTTP/1.1" 200 18713
DEBUG:__main__:Processing url: https://newrepublic.com/article/148609/religious-rights-metoo-reckoning-coming
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148609/religious-rights-metoo-reckoning-coming HTTP/1.1" 200 30686
DEBUG:__main__:Processing url: https://newrepublic.com/article/148601/family-values
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148515/trumps-war-labor-now-includes-teens HTTP/1.1" 200 19117
DEBUG:__main__:Processing url: https://newrepublic.com/article/148512/party-women-party-white-men
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148512/party-women-party-white-men HTTP/1.1" 200 20030
DEBUG:__main__:Processing url: https://newrepublic.com/article/148505/turns-outsider-president
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148505/turns-outsider-president HTTP/1.1" 200 22203
DEBUG:__main__:Processing url: https://newrepublic.com/article/148507/netflixs-obama-deal-says-future-streaming
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/1

DEBUG:__main__:Processing url: https://newrepublic.com/article/148452/myth-trumps-populist-revolt
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148452/myth-trumps-populist-revolt HTTP/1.1" 200 20733
DEBUG:__main__:Processing url: https://newrepublic.com/article/148451/many-years-will-robert-mueller-need
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148451/many-years-will-robert-mueller-need HTTP/1.1" 200 21470
DEBUG:__main__:Processing url: https://newrepublic.com/article/148444/chesil-beach-longing-unfulfilled
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148444/chesil-beach-longing-unfulfilled HTTP/1.1" 200 18182
DEBUG:__main__:Processing url: https://newrepublic.com/article

DEBUG:__main__:Processing url: https://newrepublic.com/article/148190/know-healing-crystals-come-from
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148190/know-healing-crystals-come-from HTTP/1.1" 200 31557
DEBUG:__main__:Processing url: https://newrepublic.com/article/148388/long-tortured-history-job-guarantee
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148388/long-tortured-history-job-guarantee HTTP/1.1" 200 22600
DEBUG:__main__:Processing url: https://newrepublic.com/article/148380/democrats-can-over-promise-like-trump
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148380/democrats-can-over-promise-like-trump HTTP/1.1" 200 23089
DEBUG:__main__:Processing url: https://newre

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148313/understanding-authoritarianism-soccer HTTP/1.1" 200 22613
DEBUG:__main__:Processing url: https://newrepublic.com/article/148312/weird-fiction-alive
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148312/weird-fiction-alive HTTP/1.1" 200 20465
DEBUG:__main__:Processing url: https://newrepublic.com/article/148308/centrist-grievance-victim-politics
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148308/centrist-grievance-victim-politics HTTP/1.1" 200 21520
DEBUG:__main__:Processing url: https://newrepublic.com/article/148311/dont-blame-phones-narcissism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148201/columbias-war-labor HTTP/1.1" 200 18876
DEBUG:__main__:Processing url: https://newrepublic.com/article/148199/five-questions-trump-missing-robert-muellers-list
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148199/five-questions-trump-missing-robert-muellers-list HTTP/1.1" 200 20294
DEBUG:__main__:Processing url: https://newrepublic.com/article/148198/marvel-killing-movies
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148198/marvel-killing-movies HTTP/1.1" 200 22390
DEBUG:__main__:Processing url: https://newrepublic.com/article/148194/migrant-caravan-enters-trumps-hostile-immigration-maze
DEBUG:urllib3.connectionpool:Starting

DEBUG:__main__:PROCESSING PAGE: 33
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=33 HTTP/1.1" 200 66181
DEBUG:__main__:Processing url: https://newrepublic.com/article/148135/usda-goes-hog-wild
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148135/usda-goes-hog-wild HTTP/1.1" 200 19646
DEBUG:__main__:Processing url: https://newrepublic.com/article/148137/wendell-berry-wants
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148137/wendell-berry-wants HTTP/1.1" 200 19512
DEBUG:__main__:Processing url: https://newrepublic.com/article/148126/epa-acting-like-big-tobacco
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https:

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148077/seems-good-true HTTP/1.1" 200 19196
DEBUG:__main__:Processing url: https://newrepublic.com/article/148076/slippery-james-comey
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148076/slippery-james-comey HTTP/1.1" 200 23858
DEBUG:__main__:Processing url: https://newrepublic.com/article/148063/amy-schumers-brain-damage
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148063/amy-schumers-brain-damage HTTP/1.1" 200 19073
DEBUG:__main__:Processing url: https://newrepublic.com/article/148062/important-number-business
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148062/important-number-business HTTP/1.1" 200 20

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148013/will-supreme-court-rein-civil-forfeiture HTTP/1.1" 200 19127
DEBUG:__main__:PROCESSING PAGE: 38
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=38 HTTP/1.1" 200 124145
DEBUG:__main__:Processing url: https://newrepublic.com/article/148011/problem-cold-war-comparisons
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148011/problem-cold-war-comparisons HTTP/1.1" 200 22014
DEBUG:__main__:Processing url: https://newrepublic.com/article/148005/probably-not-end-stages-trumps-presidency
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /articl

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147894/bhagwans-bottom-line-i HTTP/1.1" 200 18719
DEBUG:__main__:Processing url: https://newrepublic.com/article/147863/bhagwans-death-wish
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147863/bhagwans-death-wish HTTP/1.1" 200 19002
DEBUG:__main__:Processing url: https://newrepublic.com/article/147902/bhagwans-mind-control
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147902/bhagwans-mind-control HTTP/1.1" 200 19268
DEBUG:__main__:Processing url: https://newrepublic.com/article/147871/bhagwans-sexism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:

DEBUG:__main__:PROCESSING PAGE: 43
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=43 HTTP/1.1" 200 70118
DEBUG:__main__:Processing url: https://newrepublic.com/article/147882/us-can-learn-britain-equal-pay
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147882/us-can-learn-britain-equal-pay HTTP/1.1" 200 19267
DEBUG:__main__:Processing url: https://newrepublic.com/article/147923/facebooks-innocence-project
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147923/facebooks-innocence-project HTTP/1.1" 200 19065
DEBUG:__main__:Processing url: https://newrepublic.com/article/147359/nuisance-laws-making-poverty-crime
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newr

DEBUG:__main__:Processing url: https://newrepublic.com/article/147753/strange-online-aesthetic-youtube-shooting-suspect
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147753/strange-online-aesthetic-youtube-shooting-suspect HTTP/1.1" 200 19756
DEBUG:__main__:Processing url: https://newrepublic.com/article/147751/humble-proposal-sanction-hungarian-kleptocrats
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147751/humble-proposal-sanction-hungarian-kleptocrats HTTP/1.1" 200 23846
DEBUG:__main__:Processing url: https://newrepublic.com/article/147377/model-businessman-dave-eggers-misses-story
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147377/model-businessman-dave-eggers-misses-st

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147374/company-men-legal-struggle-citizens-united-corporations-rights-people HTTP/1.1" 200 36229
DEBUG:__main__:Processing url: https://newrepublic.com/article/147683/culture-violent-white-guys
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147683/culture-violent-white-guys HTTP/1.1" 200 20702
DEBUG:__main__:PROCESSING PAGE: 48
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=48 HTTP/1.1" 200 73931
DEBUG:__main__:Processing url: https://newrepublic.com/article/147691/provocative-brilliance-death-stalin
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.co

DEBUG:__main__:Processing url: https://newrepublic.com/article/147623/saffron-curtain-buddhism-weaponized-cold-war
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147623/saffron-curtain-buddhism-weaponized-cold-war HTTP/1.1" 200 21854
DEBUG:__main__:Processing url: https://newrepublic.com/article/147619/pinkertons-still-never-sleep
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147619/pinkertons-still-never-sleep HTTP/1.1" 200 20830
DEBUG:__main__:Processing url: https://newrepublic.com/article/147621/worst-job-washington-trump-lawyer-john-dowd-quits
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147621/worst-job-washington-trump-lawyer-john-dowd-quits HTTP/1.1" 200 19586
DEBUG:__

DEBUG:__main__:Processing url: https://newrepublic.com/article/147531/banality-disappointment
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147531/banality-disappointment HTTP/1.1" 200 19073
DEBUG:__main__:Processing url: https://newrepublic.com/article/147391/hype-best
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147391/hype-best HTTP/1.1" 200 35180
DEBUG:__main__:Processing url: https://newrepublic.com/article/147366/anti-intelligence
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147366/anti-intelligence HTTP/1.1" 200 43710
DEBUG:__main__:PROCESSING PAGE: 53
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147432/dont-look-democrats-regulate-big-tech HTTP/1.1" 200 19144
DEBUG:__main__:Processing url: https://newrepublic.com/article/147423/will-another-white-big-banker-oversee-wall-street
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147423/will-another-white-big-banker-oversee-wall-street HTTP/1.1" 200 20096
DEBUG:__main__:Processing url: https://newrepublic.com/article/147426/forget-tax-breaks-education-key-attracting-businesses
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147426/forget-tax-breaks-education-key-attracting-businesses HTTP/1.1" 200 20215
DEBUG:__main__:Processing url: https://newrepublic.com/article/147416/moviepass-

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147303/reality-womens-pain HTTP/1.1" 200 22335
DEBUG:__main__:Processing url: https://newrepublic.com/article/147293/rise-corporate-social-responsibility
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147293/rise-corporate-social-responsibility HTTP/1.1" 200 19949
DEBUG:__main__:Processing url: https://newrepublic.com/article/147294/congress-can-wage-effective-trade-war
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147294/congress-can-wage-effective-trade-war HTTP/1.1" 200 21074
DEBUG:__main__:Processing url: https://newrepublic.com/article/147290/trumps-disdain-democracy-promotion
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.co

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=60 HTTP/1.1" 200 65436
DEBUG:__main__:Processing url: https://newrepublic.com/article/147223/trumps-fantasies-meet-harsh-reality-presidency
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147223/trumps-fantasies-meet-harsh-reality-presidency HTTP/1.1" 200 20813
DEBUG:__main__:Processing url: https://newrepublic.com/article/147226/good-girls-ordinary-women-turn-crime
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147226/good-girls-ordinary-women-turn-crime HTTP/1.1" 200 19960
DEBUG:__main__:Processing url: https://newrepublic.com/article/147227/court-cloud
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/14722

DEBUG:__main__:Processing url: https://newrepublic.com/article/147167/cpac-kids-alright-on-guns
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147167/cpac-kids-alright-on-guns HTTP/1.1" 200 19461
DEBUG:__main__:Processing url: https://newrepublic.com/article/147160/third-way-think-russiagate
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147160/third-way-think-russiagate HTTP/1.1" 200 20007
DEBUG:__main__:Processing url: https://newrepublic.com/article/147142/photos-parkland-survivors-take-stand
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147142/photos-parkland-survivors-take-stand HTTP/1.1" 200 23664
DEBUG:__main__:Processing url: https://newrepublic.com/article/147164/billy-

DEBUG:__main__:PROCESSING PAGE: 65
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=65 HTTP/1.1" 200 115568
DEBUG:__main__:Processing url: https://newrepublic.com/article/147098/halt-labors-slow-death
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147098/halt-labors-slow-death HTTP/1.1" 200 18261
DEBUG:__main__:Processing url: https://newrepublic.com/article/147102/opportunistic-rise-europes-far-right
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147102/opportunistic-rise-europes-far-right HTTP/1.1" 200 20757
DEBUG:__main__:Processing url: https://newrepublic.com/article/147087/mirai-nagasu-chloe-kim-not-immigrant-fantasy
DEBUG:urllib3.connectionpool:Starting new HTTPS connect

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146908/information-wants-chinese HTTP/1.1" 200 20774
DEBUG:__main__:Processing url: https://newrepublic.com/article/147031/facebooks-purpose-digital-addiction
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147031/facebooks-purpose-digital-addiction HTTP/1.1" 200 19826
DEBUG:__main__:Processing url: https://newrepublic.com/article/147011/rural-americas-drinking-water-crisis
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147011/rural-americas-drinking-water-crisis HTTP/1.1" 200 35494
DEBUG:__main__:Processing url: https://newrepublic.com/article/147018/memo-debate-republican-sham
DEBUG:urllib3.connectionpool:Starting new HTTPS connecti

DEBUG:__main__:Processing url: https://newrepublic.com/article/146959/truth-appalachia
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146959/truth-appalachia HTTP/1.1" 200 19875
DEBUG:__main__:PROCESSING PAGE: 70
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=70 HTTP/1.1" 200 75269
DEBUG:__main__:Processing url: https://newrepublic.com/article/146952/elizabeth-warren-model-political-leadership
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146952/elizabeth-warren-model-political-leadership HTTP/1.1" 200 20527
DEBUG:__main__:Processing url: https://newrepublic.com/article/146945/waco-perfect-show-trump-erain-bad-way
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146821/lively-irrelevance-conservative-magazines HTTP/1.1" 200 21585
DEBUG:__main__:Processing url: https://newrepublic.com/article/146796/scientists-saved-bear-fish-skinand-stumbled-game-changing-idea
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146796/scientists-saved-bear-fish-skinand-stumbled-game-changing-idea HTTP/1.1" 200 22658
DEBUG:__main__:Processing url: https://newrepublic.com/article/146820/man-us-steel
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146820/man-us-steel HTTP/1.1" 200 20046
DEBUG:__main__:Processing url: https://newrepublic.com/article/146806/democrats-can-learn-cecile-richards
DEBUG:urllib3.connectionpo

DEBUG:__main__:Processing url: https://newrepublic.com/article/146691/trump-likely-finish-first-term
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146691/trump-likely-finish-first-term HTTP/1.1" 200 22523
DEBUG:__main__:Processing url: https://newrepublic.com/article/146683/trump-fox-news-mainstreaming-white-nationalism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146683/trump-fox-news-mainstreaming-white-nationalism HTTP/1.1" 200 19335
DEBUG:__main__:PROCESSING PAGE: 75
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=75 HTTP/1.1" 200 67098
DEBUG:__main__:Processing url: https://newrepublic.com/article/146677/trumps-middle-east-policy-goodish-bad-ugly
DEBUG:urllib3.connecti

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146589/mormonisms-crisis-faith HTTP/1.1" 200 22513
DEBUG:__main__:Processing url: https://newrepublic.com/article/146600/trumps-alarming-definition-treason
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146600/trumps-alarming-definition-treason HTTP/1.1" 200 19042
DEBUG:__main__:Processing url: https://newrepublic.com/article/146596/trumps-global-bigotry
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146596/trumps-global-bigotry HTTP/1.1" 200 20186
DEBUG:__main__:Processing url: https://newrepublic.com/article/146598/conservatives-trustbusters-now
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:u

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146501/afterlife-steve-bannon HTTP/1.1" 200 22387
DEBUG:__main__:Processing url: https://newrepublic.com/article/146507/trumps-voter-fraud-crusade-just-beginning
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146507/trumps-voter-fraud-crusade-just-beginning HTTP/1.1" 200 18994
DEBUG:__main__:Processing url: https://newrepublic.com/article/146505/michael-wolffs-revelations
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146505/michael-wolffs-revelations HTTP/1.1" 200 19628
DEBUG:__main__:PROCESSING PAGE: 80
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.c

In [96]:
# update text field to include more data
for doc in db[collection].find():
    soup = bs.BeautifulSoup(doc['html'],'lxml')
    text = ''
    for div in soup.findAll('div',{"class": "content-body"}):
        text += div.text
    if len(doc['meta']['text']) < len(text):
        db[collection].update_one(
            {'url' : doc['url']},
            {
                '$set':
                    {
                     'meta.text' : text
                    }
            }
            ,
            upsert=True
        )

In [97]:
url = 'https://newrepublic.com/article/146927/bad-dreams'
doc = db[collection].find_one({'url':url})

# The Atlantic

In [6]:
import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz

utc=pytz.UTC

# name of collection for this media
collection = 'theAtlantic'
source = 'https://www.theatlantic.com/latest/'

# start 'page' at '1' but if you run across an error
# efficient way is to update this page to the same number
# where you experienced the error AFTER you correct the error in the code
# then rerun this cell
page   = 78

earliest_date = utc.localize(date_parser('2018-01-01'))

while True:
    log.debug(f'\n\n PROCESSING PAGE: {source+"?page="+str(page)}\n\n\
              ====================================\n\n')
    s = Source(source+'?page='+str(page))
    page += 1
    s.download()
    soup = bs.BeautifulSoup(s.html,'lxml')

    # line below needs to be updated per news source
    # to include the specific tags for article text 
    # defined differently for each site
    for section in soup.findAll('li', {"class":"article"}):
        url = urljoin(s.url, section.a['href'])
        log.debug(f'Processing url: {url}')
        
        article_date = scrape(url, db, collection)
        
        # the Atlantic blocks right away after few quick downloads
        # so it requires sleeping, testing showed 2 to 7 seconds is enough
        sleep(random.uniform(2,7))

    try:
        if utc.localize(article_date) < earliest_date:
            log.debug(f'Reached earliest date requested: {article_date}')
            break
    except Exception as e:
        log.debug(f'Exception: {e}')
        continue
    

DEBUG:__main__:

 PROCESSING PAGE: https://www.theatlantic.com/latest/?page=50



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /latest/?page=50 HTTP/1.1" 200 24756
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/the-democratic-party-wants-to-end-unemployment/560153/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/the-democratic-party-wants-to-end-unemployment/560153/ HTTP/1.1" 200 45321
DEBUG:__main__:Processing url: https://www.theatlantic.com/health/archive/2018/05/protecting-police-dogs-from-fentanyl/560132/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /health/archive/2018/05/protecting-police-dogs-from-

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /magazine/archive/2018/06/a-muslim-among-the-settlers/559145/ HTTP/1.1" 200 62848
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/patrick-melrose-showtime-sky-atlantic-review/560009/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/patrick-melrose-showtime-sky-atlantic-review/560009/ HTTP/1.1" 200 33382
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/when-white-nationalists-try-the-cop-defense/560045/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/when-white-nationalists-try-the-cop-defense/560045/ HTTP/1.1" 200 34480
DEBUG:__main__:Processing url: https://www.theatlantic.com/techn

DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/the-world-still-spins-around-male-genius/559925/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/the-world-still-spins-around-male-genius/559925/ HTTP/1.1" 200 37361
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/blankenship-west-virginia-republican-establishment-mcconnell-trump/560006/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/blankenship-west-virginia-republican-establishment-mcconnell-trump/560006/ HTTP/1.1" 200 33622
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/05/ebola-returns-to-the-congo-just-as-trump-decides-to-rescind-ebola-funds/560012/
DEBUG:urlli

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/05/iran-deal/559964/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/05/iran-deal/559964/ HTTP/1.1" 200 31222
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/05/us-leaves-iran-deal/559646/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/05/us-leaves-iran-deal/559646/ HTTP/1.1" 200 33732
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/the-stunning-speed-of-eric-schneidermans-resignation/559922/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/201

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/gina-haspels-lose-lose-proposition-for-democrats/559826/ HTTP/1.1" 200 33418
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/05/trump-iran-nuclear-deal/559727/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/05/trump-iran-nuclear-deal/559727/ HTTP/1.1" 200 33132
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/donald-glover-this-is-america-childish-gambino/559805/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/donald-glover-this-is-america-childish-gambino/559805/ H

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/05/iran-deal-nuclear-crisis/559730/ HTTP/1.1" 200 35447
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/05/photos-of-kilaueas-newest-lava-fissures-on-hawaiis-big-island/559751/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/05/photos-of-kilaueas-newest-lava-fissures-on-hawaiis-big-island/559751/ HTTP/1.1" 200 28543
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/05/aaron-traywick-death-ascendance-biomedical/559745/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/05/aaron-traywick-death-ascendance-biomedical/5597

DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/05/how-a-medieval-society-survived-nearly-60-years-of-drought/559616/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/05/how-a-medieval-society-survived-nearly-60-years-of-drought/559616/ HTTP/1.1" 200 32513
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/05/problems-with-breastfeeding-breast-pump/559559/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/05/problems-with-breastfeeding-breast-pump/559559/ HTTP/1.1" 200 34276
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/tully-jason-reitman-diablo-cody-charlize-theron-review/559538/
DEBUG:urllib3.connectionpool:Starting new HTTPS connecti

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/the-rain-review-netflix/559439/ HTTP/1.1" 200 32136
DEBUG:__main__:Exception: Not naive datetime (tzinfo is already set)
DEBUG:__main__:

 PROCESSING PAGE: https://www.theatlantic.com/latest/?page=55



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /latest/?page=55 HTTP/1.1" 200 24665
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/sbc-patterson/559532/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/sbc-patterson/559532/ HTTP/1.1" 200 34097
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/american-

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/05/how-facebook-became-the-tech-company-people-love-to-hate/559418/ HTTP/1.1" 200 31893
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/animating-the-atlantic-archives-helen-keller-du-bois-robert-frost-albert-einstein/559388/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/animating-the-atlantic-archives-helen-keller-du-bois-robert-frost-albert-einstein/559388/ HTTP/1.1" 200 35167
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/05/the-future-hardships-of-rural-america/559343/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/05/the-future-hardships-of-rural-america

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /education/archive/2018/05/university-california-pell-graduation-enrollment/559325/ HTTP/1.1" 200 33058
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/05/facebook-the-unstoppable/559301/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/05/facebook-the-unstoppable/559301/ HTTP/1.1" 200 32761
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/05/john-mulaneys-kid-gorgeous-netflix-special-review/559286/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/05/john-mulaneys-kid-gorgeous-netflix-special-review/559286/ HTTP/1.1" 200 32648
DEBUG:__main__:Processing url: https://www.theatlantic.com/politic

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/04/what-happened-at-the-white-house-correspondents-dinner/559232/ HTTP/1.1" 200 35163
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/iran-deal/559235/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/iran-deal/559235/ HTTP/1.1" 200 34527
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/germany-jews-muslim-migrants/558677/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/germany-jews-muslim-migrants/558677/ HTTP/1.1" 200 39303
DEBUG:__main__:Processing url: htt

DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/04/disobedience-review/558974/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/04/disobedience-review/558974/ HTTP/1.1" 200 31865
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/north-south-korea-peace-treaty/558932/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/north-south-korea-peace-treaty/558932/ HTTP/1.1" 200 33534
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/golden-state-killer-east-area-rapist-dna-genealogy/559070/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatla

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/the-difference-between-killer-and-terrorist/558998/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/the-difference-between-killer-and-terrorist/558998/ HTTP/1.1" 200 38264
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/what-happens-when-your-bomb-defusing-robot-becomes-a-weapon/558758/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/04/what-happens-when-your-bomb-defusing-robot-becomes-a-weapon/558758/ HTTP/1.1" 200 38185
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/how-slack-got-ahead-in-diversity/558806/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/will-the-supreme-court-bless-trumps-travel-ban/558956/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/will-the-supreme-court-bless-trumps-travel-ban/558956/ HTTP/1.1" 200 33722
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/the-evidence-is-not-with-joy-reid/558935/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/04/the-evidence-is-not-with-joy-reid/558935/ HTTP/1.1" 200 32759
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/04/raising-kids-living-together-and-not-a-wedding-ring-in-sight/558917/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/trump-ronny-jackson-senate-confirmation/558833/ HTTP/1.1" 200 33296
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/toronto-van-incel/558836/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/toronto-van-incel/558836/ HTTP/1.1" 200 32776
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/how-cities-defend-against-vehicular-attacks/558782/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/how-cities-defend-against-vehicular-attacks/558782/ HTTP/1.1" 200 31613
DEBUG:__main__:Exception: Not naive datetime (tzinfo is already set)
DEBUG:__main__:

 PROCESSIN

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /newsletters/archive/2018/04/the-atlantic-daily-april-23-2018/558722/ HTTP/1.1" 200 35339
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/the-atlantic-politics-policy-daily-macron-america-great-again/558740/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/the-atlantic-politics-policy-daily-macron-america-great-again/558740/ HTTP/1.1" 200 32395
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/04/kanye-west-shania-twain-candace-owens-donald-trump/558683/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/04/kanye-west-shania-twain-candace-owens-donald-trump/558683/ HTTP/1.1" 200 33735
DEBUG:__

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/north-korea-trump-kim-jong-un-nuclear-weapons-summit/558644/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/north-korea-trump-kim-jong-un-nuclear-weapons-summit/558644/ HTTP/1.1" 200 31860
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/04/photos-from-state-dinners-past/558578/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/04/photos-from-state-dinners-past/558578/ HTTP/1.1" 200 31517
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/jonathan-rauch-adolescence/556865/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/the-atlantic-politics-policy-daily-blunt-talk/558587/ HTTP/1.1" 200 30105
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/comey-mueller/558599/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/comey-mueller/558599/ HTTP/1.1" 200 30635
DEBUG:__main__:Processing url: https://www.theatlantic.com/notes/2018/04/what-comey-did-wrong/558536/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /notes/2018/04/what-comey-did-wrong/558536/ HTTP/1.1" 200 33989
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/cuba-diaz

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /letters/archive/2018/04/letters-the-ethical-options/558161/ HTTP/1.1" 200 34263
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/bajau-sea-nomads-diving-evolution-spleen/558359/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/bajau-sea-nomads-diving-evolution-spleen/558359/ HTTP/1.1" 200 34524
DEBUG:__main__:Processing url: https://www.theatlantic.com/business/archive/2018/04/philip-glass-taxi-driver-composer/558278/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /business/archive/2018/04/philip-glass-taxi-driver-composer/558278/ HTTP/1.1" 200 36835
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/radio-atlantic-t

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/donald-trump-hasnt-taken-over-the-gop/558305/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/donald-trump-hasnt-taken-over-the-gop/558305/ HTTP/1.1" 200 35684
DEBUG:__main__:Processing url: https://www.theatlantic.com/health/archive/2018/04/autism-communication-language-social/558326/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /health/archive/2018/04/autism-communication-language-social/558326/ HTTP/1.1" 200 40538
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/the-persuasive-female/556847/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/04/dear-therapist-should-i-contact-my-birth-mom/558245/ HTTP/1.1" 200 33507
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/how-to-sway-a-baboon-despot/556892/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /magazine/archive/2018/05/how-to-sway-a-baboon-despot/556892/ HTTP/1.1" 200 31628
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/madeleine-albright-conversation-fascism/558254/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/madeleine-albright-conversation-fascism/558254/ HTTP/1.1" 200 32488
DEBUG:__main_

DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/searching-the-arctic-seas-for-lifesaving-drugs/557231/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/searching-the-arctic-seas-for-lifesaving-drugs/557231/ HTTP/1.1" 200 43338
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/startle-reflex-evolution-emotion/558158/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/startle-reflex-evolution-emotion/558158/ HTTP/1.1" 200 32369
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/zapping-peoples-brains-didnt-cure-their-depression-until-it-did/558032/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
D

DEBUG:__main__:Processing url: https://www.theatlantic.com/education/archive/2018/04/college-online-degree-blended-learning/557642/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /education/archive/2018/04/college-online-degree-blended-learning/557642/ HTTP/1.1" 200 33434
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/in-praise-of-fair-weather-fandom/556841/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /magazine/archive/2018/05/in-praise-of-fair-weather-fandom/556841/ HTTP/1.1" 200 36779
DEBUG:__main__:Processing url: https://www.theatlantic.com/health/archive/2018/04/how-france-reduced-heroin-overdoses-by-79-in-four-years/558023/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connec

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/syria-chemical-weapons/558041/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/syria-chemical-weapons/558041/ HTTP/1.1" 200 33537
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/yeast-sequencing-china/557930/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/yeast-sequencing-china/557930/ HTTP/1.1" 200 31021
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/the-mobster-who-bought-his-teenage-son-a-hockey-team/556853/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/assault-on-democracy/557912/ HTTP/1.1" 200 31828
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/are-we-earths-only-civilization/557180/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/are-we-earths-only-civilization/557180/ HTTP/1.1" 200 34808
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/04/rampage-review/557924/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/04/rampage-review/557924/ HTTP/1.1" 200 32256
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/ryan/557879/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/ryan/557879/ HTTP/1.1" 200 34349
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/the-mystery-of-the-hummingbirds-whistling-tail/557858/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/the-mystery-of-the-hummingbirds-whistling-tail/557858/ HTTP/1.1" 200 34059
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/04/child-data-privacy/557840/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/04/child-data-

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/paul-ryan-medicare-medicaid-trump-deficits/557798/ HTTP/1.1" 200 32578
DEBUG:__main__:Processing url: https://www.theatlantic.com/video/index/557696/home-school-episode-1/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /video/index/557696/home-school-episode-1/ HTTP/1.1" 200 26112
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/04/china-viewed-from-above/557789/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/04/china-viewed-from-above/557789/ HTTP/1.1" 200 30329
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/russia-syria-fake-news/557660/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theat

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/mta-zba/557702/ HTTP/1.1" 200 34818
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/the-myth-of-learning-styles/557687/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/the-myth-of-learning-styles/557687/ HTTP/1.1" 200 34496
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/ranked-choice-voting-maine/557669/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/ranked-choice-voting-maine/557669/ HTTP/1.1" 200 34817
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/midterms-impeachment/557609/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/midterms-impeachment/557609/ HTTP/1.1" 200 29149
DEBUG:__main__:Exception: Not naive datetime (tzinfo is already set)
DEBUG:__main__:

 PROCESSING PAGE: https://www.theatlantic.com/latest/?page=70



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /latest/?page=70 HTTP/1.1" 200 24728
DEBUG:__main__:Processing url: https://www.theatlantic.com/magazine/archive/2018/05/barbara-ehrenreich-natural-causes/556859/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /magazine/archive/2018/05/barbara-eh

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/04/syria-israel-iran/557501/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/04/syria-israel-iran/557501/ HTTP/1.1" 200 31950
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/04/black-parents-media-stereotypes/557408/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/04/black-parents-media-stereotypes/557408/ HTTP/1.1" 200 31770
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/mark-zuckerberg-atlantic-exclusive/557489/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /newsletters/archive/2018/04/the-atlantic-daily-april-6-2018/557444/ HTTP/1.1" 200 33548
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/the-atlantic-politics-policy-daily-blake-news/557453/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/the-atlantic-politics-policy-daily-blake-news/557453/ HTTP/1.1" 200 32091
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/radio-atlantic-trumpocracy/557447/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/radio-atlantic-trumpocracy/557447/ HTTP/1.1" 200 29317
DEBUG:__mai

DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/us-china/557345/ HTTP/1.1" 200 36266
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/black-holes-milky-way-galaxy/557333/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/black-holes-milky-way-galaxy/557333/ HTTP/1.1" 200 31805
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/04/bitcoin-a-stock-photo-cryptocurrency-primer/557339/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/04/bitcoin-a-stock-photo-cryptocurrency-primer/557339/ HTTP/1.1" 200 28226
DEBUG:__main__:Processing url: https://www.theatlantic.com/video/index/557324/revolving-doors/
DEBUG:urllib3.connectionpool:Starting new HTTPS connec

DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/what-motivated-the-youtube-shooters-terrorism/557237/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/04/what-motivated-the-youtube-shooters-terrorism/557237/ HTTP/1.1" 200 32525
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/04/mantis-shrimp-eye-camera/557195/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/04/mantis-shrimp-eye-camera/557195/ HTTP/1.1" 200 32342
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/04/silicon-valley-under-attack-no-words/557249/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /newsletters/archive/2018/04/the-atlantic-daily-april-3-2018/557168/ HTTP/1.1" 200 36259
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/the-active-shooter-checklist/557198/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/04/the-active-shooter-checklist/557198/ HTTP/1.1" 200 31599
DEBUG:__main__:Processing url: https://www.theatlantic.com/video/index/557162/taher-raad-iraq-rap/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /video/index/557162/taher-raad-iraq-rap/ HTTP/1.1" 200 85082
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/t

DEBUG:__main__:Processing url: https://www.theatlantic.com/newsletters/archive/2018/04/the-atlantic-daily-april-2-2018/557054/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /newsletters/archive/2018/04/the-atlantic-daily-april-2-2018/557054/ HTTP/1.1" 200 34333
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/04/i-married-a-jew-80-years-later/557108/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/04/i-married-a-jew-80-years-later/557108/ HTTP/1.1" 200 33293
DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/04/the-atlantic-politics-policy-daily-eggsecutive-time/557114/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www

DEBUG:__main__:

 PROCESSING PAGE: https://www.theatlantic.com/latest/?page=75



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /latest/?page=75 HTTP/1.1" 200 24632
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/04/revisiting-martin-luther-king-jrs-most-haunting-sermon/556277/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/04/revisiting-martin-luther-king-jrs-most-haunting-sermon/556277/ HTTP/1.1" 200 35254
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/03/isle-of-dogs-and-japan-in-the-western-imagination/556916/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /enter

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/03/gaza-march-israel/556937/ HTTP/1.1" 200 32578
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/03/tiangong-space-station-crash/556925/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/03/tiangong-space-station-crash/556925/ HTTP/1.1" 200 33378
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/03/photos-of-the-week-holy-week-rodeo-volkswagen-graveyard-soccer-on-an-ice-floe/556991/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/03/photos-of-the-week-holy-week-rodeo-volkswagen-graveyard-soccer-on-an-ice-floe/556991/

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/03/are-trumps-attacks-on-amazon-offered-in-bad-faith/556784/ HTTP/1.1" 200 35389
DEBUG:__main__:Processing url: https://www.theatlantic.com/science/archive/2018/03/dna-proves-alien-is-actually-a-girl-so-who-was-she/556625/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /science/archive/2018/03/dna-proves-alien-is-actually-a-girl-so-who-was-she/556625/ HTTP/1.1" 200 35138
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/03/islam-france-macron/556604/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/03/islam-france-macron/556604/ HTTP/1.1" 20

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/03/kim-jong-un-china/556670/ HTTP/1.1" 200 32231
DEBUG:__main__:Processing url: https://www.theatlantic.com/technology/archive/2018/03/what-congress-should-ask-mark-zuckerberg/556655/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /technology/archive/2018/03/what-congress-should-ask-mark-zuckerberg/556655/ HTTP/1.1" 200 37169
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/03/todays-rebels-are-model-children/556682/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/03/todays-rebels-are-model-children/556682/ HTTP/1.1" 200 33804
DEBUG:__main__:Proces

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/03/countries-expel-russian-diplomats/556493/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/03/countries-expel-russian-diplomats/556493/ HTTP/1.1" 200 33474
DEBUG:__main__:Processing url: https://www.theatlantic.com/notes/2018/03/learning-from-history-a-goal-a-delusion-a-trap/556496/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /notes/2018/03/learning-from-history-a-goal-a-delusion-a-trap/556496/ HTTP/1.1" 200 37006
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/03/how-roseanne-handled-the-80s-culture-wars/556367/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:urllib3.connectio

DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/03/silicon-valley-season-5-premiere-review/556511/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com:443
DEBUG:resources.newspaper.newspaper.article:Download failed on URL https://www.theatlantic.com/entertainment/archive/2018/03/silicon-valley-season-5-premiere-review/556511/ because of HTTPSConnectionPool(host='www.theatlantic.com', port=443): Max retries exceeded with url: /entertainment/archive/2018/03/silicon-valley-season-5-premiere-review/556511/ (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x112001630>, 'Connection to www.theatlantic.com timed out. (connect timeout=7)'))


ArticleException: Article `download()` failed with HTTPSConnectionPool(host='www.theatlantic.com', port=443): Max retries exceeded with url: /entertainment/archive/2018/03/silicon-valley-season-5-premiere-review/556511/ (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x112001630>, 'Connection to www.theatlantic.com timed out. (connect timeout=7)')) on URL https://www.theatlantic.com/entertainment/archive/2018/03/silicon-valley-season-5-premiere-review/556511/