# Prep

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file
    FILE  : Filename
    
    Returns:
    --------
    Collection COL_NAME in DB_NAME database
    '''
    !mongoimport --db {DB_NAME} --collection {COL_NAME} --file {PATH+FILE} --batchSize 1
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [68]:
from bson.objectid import ObjectId

def show_doc(db, collection, id):
    '''
    Finds a document by 'id' and prints contents to the console
    
    Parameters:
    --------
    db         : database name
    collection : mongodb collection
    id         : mongodb document id
    
    Returns:
    --------
    Prints first 100 symbols of each document's key to console
    '''
    
    doc = db[collection].find_one({'_id':ObjectId(id)})
    for k in doc:
        print(f"{k} : {str(doc[k])}")

# Some comands to keep dbs clean

In [None]:
# deletes all 'meta' fields from all docs
# htmlCol.update({}, {$unset: {meta:1}}, false, true); # mongo shell comand
htmlCol.update({}, {'$unset': {'meta':1}}, multi=True) # pymongo way

In [None]:
# leaves only unique documents by 'url' field

htmlCol.create_index(
    "url",
    unique=True
)

In [None]:
# pymongo 'find' returns cursor that allows iterating through results
# calling first object [0] allows accessing the dictionary with results
# the ['html'] is the key in the dictionary
html = htmlCol.find({'url':'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},\
            projection={'html':True, '_id':False})[0]['html']

In [None]:
# find documents NOT containing a 'tag': regex expression
import re
tag = re.compile('dek___3AQpw.')
docs = htmlCol.find({"html" : {'$not': tag}})
for d in docs[:20]: print(d['url'])

# Multithreading

In [None]:
from multiprocessing import Process

# use multiprocessing to extract features
def func():
    DB_NAME = 'scrape'
    db = pm.MongoClient(host='localhost', port=27017, maxPoolSize=500)[DB_NAME]

    for collection in ['left','right']: docs_parser(db[collection])

proc = Process(target=func)
proc.start()

# **Production code**

In [None]:
# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# dependencies
import pymongo as pm

from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# define db 
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
db = client[DB_NAME]

In [None]:
SOURCES = {
    'left'  : [
        'https://newrepublic.com',
        'https://www.motherjones.com'
# 3. Slate
# 4. The Intercept
# 5. Daily Beast
# 6. The Atlantic
# 7. Washington Post
# 8. Politico
# 9. The Guardian
# 10. BBC
    ],
    'right' : [
        'https://www.breitbart.com'
# 2. Fox News
# 3. New York Post
# 4. The American Conservative
# 5. Washington Times
# 6. Daily Wire
# 7. The Fiscal Times
# 8. The Hill
# 9. The Daily Caller
# 10. Reason
    ]
}

In [29]:
def saveToDB(db, collection, url, html, meta={}):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value
    
    Parameters:
    --------
    db, collection  : mongo db connection
    url, html, meta : values to store
    
    Returns:
    --------
    Saved document
    """
    collection = db[collection]
    collection.update_one(
        {'url' : url},
        {
            '$set':
                {'url' : url,
                 'html' : html,
                 'meta' : meta
                }
        }
        ,
        upsert=True
    )

In [64]:
def scrape(url, earliest_date, db, collection):
    '''
    Scrapes an article from the 'url' up to the 'latest_date'
    
    Parameters:
    --------
    url         : main news website url
    date        : YYYY-MM-DD
    db          : database name
    collection  : mongodb collection
    
    Returns:
    --------
    Article's html and features stored to db, 
    Article's publish date
    
    '''
    article = Article(url)
    article.download()
    article.parse()
    
    saveToDB(db, collection, article.url, article.html, meta={
        'date'    :article.publish_date,
        'title'   :article.title,
        'text'    :article.text,
        'authors' :article.authors
    })
    
    return article.publish_date

# Newrepublic

In [62]:
import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser

# name of collection for this media
collection = 'newRep'
source = 'https://newrepublic.com/latest'
page   = 1

earliest_date = date_parser('2018-01-01')

while True:
    log.debug(f'PROCESSING PAGE: {page}')
    s = Source(source+'?page='+str(page))
    s.download()

    soup = bs.BeautifulSoup(s.html,'lxml')

    for section in soup.findAll('article'):
        url = urljoin(s.url, section.a['href'])
        log.debug(f'Processing url: {url}')
        
        article_date = scrape(url, earliest_date, db, collection)

    if article_date < earliest_date:
        log.debug(f'Reached earliest date requested: {article_date}')
        break
    page += 1

DEBUG:__main__:PROCESSING PAGE: 1
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=1 HTTP/1.1" 200 78494
DEBUG:__main__:Processing url: https://newrepublic.com/article/150202/trumps-farmer-bailout-americas-broken-food-system
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/150202/trumps-farmer-bailout-americas-broken-food-system HTTP/1.1" 200 20173
DEBUG:__main__:Processing url: https://newrepublic.com/article/150169/zadie-smiths-right-wrong
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/150169/zadie-smiths-right-wrong HTTP/1.1" 200 20603
DEBUG:__main__:Processing url: https://newrepublic.com/article/150159/doxx-racist
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1

DEBUG:__main__:Processing url: https://newrepublic.com/article/149947/white-house-pr-nightmare-never-came
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149947/white-house-pr-nightmare-never-came HTTP/1.1" 200 20967
DEBUG:__main__:Processing url: https://newrepublic.com/article/149955/trump-not-putins-puppet
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149955/trump-not-putins-puppet HTTP/1.1" 200 19697
DEBUG:__main__:Processing url: https://newrepublic.com/article/149950/russians-saw-helsinki-summit
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149950/russians-saw-helsinki-summit HTTP/1.1" 200 19777
DEBUG:__main__:Processing url: https://newrepublic.com/article/149939/american

DEBUG:__main__:PROCESSING PAGE: 6
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=6 HTTP/1.1" 200 60122
DEBUG:__main__:Processing url: https://newrepublic.com/article/149717/watershed-moment-american-history-trump-nominates-brett-kavanaugh
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149717/watershed-moment-american-history-trump-nominates-brett-kavanaugh HTTP/1.1" 200 20686
DEBUG:__main__:Processing url: https://newrepublic.com/article/149683/trump-might-lose-trade-war-china
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149683/trump-might-lose-trade-war-china HTTP/1.1" 200 19429
DEBUG:__main__:Processing url: https://newrepublic.com/article/149628/americas-enduring-failure

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149491/1990s-got-wrong HTTP/1.1" 200 22743
DEBUG:__main__:Processing url: https://newrepublic.com/article/149438/big-pharma-captured-one-percent
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149438/big-pharma-captured-one-percent HTTP/1.1" 200 42370
DEBUG:__main__:Processing url: https://newrepublic.com/article/149433/glow-gets-empowered
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149433/glow-gets-empowered HTTP/1.1" 200 20848
DEBUG:__main__:Processing url: https://newrepublic.com/article/149437/echoes-chinese-exclusion
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149437/echoes-chinese-exclusion HTTP/1.1

DEBUG:__main__:Processing url: https://newrepublic.com/article/149309/wilbur-rosss-financial-shenanigans-massive-scandal
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149309/wilbur-rosss-financial-shenanigans-massive-scandal HTTP/1.1" 200 19593
DEBUG:__main__:Processing url: https://newrepublic.com/article/148864/democrats-mayors-presidential-candidates
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148864/democrats-mayors-presidential-candidates HTTP/1.1" 200 23787
DEBUG:__main__:PROCESSING PAGE: 11
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=11 HTTP/1.1" 200 98360
DEBUG:__main__:Processing url: https://newrepublic.com/article/149305/atttime-warner-merger-already-governm

DEBUG:__main__:Processing url: https://newrepublic.com/article/149145/jesus-lock-up
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149145/jesus-lock-up HTTP/1.1" 200 18958
DEBUG:__main__:Processing url: https://newrepublic.com/article/149138/conservatives-conned-james-comey
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149138/conservatives-conned-james-comey HTTP/1.1" 200 21522
DEBUG:__main__:Processing url: https://newrepublic.com/article/149128/tea-tariffs-rural-yunnan
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/149128/tea-tariffs-rural-yunnan HTTP/1.1" 200 20948
DEBUG:__main__:Processing url: https://newrepublic.com/article/149118/empty-space-rachel-cusk
DEBUG:urllib3.conn

DEBUG:__main__:Processing url: https://newrepublic.com/article/148762/united-states-v-trump-supreme-court-case
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148762/united-states-v-trump-supreme-court-case HTTP/1.1" 200 21477
DEBUG:__main__:Processing url: https://newrepublic.com/article/148293/democracy-isnt-enough-elections-free-markets-latin-america
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148293/democracy-isnt-enough-elections-free-markets-latin-america HTTP/1.1" 200 29971
DEBUG:__main__:Processing url: https://newrepublic.com/article/148744/howard-schultzs-third-way
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148744/howard-schultzs-third-way HTTP/1.1" 200 19027
DEBU

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148606/nuclear-industrys-winners-losers HTTP/1.1" 200 21518
DEBUG:__main__:Processing url: https://newrepublic.com/article/148603/roseanne-barr-science-ambien-tweeting
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148603/roseanne-barr-science-ambien-tweeting HTTP/1.1" 200 18713
DEBUG:__main__:Processing url: https://newrepublic.com/article/148609/religious-rights-metoo-reckoning-coming
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148609/religious-rights-metoo-reckoning-coming HTTP/1.1" 200 30686
DEBUG:__main__:Processing url: https://newrepublic.com/article/148601/family-values
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148515/trumps-war-labor-now-includes-teens HTTP/1.1" 200 19117
DEBUG:__main__:Processing url: https://newrepublic.com/article/148512/party-women-party-white-men
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148512/party-women-party-white-men HTTP/1.1" 200 20030
DEBUG:__main__:Processing url: https://newrepublic.com/article/148505/turns-outsider-president
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148505/turns-outsider-president HTTP/1.1" 200 22203
DEBUG:__main__:Processing url: https://newrepublic.com/article/148507/netflixs-obama-deal-says-future-streaming
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/1

DEBUG:__main__:Processing url: https://newrepublic.com/article/148452/myth-trumps-populist-revolt
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148452/myth-trumps-populist-revolt HTTP/1.1" 200 20733
DEBUG:__main__:Processing url: https://newrepublic.com/article/148451/many-years-will-robert-mueller-need
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148451/many-years-will-robert-mueller-need HTTP/1.1" 200 21470
DEBUG:__main__:Processing url: https://newrepublic.com/article/148444/chesil-beach-longing-unfulfilled
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148444/chesil-beach-longing-unfulfilled HTTP/1.1" 200 18182
DEBUG:__main__:Processing url: https://newrepublic.com/article

DEBUG:__main__:Processing url: https://newrepublic.com/article/148190/know-healing-crystals-come-from
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148190/know-healing-crystals-come-from HTTP/1.1" 200 31557
DEBUG:__main__:Processing url: https://newrepublic.com/article/148388/long-tortured-history-job-guarantee
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148388/long-tortured-history-job-guarantee HTTP/1.1" 200 22600
DEBUG:__main__:Processing url: https://newrepublic.com/article/148380/democrats-can-over-promise-like-trump
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148380/democrats-can-over-promise-like-trump HTTP/1.1" 200 23089
DEBUG:__main__:Processing url: https://newre

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148313/understanding-authoritarianism-soccer HTTP/1.1" 200 22613
DEBUG:__main__:Processing url: https://newrepublic.com/article/148312/weird-fiction-alive
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148312/weird-fiction-alive HTTP/1.1" 200 20465
DEBUG:__main__:Processing url: https://newrepublic.com/article/148308/centrist-grievance-victim-politics
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148308/centrist-grievance-victim-politics HTTP/1.1" 200 21520
DEBUG:__main__:Processing url: https://newrepublic.com/article/148311/dont-blame-phones-narcissism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148201/columbias-war-labor HTTP/1.1" 200 18876
DEBUG:__main__:Processing url: https://newrepublic.com/article/148199/five-questions-trump-missing-robert-muellers-list
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148199/five-questions-trump-missing-robert-muellers-list HTTP/1.1" 200 20294
DEBUG:__main__:Processing url: https://newrepublic.com/article/148198/marvel-killing-movies
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148198/marvel-killing-movies HTTP/1.1" 200 22390
DEBUG:__main__:Processing url: https://newrepublic.com/article/148194/migrant-caravan-enters-trumps-hostile-immigration-maze
DEBUG:urllib3.connectionpool:Starting

DEBUG:__main__:PROCESSING PAGE: 33
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=33 HTTP/1.1" 200 66181
DEBUG:__main__:Processing url: https://newrepublic.com/article/148135/usda-goes-hog-wild
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148135/usda-goes-hog-wild HTTP/1.1" 200 19646
DEBUG:__main__:Processing url: https://newrepublic.com/article/148137/wendell-berry-wants
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148137/wendell-berry-wants HTTP/1.1" 200 19512
DEBUG:__main__:Processing url: https://newrepublic.com/article/148126/epa-acting-like-big-tobacco
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https:

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148077/seems-good-true HTTP/1.1" 200 19196
DEBUG:__main__:Processing url: https://newrepublic.com/article/148076/slippery-james-comey
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148076/slippery-james-comey HTTP/1.1" 200 23858
DEBUG:__main__:Processing url: https://newrepublic.com/article/148063/amy-schumers-brain-damage
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148063/amy-schumers-brain-damage HTTP/1.1" 200 19073
DEBUG:__main__:Processing url: https://newrepublic.com/article/148062/important-number-business
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148062/important-number-business HTTP/1.1" 200 20

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148013/will-supreme-court-rein-civil-forfeiture HTTP/1.1" 200 19127
DEBUG:__main__:PROCESSING PAGE: 38
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=38 HTTP/1.1" 200 124145
DEBUG:__main__:Processing url: https://newrepublic.com/article/148011/problem-cold-war-comparisons
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/148011/problem-cold-war-comparisons HTTP/1.1" 200 22014
DEBUG:__main__:Processing url: https://newrepublic.com/article/148005/probably-not-end-stages-trumps-presidency
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /articl

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147894/bhagwans-bottom-line-i HTTP/1.1" 200 18719
DEBUG:__main__:Processing url: https://newrepublic.com/article/147863/bhagwans-death-wish
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147863/bhagwans-death-wish HTTP/1.1" 200 19002
DEBUG:__main__:Processing url: https://newrepublic.com/article/147902/bhagwans-mind-control
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147902/bhagwans-mind-control HTTP/1.1" 200 19268
DEBUG:__main__:Processing url: https://newrepublic.com/article/147871/bhagwans-sexism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:

DEBUG:__main__:PROCESSING PAGE: 43
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=43 HTTP/1.1" 200 70118
DEBUG:__main__:Processing url: https://newrepublic.com/article/147882/us-can-learn-britain-equal-pay
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147882/us-can-learn-britain-equal-pay HTTP/1.1" 200 19267
DEBUG:__main__:Processing url: https://newrepublic.com/article/147923/facebooks-innocence-project
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147923/facebooks-innocence-project HTTP/1.1" 200 19065
DEBUG:__main__:Processing url: https://newrepublic.com/article/147359/nuisance-laws-making-poverty-crime
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newr

DEBUG:__main__:Processing url: https://newrepublic.com/article/147753/strange-online-aesthetic-youtube-shooting-suspect
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147753/strange-online-aesthetic-youtube-shooting-suspect HTTP/1.1" 200 19756
DEBUG:__main__:Processing url: https://newrepublic.com/article/147751/humble-proposal-sanction-hungarian-kleptocrats
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147751/humble-proposal-sanction-hungarian-kleptocrats HTTP/1.1" 200 23846
DEBUG:__main__:Processing url: https://newrepublic.com/article/147377/model-businessman-dave-eggers-misses-story
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147377/model-businessman-dave-eggers-misses-st

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147374/company-men-legal-struggle-citizens-united-corporations-rights-people HTTP/1.1" 200 36229
DEBUG:__main__:Processing url: https://newrepublic.com/article/147683/culture-violent-white-guys
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147683/culture-violent-white-guys HTTP/1.1" 200 20702
DEBUG:__main__:PROCESSING PAGE: 48
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=48 HTTP/1.1" 200 73931
DEBUG:__main__:Processing url: https://newrepublic.com/article/147691/provocative-brilliance-death-stalin
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.co

DEBUG:__main__:Processing url: https://newrepublic.com/article/147623/saffron-curtain-buddhism-weaponized-cold-war
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147623/saffron-curtain-buddhism-weaponized-cold-war HTTP/1.1" 200 21854
DEBUG:__main__:Processing url: https://newrepublic.com/article/147619/pinkertons-still-never-sleep
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147619/pinkertons-still-never-sleep HTTP/1.1" 200 20830
DEBUG:__main__:Processing url: https://newrepublic.com/article/147621/worst-job-washington-trump-lawyer-john-dowd-quits
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147621/worst-job-washington-trump-lawyer-john-dowd-quits HTTP/1.1" 200 19586
DEBUG:__

DEBUG:__main__:Processing url: https://newrepublic.com/article/147531/banality-disappointment
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147531/banality-disappointment HTTP/1.1" 200 19073
DEBUG:__main__:Processing url: https://newrepublic.com/article/147391/hype-best
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147391/hype-best HTTP/1.1" 200 35180
DEBUG:__main__:Processing url: https://newrepublic.com/article/147366/anti-intelligence
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147366/anti-intelligence HTTP/1.1" 200 43710
DEBUG:__main__:PROCESSING PAGE: 53
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147432/dont-look-democrats-regulate-big-tech HTTP/1.1" 200 19144
DEBUG:__main__:Processing url: https://newrepublic.com/article/147423/will-another-white-big-banker-oversee-wall-street
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147423/will-another-white-big-banker-oversee-wall-street HTTP/1.1" 200 20096
DEBUG:__main__:Processing url: https://newrepublic.com/article/147426/forget-tax-breaks-education-key-attracting-businesses
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147426/forget-tax-breaks-education-key-attracting-businesses HTTP/1.1" 200 20215
DEBUG:__main__:Processing url: https://newrepublic.com/article/147416/moviepass-

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147303/reality-womens-pain HTTP/1.1" 200 22335
DEBUG:__main__:Processing url: https://newrepublic.com/article/147293/rise-corporate-social-responsibility
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147293/rise-corporate-social-responsibility HTTP/1.1" 200 19949
DEBUG:__main__:Processing url: https://newrepublic.com/article/147294/congress-can-wage-effective-trade-war
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147294/congress-can-wage-effective-trade-war HTTP/1.1" 200 21074
DEBUG:__main__:Processing url: https://newrepublic.com/article/147290/trumps-disdain-democracy-promotion
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.co

DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=60 HTTP/1.1" 200 65436
DEBUG:__main__:Processing url: https://newrepublic.com/article/147223/trumps-fantasies-meet-harsh-reality-presidency
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147223/trumps-fantasies-meet-harsh-reality-presidency HTTP/1.1" 200 20813
DEBUG:__main__:Processing url: https://newrepublic.com/article/147226/good-girls-ordinary-women-turn-crime
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147226/good-girls-ordinary-women-turn-crime HTTP/1.1" 200 19960
DEBUG:__main__:Processing url: https://newrepublic.com/article/147227/court-cloud
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/14722

DEBUG:__main__:Processing url: https://newrepublic.com/article/147167/cpac-kids-alright-on-guns
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147167/cpac-kids-alright-on-guns HTTP/1.1" 200 19461
DEBUG:__main__:Processing url: https://newrepublic.com/article/147160/third-way-think-russiagate
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147160/third-way-think-russiagate HTTP/1.1" 200 20007
DEBUG:__main__:Processing url: https://newrepublic.com/article/147142/photos-parkland-survivors-take-stand
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147142/photos-parkland-survivors-take-stand HTTP/1.1" 200 23664
DEBUG:__main__:Processing url: https://newrepublic.com/article/147164/billy-

DEBUG:__main__:PROCESSING PAGE: 65
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=65 HTTP/1.1" 200 115568
DEBUG:__main__:Processing url: https://newrepublic.com/article/147098/halt-labors-slow-death
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147098/halt-labors-slow-death HTTP/1.1" 200 18261
DEBUG:__main__:Processing url: https://newrepublic.com/article/147102/opportunistic-rise-europes-far-right
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147102/opportunistic-rise-europes-far-right HTTP/1.1" 200 20757
DEBUG:__main__:Processing url: https://newrepublic.com/article/147087/mirai-nagasu-chloe-kim-not-immigrant-fantasy
DEBUG:urllib3.connectionpool:Starting new HTTPS connect

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146908/information-wants-chinese HTTP/1.1" 200 20774
DEBUG:__main__:Processing url: https://newrepublic.com/article/147031/facebooks-purpose-digital-addiction
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147031/facebooks-purpose-digital-addiction HTTP/1.1" 200 19826
DEBUG:__main__:Processing url: https://newrepublic.com/article/147011/rural-americas-drinking-water-crisis
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/147011/rural-americas-drinking-water-crisis HTTP/1.1" 200 35494
DEBUG:__main__:Processing url: https://newrepublic.com/article/147018/memo-debate-republican-sham
DEBUG:urllib3.connectionpool:Starting new HTTPS connecti

DEBUG:__main__:Processing url: https://newrepublic.com/article/146959/truth-appalachia
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146959/truth-appalachia HTTP/1.1" 200 19875
DEBUG:__main__:PROCESSING PAGE: 70
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=70 HTTP/1.1" 200 75269
DEBUG:__main__:Processing url: https://newrepublic.com/article/146952/elizabeth-warren-model-political-leadership
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146952/elizabeth-warren-model-political-leadership HTTP/1.1" 200 20527
DEBUG:__main__:Processing url: https://newrepublic.com/article/146945/waco-perfect-show-trump-erain-bad-way
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146821/lively-irrelevance-conservative-magazines HTTP/1.1" 200 21585
DEBUG:__main__:Processing url: https://newrepublic.com/article/146796/scientists-saved-bear-fish-skinand-stumbled-game-changing-idea
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146796/scientists-saved-bear-fish-skinand-stumbled-game-changing-idea HTTP/1.1" 200 22658
DEBUG:__main__:Processing url: https://newrepublic.com/article/146820/man-us-steel
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146820/man-us-steel HTTP/1.1" 200 20046
DEBUG:__main__:Processing url: https://newrepublic.com/article/146806/democrats-can-learn-cecile-richards
DEBUG:urllib3.connectionpo

DEBUG:__main__:Processing url: https://newrepublic.com/article/146691/trump-likely-finish-first-term
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146691/trump-likely-finish-first-term HTTP/1.1" 200 22523
DEBUG:__main__:Processing url: https://newrepublic.com/article/146683/trump-fox-news-mainstreaming-white-nationalism
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146683/trump-fox-news-mainstreaming-white-nationalism HTTP/1.1" 200 19335
DEBUG:__main__:PROCESSING PAGE: 75
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /latest?page=75 HTTP/1.1" 200 67098
DEBUG:__main__:Processing url: https://newrepublic.com/article/146677/trumps-middle-east-policy-goodish-bad-ugly
DEBUG:urllib3.connecti

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146589/mormonisms-crisis-faith HTTP/1.1" 200 22513
DEBUG:__main__:Processing url: https://newrepublic.com/article/146600/trumps-alarming-definition-treason
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146600/trumps-alarming-definition-treason HTTP/1.1" 200 19042
DEBUG:__main__:Processing url: https://newrepublic.com/article/146596/trumps-global-bigotry
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146596/trumps-global-bigotry HTTP/1.1" 200 20186
DEBUG:__main__:Processing url: https://newrepublic.com/article/146598/conservatives-trustbusters-now
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:u

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146501/afterlife-steve-bannon HTTP/1.1" 200 22387
DEBUG:__main__:Processing url: https://newrepublic.com/article/146507/trumps-voter-fraud-crusade-just-beginning
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146507/trumps-voter-fraud-crusade-just-beginning HTTP/1.1" 200 18994
DEBUG:__main__:Processing url: https://newrepublic.com/article/146505/michael-wolffs-revelations
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.com:443 "GET /article/146505/michael-wolffs-revelations HTTP/1.1" 200 19628
DEBUG:__main__:PROCESSING PAGE: 80
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): newrepublic.com
DEBUG:urllib3.connectionpool:https://newrepublic.c

In [96]:
# update text field to include more data
for doc in db[collection].find():
    soup = bs.BeautifulSoup(doc['html'],'lxml')
    text = ''
    for div in soup.findAll('div',{"class": "content-body"}):
        text += div.text
    if len(doc['meta']['text']) < len(text):
        db[collection].update_one(
            {'url' : doc['url']},
            {
                '$set':
                    {
                     'meta.text' : text
                    }
            }
            ,
            upsert=True
        )

In [97]:
url = 'https://newrepublic.com/article/146927/bad-dreams'
doc = db[collection].find_one({'url':url})

# The Atlantic

In [None]:
import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random

# name of collection for this media
collection = 'theAtlantic'
source = 'https://www.theatlantic.com/latest/'
page   = 1

earliest_date = date_parser('2018-07-01')

while True:
    log.debug(f'PROCESSING PAGE: {page}')
    s = Source(source+'?page='+str(page))
    s.download()

    soup = bs.BeautifulSoup(s.html,'lxml')

    for section in soup.findAll('li', {"class":"article"}):
        url = urljoin(s.url, section.a['href'])
        log.debug(f'Processing url: {url}')
        
        article_date = scrape(url, earliest_date, db, collection)
        
        sleep(random.uniform(1,5))

    if article_date < earliest_date:
        log.debug(f'Reached earliest date requested: {article_date}')
        break
    page += 1

DEBUG:__main__:PROCESSING PAGE: 1
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /latest/?page=1 HTTP/1.1" 200 24879
DEBUG:__main__:Processing url: https://www.theatlantic.com/photo/2018/07/photos-made-in-china/566150/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /photo/2018/07/photos-made-in-china/566150/ HTTP/1.1" 200 32347
DEBUG:__main__:Processing url: https://www.theatlantic.com/education/archive/2018/07/conservative-high-schoolers-are-ready-to-own-the-libs/566177/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /education/archive/2018/07/conservative-high-schoolers-are-ready-to-own-the-libs/566177/ HTTP/1.1" 200 33645
DEBUG:__main__:Processing url: https://www.theatlantic.com

DEBUG:__main__:Processing url: https://www.theatlantic.com/politics/archive/2018/07/the-24-candidates-for-2018-sunk-by-metoo-allegations/565457/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /politics/archive/2018/07/the-24-candidates-for-2018-sunk-by-metoo-allegations/565457/ HTTP/1.1" 200 60672
DEBUG:__main__:Processing url: https://www.theatlantic.com/entertainment/archive/2018/07/batman-catwoman-and-the-marriage-plot-in-comics/565295/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /entertainment/archive/2018/07/batman-catwoman-and-the-marriage-plot-in-comics/565295/ HTTP/1.1" 200 35860
DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/07/afghanistan-us-taliban/565799/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlan

DEBUG:__main__:Processing url: https://www.theatlantic.com/international/archive/2018/07/trump-pressure-iran-rouhani/566000/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /international/archive/2018/07/trump-pressure-iran-rouhani/566000/ HTTP/1.1" 200 33028
DEBUG:__main__:Processing url: https://www.theatlantic.com/family/archive/2018/07/why-dont-more-men-take-their-wives-last-names/565898/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GET /family/archive/2018/07/why-dont-more-men-take-their-wives-last-names/565898/ HTTP/1.1" 200 32914
DEBUG:__main__:Processing url: https://www.theatlantic.com/business/archive/2018/07/women-umpires/564641/
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.theatlantic.com
DEBUG:urllib3.connectionpool:https://www.theatlantic.com:443 "GE