In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser

import requests
from pprint import pprint

import requests
from lxml import html
import json
from resources.config import *
from datetime import timedelta

# `username`, `password` and `apiKey` are not defined in this notebook; they
# are presumably supplied by the `from resources.config import *` star import
# above -- TODO confirm against resources/config.py.
USERNAME, PASSWORD = username, password
# Self-assignment: rebinding the name changes nothing, but it raises NameError
# right here (rather than deep inside the scrape) if the config lacks `apiKey`.
apiKey = apiKey

In [2]:
# MongoDB connection used by every cell below.
# Database that receives the scraped documents; the original cell also kept
# 'scrape' around as a commented-out alternative name.
DB_NAME = 'FINALP'

# Local MongoDB instance, with a connection pool capped at 200 sockets
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(host=conn, maxPoolSize=200)

# Handle to the target database
db = client[DB_NAME]

def saveToDB(db, collection, url, html, meta=None):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value

    The document is upserted with 'url' as the match key: an existing
    document with the same url gets its 'html' and 'meta' fields
    overwritten, otherwise a new document is inserted.
    
    Parameters:
    --------
    db         : database handle, indexable by collection name
    collection : name of the collection to write to
    url        : url of the page; used as the unique key
    html       : raw page content to store
    meta       : optional dict of extra fields; stored as {} when omitted
    
    Returns:
    --------
    None
    """
    # Build a fresh dict per call: a `{}` default would be a single object
    # shared by every call that omits `meta`.
    if meta is None:
        meta = {}

    db[collection].update_one(
        {'url': url},
        {'$set': {'url': url, 'html': html, 'meta': meta}},
        upsert=True,
    )
    log.debug('Saved to DB')

In [8]:
# open browser session and login
# The session object keeps the cookies returned by the login request, so later
# article downloads made through `session_requests` reuse them.

LOGIN_URL = "https://myaccount.nytimes.com/auth/login"
session_requests = requests.session()

# Get login csrf token
result = session_requests.get(LOGIN_URL, timeout=30)
result.raise_for_status()
tree = html.fromstring(result.text)

# The token sits in a JSON-like attribute of the auth <div>. Fail with a clear
# message (instead of a bare IndexError) when the page no longer has it.
auth_options = tree.xpath("//div[@id='myAccountAuth']/@data-auth-options")
if not auth_options:
    raise RuntimeError(
        f"No 'data-auth-options' attribute found on {LOGIN_URL}; "
        "the login page layout may have changed"
    )
# The attribute uses single quotes; swap them so json.loads accepts it
authenticity_token = json.loads(auth_options[0].replace("'", "\""))['authToken']

# Create payload
payload = {
    "username": USERNAME, 
    "password": PASSWORD, 
    "csrfmiddlewaretoken": authenticity_token
}

# Perform login
result = session_requests.post(
    LOGIN_URL,
    data=payload,
    headers=dict(referer=LOGIN_URL),
    timeout=30,
)
# Do not abort the notebook, but make a rejected login visible: without it the
# scrape would go on unauthenticated and nothing would say so.
if not result.ok:
    log.warning(f'Login request returned HTTP {result.status_code}')

In [15]:
# newsapi.org query settings and the date range to scrape

# Only let warnings and above through from urllib3, which would otherwise
# log at DEBUG like everything else
logging.getLogger('urllib3').setLevel(logging.WARNING)

# MongoDB collection the scraped articles are saved to
collection = 'nytimes'

# newsapi.org source id, results per page, and the starting page number
source, pageSize, page = 'the-new-york-times', 100, 1

# Bounds of the period to scrape
earliest_date, latest_date = date_parser('2017-01-01'), date_parser('2018-02-04')

# base url
api_url = 'https://newsapi.org/v2/everything?'

# Query-string parameters sent with every API request; 'from', 'to' and
# 'page' are overwritten by the scrape loop as it moves through the range.
params = {
    'apiKey': apiKey,
    'pageSize': pageSize,
    'page': page,
    'from': earliest_date,
    'to': latest_date,
    'sources': source,
}

In [14]:
# scrape news
# Walks backwards from `latest_date` to `earliest_date` in windows of at most
# 30 days. For each window every result page is requested from the API, each
# listed article is downloaded through the logged-in session, and the page is
# stored with saveToDB. `latest_date` is moved back after each finished window,
# so re-running the cell after an interrupt resumes at the unfinished window.
window_size = timedelta(30)
request_timeout = 30  # seconds; a stalled connection must not hang the scrape

while latest_date > earliest_date:
    # Clamp so the final window never reaches before earliest_date
    window_start = max(earliest_date, latest_date - window_size)
    log.debug(f'Requesting period: {window_start}-{latest_date}')

    # ISO 8601 is the date format documented by newsapi.org
    params['from'] = window_start.isoformat()
    params['to']   = latest_date.isoformat()

    # Page 1 is fetched once and serves both as the source of `totalResults`
    # and as the first page of articles.
    page = 1
    totalPages = 1
    while page <= totalPages:
        params['page'] = page
        r = requests.get(api_url, params=params, timeout=request_timeout)
        # Surface API errors as an HTTPError instead of a KeyError further down
        r.raise_for_status()
        api_data = r.json()

        if page == 1:
            # Ceiling division by the page size actually requested, so a
            # trailing partial page is included and exact multiples do not
            # produce an extra empty page.
            page_size = params['pageSize']
            totalPages = (api_data['totalResults'] + page_size - 1) // page_size
            log.debug(f'TOTAL PAGES FOR {source}: {totalPages}')

        log.debug(f'\n\n PROCESSING PAGE: {page}\n')

        for a in api_data['articles']:
            url = a.get('url')
            try:
                log.debug(f"Processing url: {url}")
                result = session_requests.get(
                    url,
                    headers=dict(referer=url),
                    timeout=request_timeout,
                )
                soup = bs.BeautifulSoup(result.text, 'lxml')
                # Concatenate the text of every story-body column
                text = ''.join(
                    d.text
                    for d in soup.find_all('div', {'class': 'StoryBodyCompanionColumn'})
                )

                saveToDB(db, collection, url, result.text, meta={
                    'date'    : date_parser(a['publishedAt']),
                    'title'   : a['title'],
                    'text'    : text,
                    'authors' : a['author']
                })

            except Exception:
                # Best effort: one bad article must not stop the scrape, but
                # record which url failed and why.
                log.exception(f'Failed to process url: {url}')

        page += 1

    latest_date = window_start

log.debug('Ended scrape')

DEBUG:__main__:Requesting period: 2018-01-09 00:00:00-2018-02-08 00:00:00
DEBUG:__main__:TOTAL PAGES FOR the-new-york-times: 41
DEBUG:__main__:

 PROCESSING PAGE: 1

DEBUG:__main__:Processing url: https://www.nytimes.com/2015/05/31/opinion/the-voices-of-students-and-the-winners-are.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/books/michel-foucault-new-book.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/republicans-gerrymandering-power.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/sunday-review/sexual-harassment-masculine-jobs.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/changemaker-social-entrepreneur.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/obituaries/john-perry-barlow-internet-champion-dies.html
DEBUG:__mai

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/world/europe/louvre-nazi-looted-art.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/world/middleeast/syria-war-idlib.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/china/20180209/south-china-seas-photos/?umt_source=RSS
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/style/how-to-be-sorry.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/world/asia/south-china-seas-photos.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/movies/fifty-shades-freed-review-jamie-dornan-dakota-johnson.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/world/middleeast/israel-netanyahu-police.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Proces

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/well/live/talk-to-your-doctor-about-your-bucket-list.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/nyregion/new-york-today-fashion-rejects.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/trump-lie-mueller-investigation.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/trump-democracy-midterm-elections.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/kyle-duncan-john-thompson.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/trump-republican-scam.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/08/opinion/environment-oil-scare.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Pro

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/science/starfish-eyes.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/business/tesla-musk.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/movies/oscar-nominated-documentary-shorts-review.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/obituaries/dr-victor-sidel-public-health-champion-is-dead-at-86.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/sports/olympics/olympics-russia-doping.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/technology/personaltech/tech-gear-warm-winter-olympics.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/books/plagiarism-software-unveils-a-new-source-for-11-of-shakespeares-

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/arts/1517-to-paris-clint-eastwood-heroes-train-attack.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/sports/russian-athletes-appeals.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/world/asia/north-korea-olympics-sanctions.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/us/california-today-stock-market-tumble.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/business/media/los-angeles-times-sale.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/business/dealbook/los-angeles-times-sale.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/07/us/nazi-congress-illinois.html
DEBUG:__main__:Saved to DB
DEBUG:__main__

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/middleeast/syria-bombing-damascus-united-nations.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/technology/snap-revenue-earnings.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/opinion/trump-financial-crisis.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/africa/south-sudan-protests-us-arms-embargo.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/climate/idaho-schools-climate-change.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/opinion/corruption-bribery-food-words.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/asia-pacific/20180207/korea-history/?umt_source=RSS
DEBUG:__main__:Saved to DB
DEBUG:__

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/maldives-president-political-crisis.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/lens/seeking-humanity-in-a-prison-passion-play.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/movies/tarantino-thurman-interview.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/us/baltimore-police-corruption.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/realestate/commercial/twa-terminal-jfk-airport.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/australia/we-have-to-get-naked-sydney.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/us/politics/government-shutdown-trump-immigration.html
DEBUG:__main__:Sav

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/books/review/fire-sermon-jamie-quatro.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/books/review/new-noteworthy-matthew-schneier.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/books/review/buzz-lieberman-vibrator-nation-comella-sex-toys.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/magazine/the-towers-came-down-and-with-them-the-promise-of-public-housing.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/books/review/match-book-poems-young-readers.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/asia/mike-pence-north-korea.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/06/world/asia/hong-kong-jo

DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/world/africa/kenya-american-wildlife-expert-killed.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/style/liz-smith-memorial.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/china/20180206/hong-kong-joshua-wong-appeal/?umt_source=RSS
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/opinion/rebekah-mercer-museum-credibility-.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/world/europe/uk-nhs-trump.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/theater/womens-voices-festival-a-potent-reminder-of-who-goes-unheard-onstage.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/sports/michigan-state-larry-nassar.html
DEBUG:__main__:Saved to DB
DEBUG:__main__

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/technology/renee-james-intel-ampere-startup.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/business/dealbook/broadcom-qualcomm-bid.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/business/dealbook/broadcom-qualcomm-deal.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/technology/silicon-valley-brotopia-emily-chang.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/business/media/mlk-commercial-ram-dodge.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/style/olympics-artists-in-residence-program.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/05/nyregion/new-york-today-marijuana-history.html
DEBUG:__main__:S

DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/business/economy/powell-steps-becomes-fed-chief-as-economy-starts-to-show-strain.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/asia-pacific/20180205/pyeongchang-winter-olympics-south-korea/?umt_source=RSS
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/technology/amazon-asked-for-patience-remarkably-wall-street-complied.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/sports/super-bowl-patriots-eagles.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/obituaries/lin-bolen-dead.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/world/asia/korea-olympics.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/02/04/business/media/super-bowl-commercials.html
DEBUG:

KeyboardInterrupt: 