In [1]:
%load_ext autoreload
%autoreload 2

In [126]:
# Send log records of every level (DEBUG and above) to the console. This
# configures the root logger, so imported libraries become verbose as well.
import logging
logging.basicConfig(level=logging.DEBUG)
# Logger used by the cells of this notebook.
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser

import requests
from pprint import pprint

import requests
from lxml import html
import json
from resources.config import *
from datetime import timedelta

# Credentials used by the later cells: USERNAME / PASSWORD go into the login
# form payload, apiKey into the news API query parameters.
# NOTE(review): `username`, `password` and `apiKey` are not defined in this
# notebook; presumably they are provided by `from resources.config import *`
# — confirm.
USERNAME = username
PASSWORD = password
# Self-assignment: rebinds the name to the value it already has.
apiKey = apiKey

In [127]:
# Create the PyMongo client for the MongoDB server on localhost, port 27017.
# maxPoolSize=200 is passed to the client as its connection pool limit.
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# Select the database the scraped documents are written to.
# An alternative database name is left commented out:
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
db = client[DB_NAME]

def saveToDB(db, collection, url, html, meta=None):
    """
    Upserts a document into mongoDB, keyed by its 'url' value: saving the
    same url again overwrites the stored fields instead of adding a duplicate.
    
    Parameters:
    --------
    db         : object that yields a collection when indexed by name
                 (in this notebook: a pymongo database)
    collection : name of the collection to write to
    url        : url of the page; used as the filter of the upsert
    html       : value stored under 'html'
    meta       : optional dict stored under 'meta'; an empty dict is stored
                 when omitted
    
    Returns:
    --------
    None
    """
    # Build a fresh dict per call; a `meta={}` default would be one object
    # shared by every call that omits the argument.
    if meta is None:
        meta = {}

    document = {'url': url, 'html': html, 'meta': meta}
    db[collection].update_one({'url': url}, {'$set': document}, upsert=True)
    log.debug('Saved to DB')

In [128]:
# Open an HTTP session and submit the nytimes.com login form through it.
# The same `session_requests` object is used later to download the articles.

LOGIN_URL = "https://myaccount.nytimes.com/auth/login"
session_requests = requests.session()

# Fetch the login page and read the CSRF token embedded in it
login_page = session_requests.get(LOGIN_URL)
tree = html.fromstring(login_page.text)
auth_options = tree.xpath("//div[@id='myAccountAuth']/@data-auth-options")
if not auth_options:
    # Fail with a readable message instead of a bare IndexError on `[0]`
    raise RuntimeError(
        f'No auth options found on {LOGIN_URL} '
        f'(HTTP {login_page.status_code}); cannot read the CSRF token'
    )
# The attribute value is swapped from single to double quotes so that
# json.loads accepts it.
authenticity_token = json.loads(auth_options[0].replace("'", "\""))['authToken']

# Create payload
payload = {
    "username": USERNAME, 
    "password": PASSWORD, 
    "csrfmiddlewaretoken": authenticity_token
}

# Perform login
result = session_requests.post(LOGIN_URL, data = payload, headers = dict(referer = LOGIN_URL))

# Surface a rejected login instead of silently scraping without it
if not result.ok:
    log.warning(
        f'Login request returned HTTP {result.status_code}; '
        'articles may be fetched without being logged in'
    )

In [129]:
# Configuration of the newsapi.org queries made by the scrape cell.

# Name of the mongoDB collection the scraped articles are saved to.
collection = 'nytimes'

# Logging is configured at DEBUG level; raise urllib3 to WARNING so its
# per-request messages do not flood the output.
logging.getLogger('urllib3').setLevel(logging.WARNING)

page     = 1
source   = 'the-new-york-times'
pageSize = 100

# Overall period to scrape, as datetime objects returned by date_parser.
earliest_date = date_parser('2017-01-01')
latest_date = date_parser('2018-07-31')

# Query parameters sent with every API request. The scrape cell overwrites
# 'page', 'from' and 'to' as it walks through pages and date windows.
params = {
        'apiKey'   : apiKey,
        'pageSize' : pageSize,
        'page'     : page,
        'from'     : earliest_date,
        'to'       : latest_date,
        'sources'  : source
    }

# base url of the API endpoint that is queried
api_url = 'https://newsapi.org/v2/everything?'

In [None]:
# scrape news
#
# Walks backwards in time from `latest_date` to `earliest_date` in windows of
# 30 days. For every window each result page of the news API is requested,
# and every listed article is downloaded through the logged-in session and
# saved to mongoDB together with the metadata the API returned for it.
window_length = timedelta(30)

# Use a local cursor so `latest_date` keeps its configured value and this
# cell can be re-run without re-running the configuration cell first.
window_end = latest_date

while window_end > earliest_date:
    # Clamp the last window so nothing older than `earliest_date` is requested
    window_start = max(earliest_date, window_end - window_length)
    log.debug(f'Requesting period: {window_start}-{window_end}')

    params['from'] = window_start
    params['to']   = window_end

    page = 1
    totalPages = None  # unknown until the first page of this window is read

    while totalPages is None or page <= totalPages:
        params['page'] = page
        r = requests.get(api_url, params=params)
        listing = r.json()

        # Stop with a readable message when the API did not return a result
        # listing (the keys below are the ones this cell relies on).
        if 'totalResults' not in listing or 'articles' not in listing:
            raise RuntimeError(
                f'Unexpected news API response (HTTP {r.status_code}): {listing}'
            )

        if totalPages is None:
            # Ceiling division: a partially filled last page is still a page,
            # and the size must match the page size that was requested.
            totalPages = -(-listing['totalResults'] // params['pageSize'])
            log.debug(f'TOTAL PAGES FOR {source}: {totalPages}')

        log.debug(f'\n\n PROCESSING PAGE: {page}\n')

        for a in listing['articles']:
            url = None
            try:
                url = a['url']
                log.debug(f"Processing url: {url}")
                # NOTE(review): `URL` is not defined in the visible cells of
                # this notebook; presumably it is provided by
                # `from resources.config import *` — confirm.
                article_response = session_requests.get(url, headers = dict(referer = URL))
                soup = bs.BeautifulSoup(article_response.text, 'lxml')
                # Concatenate the text of all story body columns of the page
                text = ''.join(
                    d.text
                    for d in soup.find_all('div', {'class':'StoryBodyCompanionColumn'})
                )

                saveToDB(db, collection, url, article_response.text, meta={
                    'date'    : date_parser(a['publishedAt']),
                    'title'   : a['title'],
                    'text'    : text,
                    'authors' : a['author']
                })

            except Exception:
                # Best effort: one broken article must not stop the scrape,
                # but the failure is logged with its url and traceback.
                log.exception(f'Failed to process url: {url}')

        page += 1

    window_end = window_start

log.debug('Ended scrape')

DEBUG:__main__:Requesting period: 2018-07-01 00:00:00-2018-07-31 00:00:00
DEBUG:__main__:TOTAL PAGES FOR the-new-york-times: 41
DEBUG:__main__:

 PROCESSING PAGE: 1

DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/world/asia/imran-khan-pakistan-us-afghanistan.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/sports/mets-trade-deadline.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/nyregion/queens-killings-custody-netherlands.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/world/africa/russian-journalists-killed-central-african-republic.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/business/dealbook/fintech-bank-charter.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/opinion/20180801/how-censorship-breeds-instability/?umt_source=RSS
D

DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/movies/first-aids-movie-gay-drama-buddies.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/us/politics/paul-manafort-trial.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/dining/coco-pazzo-trattoria-yves-review.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/opinion/letters/trump-pence.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/science/king-penguin-decline-antarctica.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/slideshow/2018/07/31/dining/yves-restaurant-nyc.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/slideshow/2018/07/31/dining/coco-pazzo-trattoria-nyc.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.co

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/opinion/transaction-costs-and-tethers-why-im-a-crypto-skeptic.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/sports/lebron-james-instagram-nba.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/science/lassie-help-dog.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/us/migrant-children-separation-anxiety.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/health/opioids-spinal-injections.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/31/briefing/brexit-russia-mars.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/world/asia/philippines-bombing-abu-sayyaf.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/us/minneapolis-police-thurman-blevins.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/us/politics/trump-government-shutdown.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/opinion/trump-immigrants-iowa-farmers-workers.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/arts/television/review-making-it-amy-poehler-nick-offerman.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://cn.nytimes.com/culture/20180731/chen-shi-zheng-beijing-music-festival/?umt_source=RSS
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/opinion/letters/trump-sulzberger-press.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/opinion/inside-the-world-of-racist-science-fi

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/opinion/how-to-talk-to-a-racist.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/climate/record-heat-waves.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/upshot/shopping-for-health-care-simply-doesnt-work-so-what-might.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/arts/feminized-technology-robots.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/well/adhd-anxiety-depression-medication-children-parents-doctors.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/arts/hands-videos-facebook-youtube.html
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://www.nytimes.com/2018/07/30/arts/dogs-cats-internet.html
