In [1]:
# Auto-reload imported project modules so edits to them are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Show the path of the Python interpreter this kernel is running on
import sys
sys.executable

'/anaconda3/bin/python'

In [1]:
# For debugging, turn on logging to the console.
# basicConfig sets up the root logger at DEBUG level, so debug records from
# third-party libraries (e.g. urllib3) are printed alongside this notebook's own.
import logging
logging.basicConfig(level=logging.DEBUG)
# Logger used by the cells below
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser

import requests
from pprint import pprint

import requests
from lxml import html
import json
from resources.config import *
from datetime import timedelta

# NOTE(review): `username`, `password` and `apiKey` are not defined in this
# notebook; presumably they come from the wildcard `resources.config` import
# above — confirm, and keep real credentials out of version control.
USERNAME = username
PASSWORD = password
# Self-assignment: leaves the value unchanged (raises NameError if `apiKey`
# was not brought into scope)
apiKey = apiKey

In [2]:
# MongoDB connection settings
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
conn = 'mongodb://localhost:27017'

# Open the PyMongo client (pool capped at 200 connections) and pick the
# working database
client = pm.MongoClient(host=conn, maxPoolSize=200)
db = client[DB_NAME]

def saveToDB(db, collection, url, html, meta=None):
    """
    Upserts a scraped page into mongoDB keyed by its 'url' value, so an
    existing document with the same url is updated instead of duplicated
    
    Parameters:
    --------
    db         : database handle; indexed with `collection` to get the target
    collection : name of the collection to write to
    url        : page url, used as the lookup key of the document
    html       : page markup to store
    meta       : optional dict of extra fields; an empty dict is stored when
                 omitted
    
    Returns:
    --------
    Whatever `update_one` returns (a pymongo UpdateResult for a real
    collection)
    """
    # Build a fresh dict per call rather than sharing one mutable default
    if meta is None:
        meta = {}

    outcome = db[collection].update_one(
        {'url': url},
        {'$set': {'url': url, 'html': html, 'meta': meta}},
        upsert=True,
    )
    log.debug('Saved to DB')
    return outcome

In [14]:
# Open an HTTP session and log in; later article requests go through
# `session_requests` so they share this session's cookies

LOGIN_URL = "https://myaccount.nytimes.com/auth/login"
session_requests = requests.Session()

# Get login csrf token; fail early on an HTTP error instead of parsing an
# error page, and never wait forever on a stalled connection
result = session_requests.get(LOGIN_URL, timeout=30)
result.raise_for_status()
tree = html.fromstring(result.text)

# The token sits in a single-quoted, JSON-like attribute of the #myAccountAuth
# div; the quotes are swapped so that json.loads accepts it
auth_options = tree.xpath("//div[@id='myAccountAuth']/@data-auth-options")
if not auth_options:
    raise RuntimeError("Login page has no #myAccountAuth 'data-auth-options' attribute")
authenticity_token = json.loads(auth_options[0].replace("'", "\""))['authToken']

# Create payload
payload = {
    "username": USERNAME, 
    "password": PASSWORD, 
    "csrfmiddlewaretoken": authenticity_token
}

# Perform login
# NOTE(review): a 2xx status only shows the POST was accepted; it does not by
# itself prove the credentials were valid — verify with an authenticated page
result = session_requests.post(
    LOGIN_URL,
    data=payload,
    headers=dict(referer=LOGIN_URL),
    timeout=30,
)
result.raise_for_status()

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): myaccount.nytimes.com
DEBUG:urllib3.connectionpool:https://myaccount.nytimes.com:443 "GET /auth/login HTTP/1.1" 200 1505
DEBUG:urllib3.connectionpool:https://myaccount.nytimes.com:443 "POST /auth/login HTTP/1.1" 200 1534


In [15]:
result

<Response [200]>

In [16]:
# newsapi.org request settings
api_url = 'https://newsapi.org/v2/everything?'
source = 'the-new-york-times'
pageSize = 100
page = 1

# Date range to cover
earliest_date, latest_date = map(date_parser, ('2017-01-01', '2017-11-06'))

# MongoDB collection that receives the scraped articles
collection = 'nytimes'

# logging.getLogger('urllib3').setLevel(logging.WARNING)

# Query parameters sent along with each API request
params = {
    'apiKey': apiKey,
    'pageSize': pageSize,
    'page': page,
    'from': earliest_date,
    'to': latest_date,
    'sources': source,
}

In [None]:
# scrape news
# Walk backwards from latest_date to earliest_date in 30-day windows. For each
# window, page through the API listing, download every listed article through
# the logged-in session and upsert it into MongoDB.
WINDOW = timedelta(30)
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the scrape

# Local cursor: latest_date itself is left untouched so the cell can be re-run
window_end = latest_date

while window_end > earliest_date:
    # Clamp the final window so it never reaches before earliest_date.
    # Consecutive windows share one boundary instant; saveToDB upserts by url,
    # so an article returned twice is stored once.
    window_start = max(window_end - WINDOW, earliest_date)
    log.debug(f'Requesting period: {window_start}-{window_end}')

    page = 1
    totalPages = 1  # provisional; replaced once page 1 reports totalResults

    while page <= totalPages:
        log.debug(f'\n\n PROCESSING PAGE: {page}\n')

        # Per-request copy: the shared params dict is not mutated, `page`
        # always matches the page being fetched, and dates go out as ISO 8601
        query = {
            **params,
            'from': window_start.isoformat(),
            'to': window_end.isoformat(),
            'page': page,
        }

        try:
            r = requests.get(api_url, params=query, timeout=REQUEST_TIMEOUT)
            listing = r.json()
        except (requests.RequestException, ValueError) as e:
            log.error(f'Listing request failed for page {page}: {e}')
            break

        # Error payloads carry no 'articles'/'totalResults'; stop paging this
        # window instead of failing with a KeyError
        if listing.get('status') != 'ok':
            log.error(f"API error on page {page}: {listing.get('code')} {listing.get('message')}")
            break

        if page == 1:
            # Ceiling division: a trailing partial page counts as a page, and a
            # window with fewer than pageSize results still gets processed
            totalPages = -(-listing.get('totalResults', 0) // pageSize)
            log.debug(f'TOTAL PAGES FOR {source}: {totalPages}')

        for a in listing.get('articles', []):
            url = a.get('url')
            try:
                log.debug(f"Processing url: {url}")
                article_page = session_requests.get(
                    url, headers=dict(referer=url), timeout=REQUEST_TIMEOUT
                )
                # Do not store error pages as if they were articles
                article_page.raise_for_status()

                soup = bs.BeautifulSoup(article_page.text, 'lxml')
                text = ''.join(
                    d.text
                    for d in soup.find_all('div', {'class': 'StoryBodyCompanionColumn'})
                )

                saveToDB(db, collection, url, article_page.text, meta={
                    'date'    : date_parser(a['publishedAt']),
                    'title'   : a['title'],
                    'text'    : text,
                    'authors' : a['author']
                })

            except Exception:
                # Best effort: one bad article must not stop the scrape, but
                # keep the traceback so failures stay visible
                log.exception(f'Failed to process url: {url}')

        page += 1

    window_end = window_start

log.debug('Ended scrape')

In [18]:
# Earliest stored publication date in the collection.
# An empty find() matches every document. pymongo's sort() takes a list of
# (field, direction) tuples — .sort([("meta.date", 1)]) — where the mongo
# shell takes a document — .sort({"meta.date": 1}). Direction 1 is ascending,
# so the first item of the resulting cursor is the oldest document; its date
# is read from the nested 'meta' dict.
oldest_first = db[collection].find().sort([("meta.date", 1)]).limit(1)
oldest_first[0]['meta']['date']

datetime.datetime(2017, 11, 5, 11, 0, 47)