In [1]:
%load_ext autoreload
%autoreload 2

# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
import pytz
from datetime import timedelta

import requests

from resources.config import apiKey

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# define db 
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
db = client[DB_NAME]

def saveToDB(db, collection, url, html, meta={}):
    """
    Saves a document to mongoDB, making sure there are no duplicates by 
    'url' value
    
    Parameters:
    --------
    db, collection  : mongo db connection
    url, html, meta : values to store
    
    Returns:
    --------
    Saved document
    """
    collection = db[collection]
    collection.update_one(
        {'url' : url},
        {
            '$set':
                {'url' : url,
                 'html' : html,
                 'meta' : meta
                }
        }
        ,
        upsert=True
    )
    log.debug(f'Saved to DB')

def scrape(url, db, collection):
    '''
    Scrapes an article from the 'url' up to the 'latest_date'
    
    Parameters:
    --------
    url         : main news website url
    date        : YYYY-MM-DD
    db          : database name
    collection  : mongodb collection
    
    Returns:
    --------
    Article's html and features stored to db, 
    Article's publish date
    
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        article = Article(url)
        article.download()
        # the below method may only extract a snippet... 
        # check the database for results of text extraction
        # and apply additional processing if needed after 
        # article has been stored in the DB
        # see code below Newrepublic for example
        article.parse()
    except Exception as e:
        log.critical(f'Data not saved: {e}')
        return datetime.datetime.now()
    
    saveToDB(db, collection, article.url, article.html, meta={
        'date'    :article.publish_date,
        'title'   :article.title,
        'text'    :article.text,
        'authors' :article.authors
    })
    
    return article.publish_date

In [3]:
# setup newsapi.org credentials
collection = 'wsj'

logging.getLogger('urllib3').setLevel(logging.WARNING)

page     = 1
source   = 'the-wall-street-journal'
pageSize = 100

earliest_date = date_parser('2017-01-01')
latest_date = date_parser('2018-07-31')

params = {
        'apiKey'   : apiKey,
        'pageSize' : pageSize,
        'page'     : page,
        'from'     : earliest_date,
        'to'       : latest_date,
        'sources'  : source
    }

# base url
api_url = 'https://newsapi.org/v2/everything?'

In [4]:
# scrape news
while latest_date > earliest_date:
    log.debug(f'Requesting period: {latest_date - timedelta(30)}-{latest_date}')
       
    page = 1
    params['from'] = latest_date - timedelta(30)
    params['to']   = latest_date
    
    r = requests.get(api_url, params=params)
    break

    totalPages = r.json()['totalResults']//100+1

    log.debug(f'TOTAL PAGES FOR {source}: {totalPages}')
    
    for p in range(page,totalPages):
        log.debug(f'\n\n PROCESSING PAGE: {page} of {totalPages}\n')

        params['page'] = page
        page += 1

        r = requests.get(api_url, params=params)

        for a in r.json()['articles']:
            try:
                url = a['url']
                log.debug(f"Processing url: {url}")
                result = requests.get(url)
                soup = bs.BeautifulSoup(result.text, 'lxml')
                text = soup.find('article').text

                saveToDB(db, collection, url, result.text, meta={
                    'date'    : date_parser(a['publishedAt']),
                    'title'   : a['title'],
                    'text'    : text,
                    'authors' : a['author']
                })

            except Exception as e:
                log.debug(e)

    latest_date -= timedelta(30)
            
log.debug('Ended scrape')

DEBUG:__main__:Requesting period: 2018-07-01 00:00:00-2018-07-31 00:00:00
DEBUG:__main__:Ended scrape


In [6]:
r.json()['articles'][0]

{'source': {'id': 'the-wall-street-journal',
  'name': 'The Wall Street Journal'},
 'author': 'Anthony Harrup',
 'title': 'The Wall Street Journal: No deaths reported in Mexico airliner crash',
 'description': 'An Aeromexico airliner has crashed in the northern Durango state of Mexico, however the state’s governor says no one was killed.',
 'url': 'https://www.wsj.com/articles/airliner-crashes-in-mexico-no-reported-deaths-1533076351',
 'urlToImage': 'http://s.marketwatch.com/public/resources/MWimages/MW-GN527_durang_MG_20180731194618.jpg',
 'publishedAt': '2018-07-31T23:49:59Z'}

In [7]:
url = 'https://www.wsj.com/articles/airliner-crashes-in-mexico-no-reported-deaths-1533076351'
r = requests.get(url)
soup = bs.BeautifulSoup(r.text, 'lxml')

In [9]:
soup.findAll('p')

[<p class="">August 1, 2018</p>,
 <p>An Aeromexico airliner crashed during takeoff Tuesday in the northern Mexican state of Durango, causing a number of injuries but no deaths, the airline and authorities said.</p>,
 <p>Aeromexico said Flight 2431, an Embraer 190 plane with 97 passengers and four crew members aboard, was heading from Durango City to Mexico City. </p>,
 <p>“At the moment we don’t...
   </p>,
 <p class="style__column-name_2q_SeZUL5gpEK9-5nObB5R ">WSJ Membership</p>,
 <p class="style__column-name_2q_SeZUL5gpEK9-5nObB5R ">Customer Service</p>,
 <p class="style__column-name_2q_SeZUL5gpEK9-5nObB5R ">Tools &amp; Features</p>,
 <p class="style__column-name_2q_SeZUL5gpEK9-5nObB5R ">Ads</p>,
 <p class="style__column-name_2q_SeZUL5gpEK9-5nObB5R ">More</p>]

VIA_TWITTER = ["wsj.com"]

def changeRefer(details)

    foundReferer = False
    foundUA = False

    def useTwitter = VIA_TWITTER.map(function(url) {
    if (details.url.includes(url)) {
      return true;
    }
    return false;
  })
  .reduce(function(a, b) { return a || b}, false);

  var reqHeaders = details.requestHeaders.filter(function(header) {

    // block cookies by default
    if (header.name !== "Cookie") {
      return header;
    } 

  }).map(function(header) {
    
    if (header.name === "Referer") {
      header.value = setRefer(useTwitter);
      foundReferer = true;
    }
    if (header.name === "User-Agent") {
      header.value = setUserAgent(useTwitter);
      foundUA = true;
    }
    return header;
  })
  
  // append referer
  if (!foundReferer) {
    reqHeaders.push({
      "name": "Referer",
      "value": setRefer(useTwitter)
    })
  }
  if (!foundUA) {
    reqHeaders.push({
      "name": "User-Agent",
      "value": setUserAgent(useTwitter)
    })
  }
  return {requestHeaders: reqHeaders};
}

function blockCookies(details) {
  for (var i = 0; i < details.responseHeaders.length; ++i) {
    if (details.responseHeaders[i].name === "Set-Cookie") {
      details.responseHeaders.splice(i, 1);
    }
  }
  return {responseHeaders: details.responseHeaders};
}

function setRefer(useTwitter) {
  if (useTwitter) return "https://t.co/T1323aaaa"; 
  else return "https://www.google.com/";
}

function setUserAgent(useTwitter) {
  if (useTwitter) return "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.1.32 (KHTML, like Gecko) Mobile/14C92 Twitter for iPhone";
  else return "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
}

chrome.webRequest.onBeforeSendHeaders.addListener(changeRefer, {
  urls: ["<all_urls>"],
  types: ["main_frame"],
}, ["requestHeaders", "blocking"]);

chrome.webRequest.onHeadersReceived.addListener(blockCookies, {
  urls: ["<all_urls>"],
  types: ["main_frame"],
}, ["responseHeaders", "blocking"]);