# Dependencies

In [1]:
# %load_ext autoreload
# %autoreload 2

# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz
import datetime

import requests

logging.getLogger('urllib3').setLevel(logging.WARNING)

In [None]:
# for infinite scroll page
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys

import unittest, time, re

# to divert selenium log stream away
logging.getLogger('selenium').setLevel(logging.WARNING)


# Helpers

In [20]:
# Initialize PyMongo to work with MongoDBs
# `conn` is the connection URI: a MongoDB server on localhost, port 27017.
conn = 'mongodb://localhost:27017'
# maxPoolSize=200 is handed to MongoClient as its connection-pool limit.
client = pm.MongoClient(conn, maxPoolSize=200)

# define db 
# DB_NAME = 'scrape'
DB_NAME = 'FINALP'
# `db` is the database handle that the scraping cells pass to scrape().
db = client[DB_NAME]

def saveToDB(db, collection, url, html, meta=None):
    """
    Upserts one scraped page into mongoDB, keyed by its 'url', so that
    saving the same url again overwrites the stored document instead of
    creating a duplicate.

    Parameters:
    --------
    db          : mapping from collection name to collection object
                  (e.g. a pymongo database)
    collection  : name of the collection to write to
    url         : page url; used as the unique key of the document
    html        : raw html of the page
    meta        : optional dict of extracted features; an empty dict is
                  stored when omitted

    Returns:
    --------
    None
    """
    # Build a fresh dict per call: a `{}` default is a single object shared
    # by every call that omits `meta`, and it ends up inside the document.
    if meta is None:
        meta = {}

    db[collection].update_one(
        {'url': url},
        {'$set': {'url': url, 'html': html, 'meta': meta}},
        upsert=True
    )
    log.debug('Saved to DB')

def scrape(url, db, collection):
    '''
    Downloads and parses a single article, stores it through saveToDB and
    reports its publish date.

    Parameters:
    --------
    url         : url of the article to fetch
    db          : database handle, handed on to saveToDB
    collection  : name of the mongodb collection to store into

    Returns:
    --------
    The article's publish date, after its html and features (date, title,
    text, authors) have been saved.
    If downloading or parsing raises, nothing is saved and the current
    local time (datetime.datetime.now()) is returned instead.
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        page = Article(url)
        page.download()
        # parse() may recover only a snippet of the text: inspect what was
        # stored and post-process it afterwards if needed
        # (see code below Newrepublic for example)
        page.parse()
    except Exception as err:
        log.critical(f'Data not saved: {err}')
        return datetime.datetime.now()
    else:
        # Saving happens outside the try body, so storage errors propagate
        # to the caller instead of being reported as a failed download.
        features = dict(
            date=page.publish_date,
            title=page.title,
            text=page.text,
            authors=page.authors,
        )
        saveToDB(db, collection, page.url, page.html, meta=features)
        return page.publish_date

def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command-line
    tool.

    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file; it is concatenated with FILE
              as-is, so it must end with a path separator
    FILE    : Filename

    Returns:
    --------
    None. On success a confirmation that collection COL_NAME was created
    in the DB_NAME database is printed; on failure the reason is printed
    instead.
    '''
    # Imported locally so the helper does not depend on the imports cell.
    import subprocess

    # An argument list (no shell) keeps names and paths that contain spaces
    # or shell metacharacters from being split or interpreted.
    command = [
        'mongoimport',
        '--db', str(DB_NAME),
        '--collection', str(COL_NAME),
        '--file', PATH + FILE,
        '--batchSize', '1',
    ]
    try:
        # Capture the tool's output so it can be shown in the notebook
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except OSError as e:
        # e.g. mongoimport is not installed or not on the search path
        print(f'Collection {COL_NAME} in {DB_NAME} database NOT created: {e}')
        return

    if result.stdout:
        print(result.stdout, end='')

    if result.returncode != 0:
        print(f'Collection {COL_NAME} in {DB_NAME} database NOT created: '
              f'mongoimport exited with code {result.returncode}')
        return

    print(f'Collection {COL_NAME} in {DB_NAME} database created')

# NY Post

In [None]:
# Name of the mongodb collection the scraped NY Post articles are saved to
collection = 'nypost'
# Listing page the selenium driver opens and scrolls; relative links found
# on it are resolved against this url
source = 'https://nypost.com/news/'

class Sel(unittest.TestCase):
    """
    Small wrapper around a selenium Chrome driver for reading a page that
    is extended by scrolling. The notebook uses it as a plain object and
    calls setUp() by hand.
    """

    def setUp(self):
        """Start Chrome and remember the url to open (module-level `source`)."""
        self.driver = webdriver.Chrome()
        self.driver.implicitly_wait(30)
        self.base_url = source
        self.verificationErrors = []
        self.accept_next_alert = True

    def getPage(self):
        """Open base_url; keep and return the page source as utf-8 bytes."""
        self.driver.get(self.base_url)
        self.html = self.driver.page_source.encode('utf-8')
        return self.html

    def scrollDown(self):
        """
        Scroll to the bottom of the page. The utf-8 encoded source is kept
        in self.html, while the returned value is the source as str.
        """
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.html = self.driver.page_source.encode('utf-8')
        return self.driver.page_source

    def shutdown(self):
        """Quit the driver."""
        self.driver.quit()

data = Sel()

data.setUp()
data.getPage()
page = 1

utc=pytz.UTC
# Timezone-aware cut-off: scraping stops at the first scroll whose last
# article is older than this.
earliest_date = utc.localize(date_parser('2017-01-01'))

# A set gives constant-time "already scraped?" checks.
scraped_urls = set()

# Give up after this many consecutive scrolls that reveal no new links, so
# the loop cannot spin forever once the feed runs out of articles.
MAX_IDLE_SCROLLS = 10
idle_scrolls = 0

try:
    while True:
        log.debug(f'NEXT SCROLL (#{page})')
        page += 1

        html = data.scrollDown()
        soup = bs.BeautifulSoup(html,'lxml')

        container = soup.find('div', {"id":"primary"})
        if container is None:
            log.critical('No <div id="primary"> on the page, stopping')
            break

        # Date returned for the last article scraped on this scroll
        article_date = None
        new_links = 0
        # href=True skips anchors that carry no href attribute
        for link in container.find_all('a', href=True):
            url = urljoin(source, link['href'])
            if url and url not in scraped_urls:
                scraped_urls.add(url)
                new_links += 1
                log.debug(f'Processing url: {url}')
                article_date = scrape(url, db, collection)

        if new_links == 0:
            idle_scrolls += 1
            if idle_scrolls >= MAX_IDLE_SCROLLS:
                log.debug(f'No new links after {idle_scrolls} scrolls, stopping')
                break
            # give the page time to load more articles before scrolling again
            sleep(1)
            continue
        idle_scrolls = 0

        if not isinstance(article_date, datetime.datetime):
            # the article had no usable publish date; keep scrolling
            log.debug(f"Something is wrong: no publish date ({article_date!r})")
            continue

        # A naive datetime cannot be compared with the aware cut-off; it is
        # treated as UTC, which is precise enough for a day-level cut-off.
        if article_date.tzinfo is None:
            article_date = utc.localize(article_date)

        if article_date < earliest_date:
            log.debug(f'Reached earliest date requested: {article_date}')
            break
finally:
    # Always release the browser, even when scraping fails midway
    data.shutdown()

In [None]:
collection = 'nypost'
source = 'https://nypost.com/news/page/'
page   = 1

utc=pytz.UTC
# Timezone-aware cut-off, so it can be compared with aware publish dates
earliest_date = utc.localize(date_parser('2017-01-01'))

# Set once an article older than the cut-off is seen. A plain `break` would
# only leave the innermost loop and the page loop would run forever.
reached_earliest = False

while not reached_earliest:
    log.debug(f'\n\n PROCESSING PAGE: {source+str(page)}\n\n\
              ====================================\n\n')
    s = Source(source+str(page))
    page += 1
    s.download()
    soup = bs.BeautifulSoup(s.html or '','lxml')

    container = soup.find('div', {"id":"primary"})
    if container is None:
        # e.g. the download failed or the page number is past the last page
        log.critical('No <div id="primary"> on the page, stopping')
        break

    for h3 in container.find_all('h3'):
        # href=True skips anchors that carry no href attribute
        for link in h3.find_all('a', href=True):
            url = link['href']
            log.debug(f'Processing url: {url}')

            try:
                article_date = scrape(url, db, collection)
            except Exception as e:
                log.debug(e)
                # one day past the cut-off, to make sure scraping continues
                article_date = earliest_date + datetime.timedelta(days=1)

            if not isinstance(article_date, datetime.datetime):
                # no usable publish date for this article; move on
                log.debug(f'Exception: no publish date ({article_date!r})')
                continue

            # A naive datetime cannot be compared with the aware cut-off; it
            # is treated as UTC, precise enough for a day-level cut-off.
            if article_date.tzinfo is None:
                article_date = utc.localize(article_date)

            if article_date < earliest_date:
                log.debug(f'Reached earliest date requested: {article_date}')
                reached_earliest = True
                break

        if reached_earliest:
            break

DEBUG:__main__:

 PROCESSING PAGE: https://nypost.com/news/page/1



DEBUG:__main__:Processing url: https://nypost.com/2018/07/30/pantsless-doctor-busted-in-bed-with-underage-boy/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/30/pantsless-doctor-busted-in-bed-with-underage-boy/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/30/us-teen-studying-abroad-drowns-while-swimming-in-israel/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/30/us-teen-studying-abroad-drowns-while-swimming-in-israel/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/30/investigators-gave-up-search-for-missing-flight-mh370-months-ago/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/30/investigators-gave-up-search-for-missing-flight-mh370-months-ago/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/30/canada-dry-ginger-ale-is-a-sham-suit/
DEBUG:__m

DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/28/england-could-see-a-massive-onion-potato-shortage/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/28/hundreds-attend-funeral-of-duck-boat-disaster-victims/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/28/hundreds-attend-funeral-of-duck-boat-disaster-victims/
DEBUG:__main__:Saved to DB
DEBUG:__main__:

 PROCESSING PAGE: https://nypost.com/news/page/4



DEBUG:__main__:Processing url: https://nypost.com/2018/07/28/california-wildfire-leaves-charred-remains-of-city-in-its-path/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/28/california-wildfire-leaves-charred-remains-of-city-in-its-path/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/28/jetblue-comes-through-after-bridesmaid-axed-from-wedding-party/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/28/jetblue-comes-through-after-br

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/recordings-show-change-in-weather-before-tourist-boat-sank/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/recordings-show-change-in-weather-before-tourist-boat-sank/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/ocasio-cortez-hits-the-campaign-trail-for-other-dems/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/ocasio-cortez-hits-the-campaign-trail-for-other-dems/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/officials-say-north-korea-returning-remains-shows-improved-relations/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/officials-say-north-korea-returning-remains-shows-improved-relations/
DEBUG:__main__:Saved to DB
DEBUG:__main__:

 PROCESSING PAGE: https://nypost.com/news/page/7



DEBUG:__main__:Processing url: https://nypost.com/2018/07/2

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/california-wildfire-turns-deadly/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/california-wildfire-turns-deadly/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/alaska-airlines-pilot-gets-prison-for-flying-while-intoxicated/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/alaska-airlines-pilot-gets-prison-for-flying-while-intoxicated/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/flight-attendants-save-apparent-human-trafficking-victims/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27/flight-attendants-save-apparent-human-trafficking-victims/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/27/trump-denies-approving-2016-meeting-with-russians/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/27

DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/26/man-injured-dies-after-meeting-with-internet-date/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/26/man-injured-dies-after-meeting-with-internet-date/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/26/surfer-dies-after-being-found-face-down-during-huge-swell/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/26/surfer-dies-after-being-found-face-down-during-huge-swell/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/26/teen-found-safe-man-arrested-after-reported-abduction/
DEBUG:__main__:Exctracting features from https://nypost.com/2018/07/26/teen-found-safe-man-arrested-after-reported-abduction/
DEBUG:__main__:Saved to DB
DEBUG:__main__:Processing url: https://nypost.com/2018/07/26/woman-poisoned-baby-in-twisted-plot-to-get-husband-back-cops/
DEBUG:__main__:Exctracting features from ht

In [25]:
FIX AUTHORS

True