In [None]:

%load_ext autoreload
%autoreload 2

# For debugging turn on logging to console
import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# mongodb
import pymongo as pm

# fine-tuned newspaper lib
from resources.newspaper import newspaper
from resources.newspaper.newspaper.source import Source
from resources.newspaper.newspaper.article import Article

import bs4 as bs
from urllib.parse import urljoin
from dateutil.parser import parse as date_parser
from time import sleep
import random
import pytz
import datetime

import requests

In [None]:
# for infinite scroll page
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import selenium
import sys

import unittest, time, re

# to divert selenium log stream away
logging.getLogger('selenium').setLevel(logging.WARNING)

In [None]:
# MongoDB setup: pick the database name first, then open a pooled client
# against the local server and grab a handle to that database.
DB_NAME = 'FINALP'  # the other database name used with this notebook is 'scrape'

conn = 'mongodb://localhost:27017'
client = pm.MongoClient(host=conn, maxPoolSize=200)
db = client.get_database(DB_NAME)

def saveToDB(db, collection, url, html, meta=None):
    """
    Saves a document to mongoDB, making sure there are no duplicates by
    'url' value (the document is upserted with 'url' as the lookup key).

    Parameters:
    --------
    db          : mongo database handle; indexed as db[collection]
    collection  : name of the collection to write to
    url         : url of the page, also used as the unique key
    html        : value stored under the 'html' field
    meta        : dict stored under the 'meta' field; defaults to a new
                  empty dict on every call

    Returns:
    --------
    None. The document is written to the database as a side effect.
    """
    # Build a fresh dict per call: a `{}` default argument would be a single
    # object shared by (and stored into) every document saved without `meta`.
    if meta is None:
        meta = {}

    document = {
        'url': url,
        'html': html,
        'meta': meta,
    }
    # upsert=True inserts when no document with this url exists, and
    # overwrites the stored fields when one does.
    db[collection].update_one({'url': url}, {'$set': document}, upsert=True)
    log.debug('Saved to DB')

def scrape(url, db, collection):
    '''
    Downloads and parses the article at 'url' and stores it in mongoDB.

    Parameters:
    --------
    url         : url of the article to scrape
    db          : mongo database handle (passed through to saveToDB)
    collection  : name of the mongodb collection to store the article in

    Returns:
    --------
    The value of article.publish_date after parsing, once the article's
    html and features (date, title, text, authors) have been stored.
    If downloading or parsing fails nothing is stored and the current
    time is returned as a timezone-aware UTC datetime.
    '''
    log.debug(f"Exctracting features from {url}")
    try:
        article = Article(url)
        article.download()
        # parse() may only extract a snippet of the text...
        # check the database for results of text extraction
        # and apply additional processing if needed after
        # the article has been stored in the DB
        # (see code below Newrepublic for example)
        article.parse()
    except Exception as e:
        # Best effort: one failing article must not stop the whole crawl.
        log.critical(f'Data not saved: {e}')
        # Timezone-aware on purpose: the crawl loop compares this value with
        # a UTC-localized cutoff, and comparing a naive datetime with an
        # aware one raises TypeError.
        return datetime.datetime.now(datetime.timezone.utc)

    saveToDB(db, collection, article.url, article.html, meta={
        'date': article.publish_date,
        'title': article.title,
        'text': article.text,
        'authors': article.authors,
    })

    return article.publish_date

def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB by running the `mongoimport` command line
    tool through an IPython shell escape (so this only works inside
    IPython/Jupyter).
    
    Parameters:
    --------
    DB_NAME : Name of the database to import into
    COL_NAME: Name of the collection to import into
    PATH    : Path to the folder with the file; it is concatenated with
              FILE as plain strings, so it must end with a path separator
    FILE    : Filename
    
    Returns:
    --------
    None. The data is imported into collection COL_NAME of database
    DB_NAME as a side effect of the shell command.
    '''
    # The values are interpolated into the shell command as-is (no quoting).
    !mongoimport --db {DB_NAME} --collection {COL_NAME} --file {PATH+FILE} --batchSize 1
    # NOTE(review): this message is printed unconditionally; the exit status
    # of mongoimport is not checked.
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [None]:
# Scrape target: the site root opened by the Selenium helper, and the name of
# the MongoDB collection the scraped articles are written to.
source, collection = 'https://www.dailywire.com', 'dailywire'

# Only let Selenium's own log records through at WARNING and above.
logging.getLogger('selenium').setLevel(logging.WARNING)

class Sel(unittest.TestCase):
    """
    Small Selenium/Chrome helper for loading pages of the target site.

    It subclasses unittest.TestCase but is used as a plain object: the
    notebook creates it with Sel() and calls setUp(), getPage(),
    scrollDown() and shutdown() by hand.

    Attributes set by the methods:
    driver   : the Chrome webdriver session (setUp)
    base_url : url opened by getPage, taken from the global `source` (setUp)
    html     : utf-8 encoded source of the most recently read page
    """

    def setUp(self):
        """Start a Chrome session and initialise the per-session state."""
        self.driver = webdriver.Chrome()
        # Element lookups keep retrying for up to 30 seconds before failing.
        self.driver.implicitly_wait(30)
        self.base_url = source
        self.verificationErrors = []
        self.accept_next_alert = True

    def getPage(self):
        """
        Open `base_url` in the browser.

        Returns the page source encoded as utf-8 bytes (also kept in
        `self.html`).
        """
        self.driver.get(self.base_url)
        self.html = self.driver.page_source.encode('utf-8')
        return self.html

    def scrollDown(self):
        """
        Load more content by clicking the page's 'button.load-more' element.

        Returns the resulting page source as text; the utf-8 encoded form
        is kept in `self.html`.
        """
        # Use this instance's own driver rather than a module-level object,
        # so the method keeps working whatever the globals are bound to.
        driver = self.driver
        # find_element(By..., ...) is the locator API available in both
        # Selenium 3 and 4; the find_element_by_* helpers were removed in 4.
        driver.find_element(By.CSS_SELECTOR, 'button.load-more').click()
        page_source = driver.page_source
        self.html = page_source.encode('utf-8')
        return page_source

    def shutdown(self):
        """Quit the browser session."""
        self.driver.quit()

# Start the browser session and load the landing page. The order matters:
# setUp() creates the Chrome driver that getPage() then uses.
data = Sel()

data.setUp()
data.getPage()
# Page counter; it is not read anywhere else in this cell.
page = 1

In [None]:
# Crawl the site's JSON article feed page by page, scraping every article
# not seen before, until an article older than `earliest_date` is reached
# or the feed runs out.
utc = pytz.UTC
earliest_date = utc.localize(date_parser('2017-01-01'))

base_url = 'https://www.dailywire.com/'
scraped_urls = []
# Offset into the article feed to start from; it advances by the page size
# (15, matching `limit=15` in the request) on every iteration.
x = 4100

while True:
    log.debug(f'NEXT SCROLL (#{x})')

    api_url = f'https://www.dailywire.com/api/v1/articles/retrieve.json?limit=15&offset={x}'
    x += 15

    # Bound the wait so a stalled connection cannot hang the crawl, and fail
    # loudly on an HTTP error instead of trying to parse an error page.
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()

    # Deliberately NOT named `data`: that name holds the Selenium session
    # object, which the final cell still needs in order to quit the browser.
    articles = response.json()
    if not articles:
        # Past the end of the feed: without this check the loop would keep
        # requesting empty pages forever.
        log.debug(f'No more articles returned at offset {x - 15}, stopping')
        break

    # Date of the last article scraped on this page (None if every article
    # on the page had already been scraped).
    article_date = None
    for article in articles:
        url = urljoin(base_url, article['url'])

        if url and url not in scraped_urls:
            scraped_urls.append(url)
            log.debug(f'Processing url: {url}')
            article_date = scrape(url, db, collection)
            log.debug(f"Date scraped: {article_date}")

    if article_date is None:
        # Nothing new on this page, or the parser found no publish date.
        continue

    try:
        # NOTE(review): only the last scraped article of each page is checked,
        # which presumably relies on the feed being ordered newest first.
        if article_date < earliest_date:
            log.debug(f'Reached earliest date requested: {article_date}')
            break
    except TypeError as e:
        # e.g. a timezone-naive publish date compared with the aware cutoff
        log.debug(f"Something is wrong: {e}")

In [None]:
# Quit the Chrome session opened by data.setUp(). `data` must still be bound
# to the Sel instance when this cell runs.
data.shutdown()