## Scraper code

In [1]:
from libs.article import Article

import re
import requests

from bs4 import BeautifulSoup
from sqlalchemy.orm import sessionmaker

from libs.sqlcreator import create_alchemy_engine
from libs.multi_thread import multi_thread

In [2]:
import string

from collections import Counter
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

def get_most_common(text, n=15, stop_words=None):
    """Return the most frequent non-stop-words in ``text``, most common first.

    ASCII punctuation (``string.punctuation``) is stripped, the text is
    lower-cased and split on any whitespace, and words that are stop words or
    shorter than two characters are dropped before counting.

    Args:
        text: The raw text to analyse.
        n: Maximum number of words to return (defaults to 15).
        stop_words: Collection of lower-case words to ignore. Defaults to the
            ``ENGLISH_STOP_WORDS`` set imported by this notebook.

    Returns:
        A list of at most ``n`` words ordered by descending frequency.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    text_nopunct = text.translate(str.maketrans('', '', string.punctuation))

    # split() without an argument breaks on every whitespace run. Splitting on
    # " " alone glued together words separated only by newlines or tabs, e.g.
    # the last word of one paragraph and the first word of the next.
    words = [
        word
        for word in text_nopunct.lower().split()
        if len(word) > 1 and word not in stop_words
    ]

    return [word for word, _count in Counter(words).most_common(n)]


# Ad-hoc check of get_most_common against an article saved as a text file
# in the working directory.
with open("npr_article_572945894.txt") as file_hdl:
    article_text = file_hdl.read()

# Placeholder URL for this manual check.
url = "npr.org/testing"

# Everything before the first newline of the file is taken as the title.
title = article_text.partition("\n")[0]

most_common = get_most_common(article_text)


In [3]:
def get_text(soup):
    """Concatenate the text of the plain ``<p>`` elements found in ``soup``.

    A paragraph is kept only if it has no ``class`` attribute and contains no
    ``<b>`` descendant. Each kept paragraph is followed by a blank line
    (``"\\n\\n"``); an empty string is returned when nothing qualifies.
    """
    body_chunks = (
        paragraph.get_text() + "\n\n"
        for paragraph in soup.find_all('p')
        if not (paragraph.has_attr('class') or paragraph.findChildren('b'))
    )
    return "".join(body_chunks)

In [4]:
def get_npr_urls(soup):
    """Collect the NPR links in ``soup`` that carry a date/id path segment.

    Only ``<a href=...>`` targets that start with ``https://www.npr.org/`` and
    contain a ``20YY/M/D/<digits>/`` segment are returned. Document order is
    preserved and duplicates are not removed.
    """
    dated_path = re.compile(r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/')
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    return [
        href
        for href in hrefs
        if href.startswith('https://www.npr.org/') and dated_path.search(href)
    ]

In [5]:
def get_title(soup):
    """Return the article title found in ``soup``, or ``""`` if there is none.

    The title is the text of the first ``<b>`` element inside an ``<a>`` whose
    first CSS class is ``title``. When several links qualify, the last one in
    the document wins.

    Links that cannot supply a title (an empty ``class`` list, or no ``<b>``
    child) are skipped instead of raising ``IndexError``.
    """
    title = ""
    for link in soup.find_all('a'):
        if not link.has_attr('class'):
            continue

        classes = link['class']
        if not classes or classes[0] != "title":
            continue

        bold_children = link.findChildren('b')
        if bold_children:
            title = bold_children[0].get_text()

    return title

In [6]:
def get_details(url):
    """Extract the publication date and article id embedded in an NPR URL.

    The URL must contain a ``20YY/M/D/<digits>/`` path segment, e.g.
    ``https://www.npr.org/2018/05/04/608323118/some-slug``.

    Args:
        url: The article URL.

    Returns:
        A ``(date, article_id)`` tuple of strings. ``date`` is formatted as
        ``YYYY-MM-DD``; month and day are zero-padded so that dates taken from
        URLs with single-digit components still sort correctly as text.

    Raises:
        ValueError: If ``url`` contains no date/id segment.
    """
    pattern = r'(20\d{2})/(\d{1,2})/(\d{1,2})/(\d+)/'

    match = re.search(pattern, url)
    if match is None:
        # Previously this surfaced as an opaque TypeError on ``None[0]``.
        raise ValueError("no date/article-id segment in URL: %r" % (url,))

    year, month, day, article_id = match.groups()
    date = "-".join((year, month.zfill(2), day.zfill(2)))

    return date, article_id

In [7]:
def scrape_url(url, timeout=30):
    """Download ``url`` and extract its outgoing article links and article.

    Args:
        url: Page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a stalled server would block the calling worker indefinitely.

    Returns:
        A dict with key ``"urls"`` (the list produced by ``get_npr_urls``).
        When the page yields more than 100 characters of paragraph text, the
        dict also has key ``"article"`` holding an ``Article`` built from the
        title, text, URL, most common words and date.

    Exceptions from ``requests.get`` (including timeouts) and from
    ``get_details`` are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The link list is computed once; the original parsed it a second time
    # into a variable that was never used.
    return_dict = {"urls": get_npr_urls(soup)}

    text = get_text(soup)

    # Pages with 100 characters or fewer of body text are not stored as articles.
    if len(text) > 100:
        title = get_title(soup)
        date, _article_id = get_details(url)

        return_dict["article"] = Article(title, text, url, get_most_common(text), date)

    return return_dict

In [8]:
def is_url_in_db(session, url):
    """Return True when at least one ``Article`` row has this exact ``url``."""
    matching_rows = session.query(Article).filter(Article.url == url)
    return matching_rows.count() > 0

In [9]:
# Manual check: scrape one known article page and display the resulting Article.
article_dict = scrape_url(
    "https://www.npr.org/2018/05/04/608323118/"
    "in-wake-of-school-shooting-trump-pence-to-address-nra"
)
article_dict["article"]

<libs.article.Article at 0x7f9cecc5d9e8>

In [10]:
## Crawl state
## start_url      - page the crawl begins from
## traversed_urls - URLs that have already been handled
## to_traverse    - URLs discovered but not yet scraped

start_url = "https://www.npr.org/"
traversed_urls = set()

to_traverse = set()
# The original cell called ``l.add(...)`` on an undefined name.
to_traverse.add(start_url)

In [11]:
# Database engine for the article store.
# NOTE(review): connection settings come from libs.sqlcreator.create_alchemy_engine,
# which is not shown in this notebook - confirm where they are configured.
engine = create_alchemy_engine()

# Session factory bound to that engine, and the single session the
# database queries later in this notebook run on.
Session = sessionmaker(bind=engine)
session = Session()

In [12]:
# Seed the visited set with every URL already stored in the database.
# Querying a single column yields one-element row tuples, so each row is
# unpacked to add the URL string itself. Adding the row object would never
# match the plain strings collected while scraping.
for (url,) in session.query(Article.url).distinct():
    traversed_urls.add(url)
len(traversed_urls)

0

In [14]:

## Crawl in batches until the queue is empty or the database holds 10000 articles.
## ``count`` is initialised here so the cell runs on a fresh kernel.
count = session.query(Article).count()

while len(to_traverse) > 0 and count < 10000:

    print("DB count:", count, "Traversed:", len(traversed_urls), "Queue:", len(to_traverse))
    ## Create a list of up to 100 URLs to traverse, skipping ones already stored
    curr_traverse = []
    while len(curr_traverse) < 100 and len(to_traverse) > 0:
        url = to_traverse.pop()
        if is_url_in_db(session, url):
            # Mark as visited so the URL is not re-queued when it is rediscovered.
            traversed_urls.add(url)
            continue
        curr_traverse.append(url)

    # An empty batch means the queue was drained by URLs already in the
    # database, so there is nothing left to scrape.
    if not curr_traverse:
        break

    ## Pass that list to multi-threading (10 is forwarded as the third argument)
    results = multi_thread(scrape_url, curr_traverse, 10)

    ## Everything in this batch now counts as traversed (set.update, not add_all)
    traversed_urls.update(curr_traverse)

    ## Store scraped articles and queue newly discovered URLs
    for result in results:
        # Element 1 of each result is the dict returned by scrape_url.
        # NOTE(review): element 0 is presumably the input URL - confirm in libs.multi_thread.
        result_dict = result[1]
        if "article" in result_dict:
            session.add(result_dict["article"])

        to_traverse |= set(result_dict["urls"]) - traversed_urls

    session.commit()
    count = session.query(Article).count()
              

DB count: 0
DB count: 0
DB count: 32
DB count: 105
DB count: 196
DB count: 292
DB count: 387
DB count: 479
DB count: 565
DB count: 653
DB count: 732
DB count: 807
DB count: 880
DB count: 956
DB count: 1023
DB count: 1083
DB count: 1160
DB count: 1239
DB count: 1320
DB count: 1390
DB count: 1466
DB count: 1541
DB count: 1606
DB count: 1673
DB count: 1742
DB count: 1807
DB count: 1861
DB count: 1924
DB count: 1987
DB count: 2050
DB count: 2108
DB count: 2162
DB count: 2215
DB count: 2272
DB count: 2328
DB count: 2382
DB count: 2432
DB count: 2478
DB count: 2521
DB count: 2567
DB count: 2634
DB count: 2707
DB count: 2777
DB count: 2843
DB count: 2905
DB count: 2965
DB count: 3027
DB count: 3087
DB count: 3145
DB count: 3208
DB count: 3267
DB count: 3322
DB count: 3370
DB count: 3418
DB count: 3465
DB count: 3509
DB count: 3555
DB count: 3595
DB count: 3625
DB count: 3671
DB count: 3714
DB count: 3754
DB count: 3789
DB count: 3827
DB count: 3868
DB count: 3902
DB count: 3939
DB count: 3971

KeyboardInterrupt: 

In [None]:
# Number of Article rows returned by a count query on the current session.
rows = session.query(Article).count()
rows

In [15]:
# Number of discovered URLs still waiting in the crawl queue.
len(to_traverse)

6196

set()

In [None]:
from sqlalchemy import desc

# Print title and URL for rows 11-20 of the articles ordered by date, newest
# first (10 rows after skipping the first 10). Drop .limit()/.offset() to
# list every article instead.
second_page = session.query(Article).order_by(desc(Article.date)).limit(10).offset(10)

for instance in second_page:
    print(instance.title, instance.url)