In [24]:
#-*- coding: utf-8 -*-
import os, re, sys, json, pickle, requests, psycopg2, numpy
from bs4 import BeautifulSoup
from unidecode import unidecode
from datetime import datetime
from dateutil.parser import parse

In [11]:
def collect_soup(href):
    response = requests.get(href, timeout=5)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [12]:
nyt_section_base_href = 'https://nytimes.com/section/'
nyt_sections_pages = ['world', 'us', 'business', 'tech', 'science', 'health', 'sports']

In [13]:
collect_soup('https://www.nytimes.com/')

<!DOCTYPE html>

<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title data-rh="true">The New York Times - Breaking News, World News &amp; Multimedia</title>
<meta content="en-US" data-rh="true" itemprop="inLanguage"/><meta content="noarchive,noodp,noydir" data-rh="true" name="robots"/><meta content="The New York Times" data-rh="true" name="application-name"/><meta content="https://www.nytimes.com" data-rh="true" name="msapplication-starturl"/><meta content="name=Search;action-uri=https://www.nytimes.com/search/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/search.ico" data-rh="true" name="msapplication-task"/><meta content="name=Most Popular;action-uri=https://www.nytimes.com/gst/mostpopular.html?src=iepin;icon-uri=https://static01.nyt.com/images/icons/mostpopular.ico" data-rh="true" name="msapplication-task"/><meta content="name=Video;action-uri=https://video.nytimes.com/?src=iepin;icon-uri=https://static01.nyt.com/images/icons/video.ico" data-rh="

In [14]:
def collect_nyt_article_hrefs(base, sections):
    article_hrefs = list()
    for section in sections:
        page = base + section
        response = requests.get(page, timeout=5)
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article')
        for article in articles:
            anchors = article.find_all('a')
            for a in anchors:
                if 'tips' not in a['href'] \
                    and 'interactive' not in a['href'] \
                    and 'slideshow' not in a['href'] \
                    and '/video/' not in a['href'] \
                    and 'nytimes.com' not in a['href'] \
                    and len(a['href']) > 2:
                        article_hrefs.append(a['href'])
    return article_hrefs

In [25]:
nyt_article_hrefs = collect_nyt_article_hrefs(nyt_section_base_href, nyt_sections_pages)
nyt_article_hrefs = numpy.unique(nyt_article_hrefs)

In [26]:
nyt_article_hrefs

array(['/2019/07/16/world/canada/lac-megantic-quebec-train-explosion.html',
       '/2019/08/19/world/asia/thailand-inequality-road-fatalities.html',
       '/2019/08/22/world/asia/rohingya-myanmar-repatriation.html',
       '/2019/10/01/business/val-broeksmit-deutsche-bank-trump-whistle-blower.html',
       '/2019/10/07/world/americas/hurricane-irma-saint-martin.html',
       '/2019/10/10/world/americas/amazon-fires-brazil-cattle.html',
       '/2019/10/12/business/jeffrey-epstein-bill-gates.html',
       '/2019/10/15/science/cheese-fungus-mold-camembert.html',
       '/2019/10/15/science/giant-antlers-deer.html',
       '/2019/10/17/science/trilobites-fossils-marching.html',
       '/2019/10/18/business/kenneth-dart-cayman-islands.html',
       '/2019/10/20/world/middleeast/israel-election-rosh-haayin.html',
       '/2019/10/21/science/aye-aye-lemur-pseudo-thumb.html',
       '/2019/10/21/science/butterflies-moths-fossils-evolution.html',
       '/2019/10/21/science/carl-safina-anima

In [215]:
def find_author(soup):
    try:
        author = re.findall('By [ \.\w]*', str(soup))[0]
    except Exception as e:
        print(e)
        author = 'www.nytimes.com'
    print(author)
    return author

In [216]:
href = "https://www.nytimes.com/2019/10/22/sports/basketball/zion-williamson-knee-injury.html"
find_author(collect_soup(href))

By Jeré Longman


'By Jeré Longman'

In [217]:
def collect_paragraphs(soup):
    body = soup.find_all('section', {'name': 'articleBody'})
    paragraphs = list()
    for b in body:
        paragraphs.extend(b.find_all('p'))
    paragraphs = [str(unidecode(p)) for p in paragraphs]
    paragraphs = ''.join(paragraphs)
    return paragraphs

In [218]:
def parse_article_soup(soup, href):
    article = dict()
    entry = str(href).split('/')[-1]
    entry = entry.replace('.html', '')
    entry = entry.replace('#commentsContainer', '')
    article = dict()
    article['entry'] = entry
    article['meta'] = dict()
    article['meta']['title'] = unidecode(soup.find('title').text.replace(' - The New York Times', ''))
    article['meta']['author'] = find_author(soup)
    try:
        article['meta']['section'] = href.split('/')[4]
    except:
        pass
    article['meta']['href'] = "nytimes.com" + href
    article['meta']['date'] = "-".join(href.split('/')[1:4])
    article['meta']['path'] = str(href.split('/')[-1]).replace('#commentsContainer', '')
    article['article_body'] = collect_paragraphs(soup)
    return article

In [219]:
def build_nyt_articles_dict(hrefs):
    current_nyt_articles = dict()
    base = 'https://nytimes.com/'
    
    for href in hrefs:
        response = requests.get(base + href)
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'),
            'html.parser');
        article = parse_article_soup(soup, href)
        current_nyt_articles[article['entry']] = article
    
    return current_nyt_articles

In [220]:
def write_nyt_articles_files(articles_dict):
    nyt_articles_meta = dict()
    for key in articles_dict:
        article_file_path = key + '.html'
        nyt_articles_meta[key] = articles_dict[key]['meta']
        nyt_articles_meta[key]['file_path'] = article_file_path
        
        with open('../app/assets/articles/{}'.format(article_file_path), 'w+') as outfile:
            outfile.write(str(articles_dict[key]['article_body']))
    
    return nyt_articles_meta    

In [221]:
current_nyt_articles = build_nyt_articles_dict(nyt_article_hrefs)
current_nyt_articles

By Ian Austen
By Hannah Beech
By Hannah Beech
By David Enrich
By Kirk Semple
By Clifford Krauss
By Emily Flitter and James B. Stewart
By Emma Goldberg
By Cara Giaimo
By Becky Ferreira
By Katy Lederer
By Isabel Kershner
By JoAnna Klein
By Nicholas Wade
By Claudia Dreifus
By Cara Giaimo
By Carlotta Gall and Mauricio Lima
By Kenneth Chang
By Gina Kolata
By Veronique Greenwood
By Donald G. McNeil Jr.
By Cade Metz
By Emily S. Rueb and Mariel Padilla
By Michael Corkery and Sapna Maheshwari
By Stanley Reed
By Peter Eavis
By Thomas Fuller and Kendra Pierre
By Carl Zimmer
By James Gorman
By Veronique Greenwood
By Nicholas St. Fleur
By Tyler Kepner
By Benjamin Hoffman
By Benjamin Hoffman
By Sopan Deb
By Marc Stein
By Corban Goble
By John Branch
By Alan Blinder
By Andrew Das
By Liam Stack
By Edward Wong
By Judi Ketteler
By The Associated Press
By Javier C. Hernández
By Jamie Tarabay and Matthew Abbott
By Bryant Rousseau
By Stephen Castle and Mark Landler
By Raphael Minder
By Jason Horowitz
By Cey

{'Ohio-hijab-runner': {'article_body': '<p class="css-exrw3m evys1bk0">Noor Alexandria Abukaram has played three high school sports since she began wearing a hijab in 2016. But she said that it was not until last weekend — after her seventh cross-country race of the season — that she learned she wasn’t allowed to run in her head scarf without special permission. </p><p class="css-exrw3m evys1bk0">The decision by the Ohio High School Athletic Association official to disqualify Ms. Abukaram, 16, last Saturday was met with outrage on Thursday, one day after <a class="css-1g7m0tk" href="https://www.facebook.com/Zobaidaf/posts/2486333541463584?__xts__[0]=68.ARAYzFCcR57BoI25rIFWRkkLrNGtKazmoVzFW6YA8MGwYaLR-s0nKyvNYKI5fJE7kgTHflwDIcyEtlav8DFi0Onk9c1Lxs59ot6-8QERF2RPLOEQLTWNhhChOabqY75ukhXSlq3zh0UbTexPrNGqRfoTyE9wu-Du58ZEjjvKee_2lt4UDKmYsOmhSiJvGKmgosP1oKvMqT3alwOA&amp;__tn__=-R" rel="noopener noreferrer" target="_blank" title="">she wrote about the episode</a> on Facebook. In an interview, sh

In [222]:
nyt_articles_meta = write_nyt_articles_files(current_nyt_articles)

In [223]:
print([nyt_articles_meta[k]['author'] for k in nyt_articles_meta.keys()])

['By Ian Austen', 'By Hannah Beech', 'By Hannah Beech', 'By David Enrich', 'By Kirk Semple', 'By Clifford Krauss', 'By Emily Flitter and James B. Stewart', 'By Emma Goldberg', 'By Cara Giaimo', 'By Becky Ferreira', 'By Katy Lederer', 'By Isabel Kershner', 'By JoAnna Klein', 'By Nicholas Wade', 'By Claudia Dreifus', 'By Cara Giaimo', 'By Carlotta Gall and Mauricio Lima', 'By Kenneth Chang', 'By Gina Kolata', 'By Veronique Greenwood', 'By Donald G. McNeil Jr.', 'By Cade Metz', 'By Emily S. Rueb and Mariel Padilla', 'By Michael Corkery and Sapna Maheshwari', 'By Stanley Reed', 'By Peter Eavis', 'By Thomas Fuller and Kendra Pierre', 'By Carl Zimmer', 'By James Gorman', 'By Veronique Greenwood', 'By Nicholas St. Fleur', 'By Tyler Kepner', 'By Benjamin Hoffman', 'By Benjamin Hoffman', 'By Sopan Deb', 'By Marc Stein', 'By Corban Goble', 'By John Branch', 'By Alan Blinder', 'By Andrew Das', 'By Liam Stack', 'By Edward Wong', 'By Judi Ketteler', 'By The Associated Press', 'By Javier C. Hernánde

#### Build Database

In [224]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

In [225]:
create_table_sql = (
                'DROP TABLE IF EXISTS articles;'
                'CREATE TABLE articles ('
                'id SERIAL PRIMARY KEY,'
                'title text NOT NULL,'
                'author text NOT NULL,'
                'date text NOT NULL,'
                'section text NOT NULL,'
                'href text NOT NULL,'
                'path text NOT NULL);'
                )

In [226]:
cursor.execute(create_table_sql)
cursor.close()
conn.commit()

In [227]:
def create_insert_commands(nyt_articles_meta):
    keys = nyt_articles_meta.keys()
    for key in keys:
        article = nyt_articles_meta[key]
        insert_SQL = (
            "INSERT INTO articles "
            "(title, author, "
            "date, section, href, path) "
            "VALUES (%s, %s, %s, %s, %s, %s);")
        values = [article['title'], article['author'], article['date'], 
                      article['section'], article['href'], article['path']]
        yield (insert_SQL, values)

In [228]:
insert_commands = list(create_insert_commands(nyt_articles_meta))

In [229]:
print(insert_commands[2])

('INSERT INTO articles (title, author, date, section, href, path) VALUES (%s, %s, %s, %s, %s, %s);', ['Massacred at Home, in Misery Abroad, 730,000 Rohingya Are Mired in Hopelessness', 'By Hannah Beech', '2019-08-22', 'world', 'nytimes.com/2019/08/22/world/asia/rohingya-myanmar-repatriation.html', 'rohingya-myanmar-repatriation.html'])


In [230]:
def execute_inserts(insert_commands):
    conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
    cursor = conn.cursor()

    for insert_command in insert_commands:
        try:
            cursor.execute(insert_command[0], insert_command[1])
        except Exception as e:
            print(e)
        
    cursor.close()
    conn.commit()

In [231]:
execute_inserts(insert_commands)

In [232]:
delete_command = ( "DELETE FROM articles a USING articles b "
    "WHERE a.id > b.id "
    "AND a.title = b.title AND a.author = b.author;")

In [233]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()
cursor.execute(delete_command)

In [234]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()
cursor.execute('SELECT DISTINCT ON (title, author) * FROM articles')

rows = cursor.fetchall()
for row in rows[:600]:
    print(row)

(76, "'All of Them Are Thieves': Iraqis Defy Security Forces to Protest Corruption", 'By Alissa J. Rubin', '2019-10-25', 'world', 'nytimes.com/2019/10/25/world/middleeast/iraq-protests.html', 'iraq-protests.html')
(26, "'It's Definitely Pretty Empty': Why Saving WeWork Will Be Hard", 'By Peter Eavis', '2019-10-24', 'business', 'nytimes.com/2019/10/24/business/wework-growth.html', 'wework-growth.html')
(62, "'Let the Viejos Play': In the World Series, Age Is an Asset", 'By James Wagner', '2019-10-25', 'sports', 'nytimes.com/2019/10/25/sports/nationals-astros-world-series.html', 'nationals-astros-world-series.html')
(47, '200 Dispatches: Odd Animals, Offbeat Childhoods, Celebrity Origins and Extreme Sports', 'By Bryant Rousseau', '2019-10-24', 'world', 'nytimes.com/2019/10/24/world/dispatches-odd-offbeat-international-news.html', 'dispatches-odd-offbeat-international-news.html')
(75, '3 More Arrests After 39 Bodies Are Found in Truck in U.K.', 'By Megan Specia and Sui', '2019-10-25', 'wo