In [408]:
import os, re, sys, json, pickle, requests, psycopg2, numpy
from bs4 import BeautifulSoup
from unidecode import unidecode

In [257]:
# Fetch the NYT home page (5 second timeout) and parse it with the stdlib
# "html.parser" backend.
# NOTE(review): nyt_home_soup is not referenced by any later cell visible in
# this notebook; confirm it is still needed.
nyt_home_link = "https://www.nytimes.com/"
nyt_home_response = requests.get(nyt_home_link, timeout=5)
nyt_home_soup = BeautifulSoup(nyt_home_response.content, "html.parser")

In [358]:
# Prefix for section listing pages; each entry of nyt_sections_pages is
# appended to it to form the URL that collect_nyt_article_hrefs requests.
nyt_section_base_href = 'https://nytimes.com/section/'
nyt_sections_pages = ['world', 'us', 'politics', 'business', 'tech', 'science', 'health', 'sports']

In [418]:
def collect_nyt_article_hrefs(base, sections):
    """Collect article links from a set of section listing pages.

    Each page at ``base + section`` is downloaded (5 second timeout) and every
    ``<a>`` nested inside an ``<article>`` element is inspected. A link is kept
    only when its ``href`` is longer than two characters and contains none of
    the excluded fragments below (tips pages, interactives, slideshows, videos
    and absolute ``nytimes.com`` URLs), which leaves site-relative article
    paths.

    Args:
        base: URL prefix that each section name is appended to.
        sections: iterable of section names.

    Returns:
        list of href strings in page order; duplicates are NOT removed.
    """
    excluded_fragments = ('tips', 'interactive', 'slideshow', '/video/', 'nytimes.com')
    article_hrefs = list()
    for section in sections:
        response = requests.get(base + section, timeout=5)
        soup = BeautifulSoup(response.content, "html.parser")
        for article in soup.find_all('article'):
            for anchor in article.find_all('a'):
                # Anchors without an href attribute used to raise KeyError via
                # a['href']; .get() returns None for them so they are skipped.
                href = anchor.get('href')
                if not href or len(href) <= 2:
                    continue
                if any(fragment in href for fragment in excluded_fragments):
                    continue
                article_hrefs.append(href)
    return article_hrefs

In [419]:
# Scrape every configured section page, then drop duplicate links.
# numpy.unique returns the sorted unique values as a numpy array, so
# nyt_article_hrefs is an ndarray of strings from here on (not a list).
nyt_article_hrefs = collect_nyt_article_hrefs(nyt_section_base_href, nyt_sections_pages)
nyt_article_hrefs = numpy.unique(nyt_article_hrefs)

In [420]:
# Display the de-duplicated article hrefs as the cell output.
nyt_article_hrefs

array(['/2019/04/17/health/candida-auris-fungus-chicago.html',
       '/2019/04/29/health/un-drug-resistance-antibiotics.html',
       '/2019/05/02/world/americas/venezuela-maduro-hezbollah-drugs.html',
       '/2019/05/17/health/antibiotics-oranges-florida.html',
       '/2019/05/17/world/americas/colombia-farc-peace-deal.html',
       '/2019/05/17/world/americas/venezuela-economy.html',
       '/2019/05/18/world/americas/colombian-army-killings.html',
       '/2019/05/23/health/candida-auris-hospitals-ny.html',
       '/2019/06/07/health/drug-companies-antibiotics-resistance.html',
       '/2019/06/14/business/reimann-jab-nazi-keurig-krispy-kreme.html',
       '/2019/06/19/business/adidas-diversity-employees.html',
       '/2019/06/27/world/asia/afghanistan-education-girls.html',
       '/2019/06/28/business/subway-franchisees.html',
       '/2019/06/28/science/cicadas-fungus-butts.html',
       '/2019/06/30/world/europe/stalin-museum-georgia-russia.html',
       '/2019/07/01/science

In [421]:
def find_author(soup, href):
    """Return the concatenated byline text found for an article.

    The publication date is rebuilt from path segments 1-3 of ``href`` (joined
    with '-') and used to look up ``<time datetime=...>`` elements. For each
    match, the element three levels up is searched for a
    ``<p itemprop="author">``; the text of every such paragraph is appended in
    order.

    Args:
        soup: parsed article page exposing ``find_all``.
        href: article path such as '/2019/05/02/world/.../slug.html'.

    Returns:
        str with all matched byline texts joined together, '' if none.
    """
    published = '-'.join(href.split('/')[1:4])
    # Resolve all great-grandparent containers first, then search them.
    containers = [
        stamp.parent.parent.parent
        for stamp in soup.find_all('time', {'datetime' : published})
    ]
    pieces = []
    for container in containers:
        byline = container.find('p', {'itemprop' : 'author'})
        if byline:
            pieces.append(byline.text)
    return str(''.join(pieces))

In [422]:
def collect_paragraphs(soup):
    """Return the article body as one string of ASCII-transliterated <p> markup.

    Every ``<p>`` inside each ``<section name="articleBody">`` is serialized
    to its HTML markup, passed through ``unidecode`` and the results are
    concatenated with no separator.

    Args:
        soup: parsed article page exposing ``find_all``.

    Returns:
        str of concatenated paragraph markup, '' when no body section exists.
    """
    paragraphs = list()
    for body in soup.find_all('section', {'name': 'articleBody'}):
        paragraphs.extend(body.find_all('p'))
    # unidecode works on text, so each tag is converted to its markup string
    # first; previously the tag object itself was handed to unidecode.
    return ''.join(unidecode(str(p)) for p in paragraphs)

In [423]:
def parse_article_soup(soup, href):
    """Build the article record for one parsed page.

    Args:
        soup: parsed article page.
        href: site-relative path the page was fetched from, e.g.
            '/2019/05/02/world/americas/slug.html'.

    Returns:
        dict with keys:
            'entry': last path segment without '.html' / '#commentsContainer'.
            'meta': dict with 'title', 'author', 'section', 'date', 'href'.
            'article_body': string produced by collect_paragraphs.
    """
    href = str(href)
    segments = href.split('/')
    # Last path segment with the comments anchor removed, e.g. 'slug.html'.
    file_name = segments[-1].replace('#commentsContainer', '')

    # A page without a <title> element yields an empty title instead of
    # raising AttributeError on None.
    title_tag = soup.find('title')
    title_text = title_tag.text if title_tag is not None else ''

    meta = dict()
    meta['title'] = unidecode(title_text.replace(' - The New York Times', ''))
    meta['author'] = find_author(soup, href)
    # Segment 4 is the section ('/yyyy/mm/dd/<section>/...'). Short paths get
    # an empty section so the key is always present; the old bare except left
    # it missing and create_insert_commands then failed on article['section'].
    meta['section'] = segments[4] if len(segments) > 4 else ''
    meta['date'] = "-".join(segments[1:4])
    meta['href'] = file_name

    article = dict()
    article['entry'] = file_name.replace('.html', '')
    article['meta'] = meta
    article['article_body'] = collect_paragraphs(soup)
    return article

# Inline version of the fetch-and-parse loop that build_nyt_articles_dict
# performs, reusing one HTTP session for all requests.
s = requests.session()
current_nyt_articles = dict()

# The collected links live in nyt_article_hrefs (nyt_article_refs was never
# defined), and parse_article_soup takes (soup, href) in that order.
for href in nyt_article_hrefs:
    # hrefs start with '/', so strip it to avoid 'nytimes.com//2019/...'.
    response = s.get('https://nytimes.com/{0}'.format(str(href).lstrip('/')), timeout=5)
    soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
    article = parse_article_soup(soup, href)
    current_nyt_articles[article['entry']] = article


keys = list(current_nyt_articles.keys())
# Spot-check one article's metadata; guarded because fewer than 81 articles
# may have been collected.
if len(keys) > 80:
    print(current_nyt_articles[keys[80]]['meta'])

In [425]:
def build_nyt_articles_dict(hrefs):
    """Download and parse every article in ``hrefs``.

    Args:
        hrefs: iterable of site-relative article paths
            (e.g. '/2019/05/02/world/americas/slug.html').

    Returns:
        dict mapping each article's 'entry' slug to the record returned by
        parse_article_soup. Articles sharing a slug overwrite each other.

    Raises:
        requests.exceptions.RequestException: on connection failure or when a
            request exceeds the 5 second timeout.
    """
    current_nyt_articles = dict()
    base = 'https://nytimes.com/'

    for href in hrefs:
        # hrefs already start with '/', so strip it to avoid requesting
        # 'https://nytimes.com//2019/...'.
        url = base + str(href).lstrip('/')
        # Same timeout as the other requests in this notebook, so a stalled
        # connection cannot hang the whole run.
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'),
            'html.parser')
        article = parse_article_soup(soup, href)
        current_nyt_articles[article['entry']] = article

    return current_nyt_articles

In [439]:
def write_nyt_articles_files(articles_dict, output_dir='../app/assets/articles'):
    """Write each article body to '<key>.html' and return the metadata by key.

    Args:
        articles_dict: mapping of entry slug -> record with 'meta' (dict) and
            'article_body' entries.
        output_dir: directory the files are written to; created if missing.
            Defaults to the location this notebook has always used.

    Returns:
        dict mapping each key to that article's 'meta' dict. The meta dict is
        the same object stored in ``articles_dict`` and gains a 'file_path'
        entry holding the written file's name (not the full path).
    """
    os.makedirs(output_dir, exist_ok=True)
    nyt_articles_meta = dict()
    for key, article in articles_dict.items():
        article_file_path = key + '.html'
        meta = article['meta']
        meta['file_path'] = article_file_path
        nyt_articles_meta[key] = meta

        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        target = os.path.join(output_dir, article_file_path)
        with open(target, 'w', encoding='utf-8') as outfile:
            outfile.write(str(article['article_body']))

    return nyt_articles_meta

In [440]:
# Download and parse every collected article (one HTTP request per href),
# then display the resulting dict as the cell output.
# NOTE(review): the displayed dict contains full article bodies; consider
# showing only a small sample to keep the saved notebook small.
current_nyt_articles = build_nyt_articles_dict(nyt_article_hrefs)
current_nyt_articles

{'accute-flaccid-myelitis-cdc': {'article_body': '<p class="css-exrw3m evys1bk0">Last year, health officials confronted a record number of <a class="css-1g7m0tk" href="https://www.nytimes.com/2018/10/10/health/myelitis-illness-acute-flaccid.html?module=inline" title="">cases of a rare, mysterious neurological condition</a> that caused limb weakness and paralysis in more than 200 children across the country.</p><p class="css-exrw3m evys1bk0">Officials with the Centers for Disease Control and Prevention said on Tuesday that they were still trying to understand the condition, called acute flaccid myelitis, or A.F.M. And though there have been very few cases so far this year, they urged doctors to be on the lookout because the illness has tended to emerge in late summer and early fall.</p><p class="css-exrw3m evys1bk0">A.F.M. often involves sudden muscle weakness in the legs or arms and can also include stiffness in the neck, drooping eyelids or face muscles, problems swallowing and slurre

In [441]:
# Write one .html file per article and keep the per-article metadata
# (including the added 'file_path') for the database load below.
nyt_articles_meta = write_nyt_articles_files(current_nyt_articles)

#### Build Database

In [442]:
# Open a connection to the local 'nytlitedb' PostgreSQL database and a cursor
# for the table-creation statement.
# NOTE(review): database name, user, host and an empty password are
# hard-coded here; consider reading them from the environment.
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

In [443]:
# Two statements in a single string: drop any existing 'articles' table, then
# recreate it with a serial primary key and five NOT NULL text columns.
# Executing this discards all rows previously stored in 'articles'.
create_table_sql = (
                'DROP TABLE IF EXISTS articles;'
                'CREATE TABLE articles ('
                'article_id SERIAL PRIMARY KEY,'
                'article_title text NOT NULL,'
                'article_author text NOT NULL,'
                'article_date text NOT NULL,'
                'article_section text NOT NULL,'
                'article_path text NOT NULL);'
                )

In [444]:
# Recreate the articles table, persist the change, then release both the
# cursor and the connection; later cells open their own connection.
cursor.execute(create_table_sql)
conn.commit()
cursor.close()
conn.close()

In [445]:
def create_insert_commands(nyt_articles_meta):
    """Yield one parameterized INSERT command per article.

    Args:
        nyt_articles_meta: mapping of entry slug -> metadata dict providing
            'title', 'author', 'date', 'section' and 'href'.

    Yields:
        (sql, values) tuples: the INSERT statement with five %s placeholders
        and the list of values in column order.

    Raises:
        KeyError: while iterating, if a metadata dict lacks a required field.
    """
    statement = (
        "INSERT INTO articles "
        "(article_title, article_author, "
        "article_date, article_section, article_path) "
        "VALUES (%s, %s, %s, %s, %s);")
    # Metadata fields in the same order as the column list above.
    fields = ('title', 'author', 'date', 'section', 'href')
    for meta in nyt_articles_meta.values():
        yield (statement, [meta[field] for field in fields])

In [446]:
# Materialize the generator so the commands can be indexed and iterated
# more than once.
insert_commands = list(create_insert_commands(nyt_articles_meta))

In [447]:
# Spot-check one generated (sql, values) pair before touching the database.
print(insert_commands[2])

('INSERT INTO articles (article_title, article_author, article_date, article_section, article_path) VALUES (%s, %s, %s, %s, %s);', ['Secret Venezuela Files Warn About Maduro Confidant', 'By Nicholas Casey', '2019-05-02', 'world', 'venezuela-maduro-hezbollah-drugs.html'])


In [448]:
# Insert every article row, committing each one individually.
# After a failed statement PostgreSQL rejects further commands on the same
# transaction until it is rolled back, so each failure is rolled back (and
# reported) to keep one bad row from discarding all the others.
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

for insert_command in insert_commands:
    try:
        cursor.execute(insert_command[0], insert_command[1])
        conn.commit()
    except Exception as e:
        conn.rollback()
        print(e)

cursor.close()
# Release the connection; the next cell opens its own.
conn.close()

In [449]:
# Read the table back, ordered by primary key, to verify the load.
# NOTE(review): neither this cursor nor this connection is closed in the
# visible cells; confirm whether later cells rely on them before adding
# conn.close().
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()
cursor.execute('SELECT * FROM articles ORDER BY article_id')

rows = cursor.fetchall()
# Print at most the first 600 rows.
for row in rows[:600]:
    print(row)

(1, 'How a Chicago Woman Fell Victim to Candida Auris, a Drug-Resistant Fungus', 'By Matt Richtel', '2019-04-17', 'health', 'candida-auris-fungus-chicago.html')
(3, 'Secret Venezuela Files Warn About Maduro Confidant', 'By Nicholas Casey', '2019-05-02', 'world', 'venezuela-maduro-hezbollah-drugs.html')
(4, 'Citrus Farmers Facing Deadly Bacteria Turn to Antibiotics, Alarming Health Officials', 'By Andrew Jacobs', '2019-05-17', 'health', 'antibiotics-oranges-florida.html')
(5, "Colombia's Peace Deal Promised a New Era. So Why Are These Rebels Rearming?", 'By Nicholas Casey', '2019-05-17', 'world', 'colombia-farc-peace-deal.html')
(6, "Venezuela's Collapse Is the Worst Outside of War in Decades, Economists Say", 'By Anatoly Kurmanaev', '2019-05-17', 'world', 'venezuela-economy.html')
(7, "Colombia Army's New Kill Orders Send Chills Down Ranks", 'By Nicholas Casey', '2019-05-18', 'world', 'colombian-army-killings.html')
(8, 'To Fight Deadly Candida Auris, New York State Proposes New Tactic