In [256]:
import os, re, sys, json, pickle, requests, psycopg2
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode

In [257]:
nyt_home_link = "https://www.nytimes.com/"
nyt_home_response = requests.get(nyt_home_link, timeout=5)
nyt_home_soup = BeautifulSoup(nyt_home_response.content, "html.parser")

In [258]:
nyt_articles = nyt_home_soup.find_all("article")

In [259]:
nyt_article_refs = list()
for article in nyt_articles:
    anchors = article.find_all("a")
    for a in anchors:
        nyt_article_refs.append(a["href"])

In [260]:
nyt_article_refs = [href for href in nyt_article_refs if 'podcast' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'interactive' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'crosswords' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'weekly' not in href]

In [261]:
nyt_article_refs

['/2019/07/09/briefing/jeffrey-epstein-turkey-coco-gauff.html',
 '/2019/07/09/briefing/jeffrey-epstein-turkey-coco-gauff.html',
 '/2019/07/09/health/obamacare-appeals-court.html',
 '/2019/07/09/health/obamacare-appeals-court.html#commentsContainer',
 '/2019/07/09/health/obamacare-appeals-court.html',
 '/2019/07/08/nyregion/jeffrey-epstein-charges.html',
 '/2019/07/08/nyregion/jeffrey-epstein-charges.html#commentsContainer',
 '/2019/07/08/nyregion/jeffrey-epstein-nyc-mansion.html',
 '/2019/07/08/us/politics/william-barr-census-citizenship.html',
 '/2019/07/08/us/politics/william-barr-census-citizenship.html',
 '/2019/07/08/us/politics/william-barr-census-citizenship.html#commentsContainer',
 '/2019/07/09/us/politics/republican-women-congress.html',
 '/2019/07/09/us/politics/republican-women-congress.html#commentsContainer',
 '/2019/07/09/us/politics/tom-steyer-president.html',
 '/2019/07/09/us/politics/amy-mcgrath-mitch-mcconnell.html',
 '/2019/07/09/business/economy/recession-world-eco

s = requests.session()

response = s.get('https://nytimes.com/2019/05/17/briefing/iran-sat-taiwan.html')

In [262]:
s = requests.session()
response = s.get('https://nytimes.com/{0}'.format(nyt_article_refs[1]))
soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), "html.parser")

f = open('articles/example_raw_soup.html', 'w+')
f.write(str(soup))
f.close()

In [263]:
print(soup.find('title').text)

Jeffrey Epstein, Turkey, Coco Gauff: Your Tuesday Briefing - The New York Times


In [264]:
def find_author(soup, href):
    date = '-'.join(href.split('/')[1:4])
    times = soup.find_all('time', {'datetime' : date})
    parents = [t.parent.parent.parent for t in times]
    author = ''
    for parent in parents:
        p = parent.find('p', {'itemprop' : 'author'})
        if p:
            author += p.text
    return str(author)

In [265]:
find_author(soup, str(nyt_article_refs[1]))

'By Mike Ives'

In [266]:
def collect_paragraphs(soup):
    body = soup.find_all('section', {'name': 'articleBody'})
    paragraphs = list()
    for b in body:
        paragraphs.extend(b.find_all('p'))
    paragraphs = [str(unidecode(p)) for p in paragraphs]
    paragraphs = ''.join(paragraphs)
    return paragraphs

In [267]:
collect_paragraphs(soup)

'<p class="css-exrw3m evys1bk0">(Want to get this briefing by email? Here’s the <a class="css-1g7m0tk" href="https://www.nytimes.com/morning-briefing?module=inline" title="">sign-up</a>.)</p><p class="css-ma92ss evys1bk0">Good morning.</p><p class="css-ma92ss evys1bk0">We’re covering <strong class="css-8qgvsz ebyp5n10">Jeffrey Epstein’s indictment</strong>, landmark <strong class="css-8qgvsz ebyp5n10">peace talks for Afghanistan</strong>, and <strong class="css-8qgvsz ebyp5n10">Elizabeth Warren’s fund-raising</strong>.</p><p class="css-exrw3m evys1bk0">Federal prosecutors in Manhattan have <a class="css-1g7m0tk" href="https://www.nytimes.com/2019/07/08/nyregion/jeffrey-epstein-charges.html?module=inline" title="">charged the financier with sex trafficking</a>, and they revealed that a trove of lewd photographs of girls had been discovered in a safe in his Manhattan mansion.</p><p class="css-exrw3m evys1bk0">The indictment on Monday could prompt a moment of reckoning for the Justice Dep

In [272]:
def parse_article_soup(href, soup):
    article = dict()
    entry = str(href).split('/')[-1]
    entry = entry.replace('.html', '')
    entry = entry.replace('#commentsContainer', '')
    article = dict()
    article['entry'] = entry
    article['meta'] = dict()
    article['meta']['title'] = unidecode(soup.find('title').text.replace(' - The New York Times', ''))
    article['meta']['author'] = find_author(soup, href)
    try:
        article['meta']['section'] = href.split('/')[4]
    except:
        pass
    article['meta']['date'] = "-".join(href.split('/')[1:4])
    article['meta']['href'] = str(href.split('/')[-1]).replace('#commentsContainer', '')
    article['article_body'] = collect_paragraphs(soup)
    return article

In [273]:
parse_article_soup(nyt_article_refs[1], soup)

{'article_body': '<p class="css-exrw3m evys1bk0">(Want to get this briefing by email? Here’s the <a class="css-1g7m0tk" href="https://www.nytimes.com/morning-briefing?module=inline" title="">sign-up</a>.)</p><p class="css-ma92ss evys1bk0">Good morning.</p><p class="css-ma92ss evys1bk0">We’re covering <strong class="css-8qgvsz ebyp5n10">Jeffrey Epstein’s indictment</strong>, landmark <strong class="css-8qgvsz ebyp5n10">peace talks for Afghanistan</strong>, and <strong class="css-8qgvsz ebyp5n10">Elizabeth Warren’s fund-raising</strong>.</p><p class="css-exrw3m evys1bk0">Federal prosecutors in Manhattan have <a class="css-1g7m0tk" href="https://www.nytimes.com/2019/07/08/nyregion/jeffrey-epstein-charges.html?module=inline" title="">charged the financier with sex trafficking</a>, and they revealed that a trove of lewd photographs of girls had been discovered in a safe in his Manhattan mansion.</p><p class="css-exrw3m evys1bk0">The indictment on Monday could prompt a moment of reckoning fo

In [274]:
s = requests.session()
current_nyt_articles = dict()

for href in nyt_article_refs:
    response = s.get('https://nytimes.com/{0}'.format(href))
    soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
    article = parse_article_soup(href, soup)
    current_nyt_articles[article['entry']] = article
                                   

In [276]:
keys = list(current_nyt_articles.keys())
print(current_nyt_articles[keys[0]]['meta'])

{'title': 'Jeffrey Epstein, Turkey, Coco Gauff: Your Tuesday Briefing', 'author': 'By Mike Ives', 'section': 'briefing', 'date': '2019-07-09', 'href': 'jeffrey-epstein-turkey-coco-gauff.html'}


In [280]:
nyt_articles_meta = dict()

for i, key in enumerate(keys):
    
    article_file_path = '{}.html'.format(key)
    nyt_articles_meta[key] = current_nyt_articles[key]['meta']
    nyt_articles_meta[key]['file_path'] = article_file_path
    
    with open('../app/assets/articles/{}'.format(article_file_path), 'w+') as outfile:
        outfile.write(str(current_nyt_articles[key]['article_body']))
        

In [281]:
try:
    articles_dict = pickle.load(open('articles_meta.p', 'rb'))
    articles_dict.update(nyt_articles_meta)
except:
    pass
finally:
    pickle.dump(nyt_articles_meta, open('articles_meta.p', 'ab'))

In [284]:
print(nyt_articles_meta)

{'jeffrey-epstein-turkey-coco-gauff': {'title': 'Jeffrey Epstein, Turkey, Coco Gauff: Your Tuesday Briefing', 'author': 'By Mike Ives', 'section': 'briefing', 'date': '2019-07-09', 'href': 'jeffrey-epstein-turkey-coco-gauff.html', 'file_path': 'jeffrey-epstein-turkey-coco-gauff.html'}, 'obamacare-appeals-court': {'title': 'Obamacare in Jeopardy as Appeals Court Hears Case Backed by Trump', 'author': 'By Abby Goodnough', 'section': 'health', 'date': '2019-07-09', 'href': 'obamacare-appeals-court.html', 'file_path': 'obamacare-appeals-court.html'}, 'jeffrey-epstein-charges': {'title': "Seized Photos of Nude Girls Deepens Questions About Jeffrey Epstein's 2008  Deal", 'author': 'By Ali Watkins', 'section': 'nyregion', 'date': '2019-07-08', 'href': 'jeffrey-epstein-charges.html', 'file_path': 'jeffrey-epstein-charges.html'}, 'jeffrey-epstein-nyc-mansion': {'title': "Inside Epstein's $56 Million Mansion: Photos of Bill Clinton, Woody Allen and Saudi Crown Prince", 'author': 'By Matthew Haag

#### Build Database

In [329]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

In [330]:
cursor.execute('SELECT article_id FROM articles ORDER BY article_id DESC LIMIT 1;') 

In [331]:
create_table_sql = (
                'DROP TABLE IF EXISTS articles;'
                'CREATE TABLE articles ('
                'article_id SERIAL PRIMARY KEY,'
                'article_title text NOT NULL,'
                'article_author text NOT NULL,'
                'article_date text NOT NULL,'
                'article_section text NOT NULL,'
                'article_path text NOT NULL);'
                )

In [332]:
cursor.execute(create_table_sql)
cursor.close()
conn.commit()

In [333]:
try:
    articles_dict = pickle.load(open('articles_meta.p', 'rb'))
    articles_dict.update(nyt_articles_meta)
except:
    pass

In [334]:
def create_insert_commands(nyt_articles_meta):
    keys = nyt_articles_meta.keys()
    for key in keys:
        article = nyt_articles_meta[key]
        insert_SQL = (
            "INSERT INTO articles "
            "(article_title, article_author, "
            "article_date, article_section, article_path) "
            "VALUES (%s, %s, %s, %s, %s);")
        values = [article['title'], article['author'], article['date'], 
                      article['section'], article['href']]
        yield (insert_SQL, values)

In [335]:
insert_commands = list(create_insert_commands(nyt_articles_meta))

In [336]:
print(insert_commands[2])

('INSERT INTO articles (article_title, article_author, article_date, article_section, article_path) VALUES (%s, %s, %s, %s, %s);', ["Seized Photos of Nude Girls Deepens Questions About Jeffrey Epstein's 2008  Deal", 'By Ali Watkins', '2019-07-08', 'nyregion', 'jeffrey-epstein-charges.html'])


In [337]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

for insert_command in insert_commands:
    try:
        cursor.execute(insert_command[0], insert_command[1])
    except Exception as e:
        print(e)
        
cursor.close()
conn.commit()

In [338]:
conn = psycopg2.connect(dbname='nytlitedb', user='gqe', host='localhost', password='')
cursor = conn.cursor()
cursor.execute('SELECT * FROM articles ORDER BY article_id')

rows = cursor.fetchall()
for row in rows[:600]:
    print(row)

(1, 'Jeffrey Epstein, Turkey, Coco Gauff: Your Tuesday Briefing', 'By Mike Ives', '2019-07-09', 'briefing', 'jeffrey-epstein-turkey-coco-gauff.html')
(2, 'Obamacare in Jeopardy as Appeals Court Hears Case Backed by Trump', 'By Abby Goodnough', '2019-07-09', 'health', 'obamacare-appeals-court.html')
(3, "Seized Photos of Nude Girls Deepens Questions About Jeffrey Epstein's 2008  Deal", 'By Ali Watkins', '2019-07-08', 'nyregion', 'jeffrey-epstein-charges.html')
(4, "Inside Epstein's $56 Million Mansion: Photos of Bill Clinton, Woody Allen and Saudi Crown Prince", 'By Matthew Haag', '2019-07-08', 'nyregion', 'jeffrey-epstein-nyc-mansion.html')
(5, 'Barr Says Legal Path to Census Citizenship Question Exists, but He Gives No Details', 'By Katie Benner', '2019-07-08', 'us', 'william-barr-census-citizenship.html')
(6, "'It Can't Be Worse': How Republican Women Are Trying to Rebuild", 'By Maggie Astor', '2019-07-09', 'us', 'republican-women-congress.html')
(7, 'Tom Steyer Will Run for Presiden