In [81]:
import os, re, sys, json, pickle, requests
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
nyt_home_link = "https://www.nytimes.com/"
nyt_home_response = requests.get(nyt_home_link, timeout=5)
nyt_home_soup = BeautifulSoup(nyt_home_response.content, "html.parser")

In [3]:
nyt_articles = nyt_home_soup.find_all("article")

In [4]:
nyt_article_refs = list()
for article in nyt_articles:
    anchors = article.find_all("a")
    for a in anchors:
        nyt_article_refs.append(a["href"])

In [5]:
nyt_article_refs = [href for href in nyt_article_refs if 'podcast' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'interactive' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'crosswords' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'weekly' not in href]

In [7]:
nyt_article_refs

['/2019/06/21/us/reparations-discussion.html',
 '/2019/06/21/us/reparations-discussion.html',
 'https://www.nytimes.com/tips',
 'https://www.nytimes.com/tips',
 '/2019/06/25/us/migrant-children-border.html',
 '/2019/06/24/us/politics/migrants-emergency-aid-border.html',
 '/2019/06/24/us/politics/migrants-emergency-aid-border.html#commentsContainer',
 '/2019/06/25/climate/trump-minnesota-mine.html',
 '/2019/06/25/climate/trump-minnesota-mine.html#commentsContainer',
 '/2019/06/25/climate/trump-minnesota-mine.html',
 '/2019/06/25/world/middleeast/iran-rouhani-us-sanctions.html',
 '/2019/06/25/world/middleeast/iran-rouhani-us-sanctions.html#commentsContainer',
 '/2019/06/24/world/middleeast/iran-sanctions-response.html',
 '/2019/06/24/world/middleeast/iran-sanctions-response.html#commentsContainer',
 '/2019/06/24/us/politics/jean-carroll-trump.html',
 '/2019/06/24/us/politics/jean-carroll-trump.html#commentsContainer',
 '/2019/06/24/us/politics/jean-carroll-trump.html',
 '/2019/06/25/us/p

s = requests.session()

response = s.get('https://nytimes.com/2019/05/17/briefing/iran-sat-taiwan.html')

In [135]:
s = requests.session()
response = s.get('https://nytimes.com/{0}'.format(nyt_article_refs[1]))
soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), "html.parser")

f = open('articles/example_raw_soup.html', 'w+')
f.write(str(soup))
f.close()

In [9]:
print(soup.find('title').text)

No Easy Answers on Reparations - The New York Times


In [28]:
def find_author(soup, href):
    date = '-'.join(href.split('/')[1:4])
    times = soup.find_all('time', {'datetime' : date})
    parents = [t.parent.parent.parent for t in times]
    author = ''
    for parent in parents:
        p = parent.find('p', {'itemprop' : 'author'})
        if p:
            author += p.text
    return str(author)

In [11]:
find_author(soup, str(nyt_article_refs[1]))

'By Adeel Hassan'

In [96]:
def collect_paragraphs(soup):
    body = soup.find_all('section', {'name': 'articleBody'})
    paragraphs = list()
    for b in body:
        paragraphs.extend(b.find_all('p'))
    paragraphs = [str(unidecode(p)) for p in paragraphs]
    paragraphs = ''.join(paragraphs)
    return paragraphs

In [97]:
collect_paragraphs(soup)

'<p class="css-exrw3m evys1bk0">The millennial generation attended college in a golden era for student housing, as investors poured money into luxurious off-campus communities packed with resort-style amenities: rooftop pools, golf simulators, tanning beds, climbing walls.</p><p class="css-exrw3m evys1bk0">The wow factor increased with every new development. <!-- -->Many universities amped up their campus dorms and amenities in an effort to bolster recruitment, with a few going so far as to put in “lazy rivers” for floating around pools.</p><p class="css-exrw3m evys1bk0">“It was crazy to see what was going to beat the last new thing,” said <!-- -->Dan Oltersdorf<!-- -->, a senior vice president and chief learning officer at Campus Advantage, which manages about 70 off-campus student housing communities around the country. “You were just asking, what’s next?”</p><p class="css-exrw3m evys1bk0">But as millennials move on and so-called Generation Z moves in, student housing is shifting awa

In [180]:
def parse_article_soup(href, soup):
    article = dict()
    entry = str(href).split('/')[-1]
    entry = entry.replace('.html', '')
    entry = entry.replace('#commentsContainer', '')
    article = dict()
    article['entry'] = entry
    article['meta'] = dict()
    article['meta']['title'] = unidecode(soup.find('title').text.replace(' - The New York Times', ''))
    article['meta']['author'] = find_author(soup, href)
    article['meta']['date'] = "-".join(href.split('/')[1:4])
    article['meta']['href'] = str(href).replace('#commentsContainer', '')
    article['article_body'] = collect_paragraphs(soup)
    return article

In [181]:
parse_article_soup(nyt_article_refs[1], soup)

{'article_body': '<p class="css-exrw3m evys1bk0">The millennial generation attended college in a golden era for student housing, as investors poured money into luxurious off-campus communities packed with resort-style amenities: rooftop pools, golf simulators, tanning beds, climbing walls.</p><p class="css-exrw3m evys1bk0">The wow factor increased with every new development. <!-- -->Many universities amped up their campus dorms and amenities in an effort to bolster recruitment, with a few going so far as to put in “lazy rivers” for floating around pools.</p><p class="css-exrw3m evys1bk0">“It was crazy to see what was going to beat the last new thing,” said <!-- -->Dan Oltersdorf<!-- -->, a senior vice president and chief learning officer at Campus Advantage, which manages about 70 off-campus student housing communities around the country. “You were just asking, what’s next?”</p><p class="css-exrw3m evys1bk0">But as millennials move on and so-called Generation Z moves in, student housin

In [193]:
s = requests.session()
current_nyt_articles = dict()

for href in nyt_article_refs:
    response = s.get('https://nytimes.com/{0}'.format(href))
    soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
    article = parse_article_soup(href, soup)
    current_nyt_articles[article['entry']] = article
                                   

In [194]:
keys = list(current_nyt_articles.keys())
print(current_nyt_articles[keys[0]])

{'entry': 'reparations-discussion', 'meta': {'title': 'No Easy Answers on Reparations', 'author': 'By Adeel Hassan', 'date': '2019-06-21', 'href': '/2019/06/21/us/reparations-discussion.html'}, 'article_body': '<p class="css-exrw3m evys1bk0">In the 50 years since the civil rights pioneer <a class="css-1g7m0tk" href="https://www.nytimes.com/2005/01/12/obituaries/james-forman-dies-at-76-was-pioneer-in-civil-rights.html?module=inline" title="">James Forman</a> demanded <a class="css-1g7m0tk" href="https://episcopalarchives.org/church-awakens/exhibits/show/specialgc/black-manifesto" rel="noopener noreferrer" target="_blank" title="">$500 million in reparations</a> for African-Americans from synagogues and white churches in his 1969 “Black Manifesto,” the United States has largely avoided any serious discussion of the legacy of slavery, Jim Crow and the structural racism that continues to permeate American society.</p><p class="css-exrw3m evys1bk0">This week might have evinced signs of chan

In [196]:
nyt_articles_meta = dict()

for i, key in enumerate(keys):
    article_file_path = '{0}.html'.format(key)
    nyt_articles_meta[key] = current_nyt_articles[key]['meta']
    nyt_articles_meta[key]['file_path'] = article_file_path
    with open('../app/assets/articles/{0}'.format(article_file_path), 'w+') as outfile:
        outfile.write(str(current_nyt_articles[key]['article_body']))
        

In [197]:
try:
    articles_dict = pickle.load(open('articles_meta.p', 'rb'))
    articles_dict.update(nyt_articles_meta)
except:
    pass
finally:
    pickle.dump(nyt_articles_meta, open('articles_meta.p', 'ab'))

with open('../app/assets/articles.json', 'w+') as outfile:
    json.dump(articles_dict, outfile)

In [198]:
print(articles_dict)

{'reparations-discussion': {'title': 'No Easy Answers on Reparations', 'author': 'By Adeel Hassan', 'date': '2019-06-21', 'href': '/2019/06/21/us/reparations-discussion.html', 'file_path': 'reparations-discussion.html'}, 'tips': {'title': 'Page Not Found', 'author': '', 'date': '-www.nytimes.com-tips', 'href': 'https://www.nytimes.com/tips', 'file_path': 'tips.html'}, 'migrant-children-border': {'title': "'We're in a Dark Place': Children Returned to Troubled Texas Border Facility", 'author': 'By Arturo Rubio and Caitlin Dickerson', 'date': '2019-06-25', 'href': '/2019/06/25/us/migrant-children-border.html', 'file_path': 'migrant-children-border.html'}, 'migrants-emergency-aid-border': {'title': 'Emergency Aid for Migrants Badly Divides Democrats', 'author': 'By Julie Hirschfeld Davis', 'date': '2019-06-24', 'href': '/2019/06/24/us/politics/migrants-emergency-aid-border.html', 'file_path': 'migrants-emergency-aid-border.html'}, 'trump-minnesota-mine': {'title': 'A Plan to Mine the Minn