In [1]:
import os, re, sys, json, pickle, requests
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
nyt_home_link = "https://www.nytimes.com/"
nyt_home_response = requests.get(nyt_home_link, timeout=5)
nyt_home_soup = BeautifulSoup(nyt_home_response.content, "html.parser")

In [3]:
nyt_articles = nyt_home_soup.find_all("article")

In [4]:
nyt_article_refs = list()
for article in nyt_articles:
    anchors = article.find_all("a")
    for a in anchors:
        nyt_article_refs.append(a["href"])

In [5]:
nyt_article_refs = [href for href in nyt_article_refs if 'podcast' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'interactive' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'crosswords' not in href]
nyt_article_refs = [href for href in nyt_article_refs if 'weekly' not in href]

In [7]:
nyt_article_refs

['/2019/06/21/us/reparations-discussion.html',
 '/2019/06/21/us/reparations-discussion.html',
 'https://www.nytimes.com/tips',
 'https://www.nytimes.com/tips',
 '/2019/06/25/us/migrant-children-border.html',
 '/2019/06/24/us/politics/migrants-emergency-aid-border.html',
 '/2019/06/24/us/politics/migrants-emergency-aid-border.html#commentsContainer',
 '/2019/06/25/climate/trump-minnesota-mine.html',
 '/2019/06/25/climate/trump-minnesota-mine.html#commentsContainer',
 '/2019/06/25/climate/trump-minnesota-mine.html',
 '/2019/06/25/world/middleeast/iran-rouhani-us-sanctions.html',
 '/2019/06/25/world/middleeast/iran-rouhani-us-sanctions.html#commentsContainer',
 '/2019/06/24/world/middleeast/iran-sanctions-response.html',
 '/2019/06/24/world/middleeast/iran-sanctions-response.html#commentsContainer',
 '/2019/06/24/us/politics/jean-carroll-trump.html',
 '/2019/06/24/us/politics/jean-carroll-trump.html#commentsContainer',
 '/2019/06/24/us/politics/jean-carroll-trump.html',
 '/2019/06/25/us/p

s = requests.session()

response = s.get('https://nytimes.com/2019/05/17/briefing/iran-sat-taiwan.html')

In [8]:
s = requests.session()
response = s.get('https://nytimes.com/{0}'.format(nyt_article_refs[1]))
soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), "html.parser")

f = open('articles/example_raw_soup.html', 'w+')
f.write(str(soup))
f.close()

In [9]:
print(soup.find('title').text)

No Easy Answers on Reparations - The New York Times


In [28]:
def find_author(soup, href):
    date = '-'.join(href.split('/')[1:4])
    times = soup.find_all('time', {'datetime' : date})
    parents = [t.parent.parent.parent for t in times]
    author = ''
    for parent in parents:
        p = parent.find('p', {'itemprop' : 'author'})
        if p:
            author += p.text
    return str(author)

In [11]:
find_author(soup, str(nyt_article_refs[1]))

'By Adeel Hassan'

In [70]:
def collect_paragraphs(soup):
    body = soup.find_all('section', {'name': 'articleBody'})
    paragraphs = list()
    for b in body:
        paragraphs.extend(b.find_all('p'))
    paragraphs = [unidecode(p) for p in paragraphs]
    return paragraphs

In [54]:
collect_paragraphs(soup)

[<p class="css-exrw3m evys1bk0">The millennial generation attended college in a golden era for student housing, as investors poured money into luxurious off-campus communities packed with resort-style amenities: rooftop pools, golf simulators, tanning beds, climbing walls.</p>,
 <p class="css-exrw3m evys1bk0">The wow factor increased with every new development. <!-- -->Many universities amped up their campus dorms and amenities in an effort to bolster recruitment, with a few going so far as to put in “lazy rivers” for floating around pools.</p>,
 <p class="css-exrw3m evys1bk0">“It was crazy to see what was going to beat the last new thing,” said <!-- -->Dan Oltersdorf<!-- -->, a senior vice president and chief learning officer at Campus Advantage, which manages about 70 off-campus student housing communities around the country. “You were just asking, what’s next?”</p>,
 <p class="css-exrw3m evys1bk0">But as millennials move on and so-called Generation Z moves in, student housing is shi

In [71]:
nyt_articles_content  = dict()
s = requests.session()
for href in nyt_article_refs:
    response = s.get('https://nytimes.com/{0}'.format(href))
    soup = BeautifulSoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
    entry = str(href).split('/')[-1].replace('.html', '').replace('#commentsContainer', '')
    title = soup.find('title').text.replace(' - The New York Times', '')
    author = find_author(soup, href)
    article_text = collect_paragraphs(soup)
    nyt_articles_content[entry] = { 'body': article_text, 'params': {'title': title, 'author': author, 'link': str(href) } }
                                   
                                   

In [72]:
print(nyt_articles_content.keys())

dict_keys(['reparations-discussion', 'tips', 'migrant-children-border', 'migrants-emergency-aid-border', 'trump-minnesota-mine', 'iran-rouhani-us-sanctions', 'iran-sanctions-response', 'jean-carroll-trump', 'cory-booker-debate', 'democrats-2020-emails', 'student-loan-debt-forgiveness', 'nigerian-food-yewande-komolafe', 'taxes-debates-eli-broad', 'brain-injury-philosophy', 'marta-harnecker-latin-american-left', 'bourdain-antarctica-climate', 'boris-johnson-election', 'supreme-court-travel-ban-trump', '', 'joe-biden-electability-2020', 'republican-states-health-care', 'artificial-intelligence-depression', 'women-sex-toys-advertisements', 'iris-murdoch-centennial', 'college-dorm-uber-amenities'])


In [73]:
keys = list(nyt_articles_content.keys())

In [80]:
for i, key in enumerate(keys):
    with open('articles/{}.html'.format(key), 'w+') as outfile:
        outfile.write(str(nyt_articles_content[key]['params']))
        outfile.write(str(nyt_articles_content[key]['body']))

In [77]:
print(nyt_articles_content[keys[4]]['params']['author'])

By Hiroko Tabuchi and Steve Eder
