# [Scraping news article text](#scraping-news-article-text)

In [None]:
# Load notebook extensions.
# NOTE(review): lab_black presumably auto-formats executed cells with black — confirm.
%load_ext lab_black
%load_ext autoreload
# autoreload mode 2: reload imported modules before running each cell, so edits to
# project code are picked up without restarting the kernel.
%autoreload 2

In [None]:
from glob import glob
from IPython.display import display
from pathlib import Path
from random import randint
from time import sleep, time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
%aimport src.scraping_helpers
from src.scraping_helpers import (
    get_guardian_text_from_soup,
    get_hubble_text_from_soup,
    get_space_text_from_soup,
    get_nytimes_text_from_soup,
    save_dflist_hdfs,
    append_datetime_attrs,
)

In [None]:
# Widen the pandas display limits so that large DataFrames are shown in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

<a id="toc"></a>

## [Table of Contents](#toc)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Scrape articles](#scrape-articles)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will use the files of URLs to listings retrieved in `1_get_list_of_urls.ipynb`, scrape the text of these news articles (from various publications), and store the retrieved text data in `data/raw/<publication-name>.csv`.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

We'll define below the variables that are to be used throughout the code.

In [None]:
# General inputs
# Directory (as a string) holding the url files; scraped output is written below it
data_dir = str(Path.cwd() / "data" / "raw")
# Lower/upper bounds, in seconds, of the random pause between two scraped urls
min_delay_between_scraped = 0
max_delay_between_scraped = 1

# Paths to url files and dictionaries of lists of urls
# (the nytimes entry is a glob pattern, since its urls are split over several files)
list_of_urls_file = {
    "space": Path(data_dir) / "space_com_urls.csv",
    "guardian": Path(data_dir) / "guardian_urls.csv",
    "hubble": Path(data_dir) / "hubble_urls.csv",
    "nytimes": Path(data_dir) / "nytimes_urls__*.csv",
}

# glob() returns matches in arbitrary (filesystem-dependent) order. Sort them so the
# concatenated nytimes url list - and any positional slicing or numbering that is
# based on it - is identical from one run to the next.
nytimes_url_files = sorted(glob(str(list_of_urls_file["nytimes"])))
if not nytimes_url_files:
    # Fail with a clear message instead of the opaque error pd.concat raises on []
    raise FileNotFoundError(
        f"No nytimes url files found matching {list_of_urls_file['nytimes']}"
    )

# Dictionary of lists of urls, for all publications
urls = {
    "guardian": pd.read_csv(list_of_urls_file["guardian"])["webUrl"].tolist(),
    "hubble": pd.read_csv(list_of_urls_file["hubble"])["url"].tolist(),
    "space": pd.read_csv(list_of_urls_file["space"])["url"].tolist(),
    "nytimes": pd.concat(
        [pd.read_csv(f) for f in nytimes_url_files],
        axis=0,
        ignore_index=True,
    )["web_url"].tolist(),
}

In [None]:
# Lists of urls, by publication, used during development to manually test scraping/utility code
# (these are hard-coded samples; they are not built from the url files on disk)

# Sample theguardian.com article urls
guardian_urls = [
    "https://www.theguardian.com/science/1986/jan/29/spaceexploration.columbia",
    "https://www.theguardian.com/science/1969/jul/21/spaceexploration.archive1",
    "https://www.theguardian.com/science/1999/jul/18/spaceexploration.theobserver",
    "https://www.theguardian.com/science/1999/aug/11/eclipse.uknews4",
    "https://www.theguardian.com/science/1999/aug/05/technology1",
    "https://www.theguardian.com/science/1999/jul/23/eclipse",
    "https://www.theguardian.com/science/1986/jan/30/spaceexploration.columbia",
    "https://www.theguardian.com/science/1986/jan/30/spaceexploration.columbia1",
    "https://www.theguardian.com/science/2014/nov/10/breakthrough-prize-scientists-23m-science-awards-2015",
    "https://www.theguardian.com/technology/2015/jan/16/elon-musk-falcon-9-rapid-unscheduled-disassembly",
    "https://www.theguardian.com/science/2014/oct/31/-sp-rosetta-selfie-mars-saturn-a-month-in-space-september-2014",
    "https://www.theguardian.com/science/blog/2014/oct/31/pumpkins-halloween-yeti-mental-health-polio-blogs-roundup",
    "https://www.theguardian.com/science/2014/nov/09/steven-pinker-twitter-can-hone-writing-skills",
    "https://www.theguardian.com/science/2014/nov/12/rosetta-mission-philae-historic-landing-comet",
]
# Sample news-release urls (one on webbtelescope.org, the rest on hubblesite.org)
hubble_urls = [
    "https://webbtelescope.org/contents/news-releases/2019/news-2019-41",
    "https://hubblesite.org/contents/news-releases/2018/news-2018-21.html",
    "https://hubblesite.org/contents/news-releases/2018/news-2018-03.html",
    "https://hubblesite.org/contents/news-releases/1990/news-1990-23.html",
    "https://hubblesite.org/contents/news-releases/1990/news-1990-17.html",
]
# Sample space.com article urls
space_urls = [
    "https://www.space.com/7078-shuttle-astronauts-deploy-satellites-landing.html",
    "https://www.space.com/201-explore-colors-stars.html",
    "https://www.space.com/9429-cargo-ship-delivers-healthy-halloween-treats-space-station.html",
    "https://www.space.com/37172-trappist-1-planet-visualizations-explained.html",
    "https://www.space.com/15107-venus-pleiades-april-skywatching-tips.html",
    "https://www.space.com/11521-royal-wedding-space-station-astronauts-message.html",
]
# Sample nytimes.com article urls (some carry query strings, one uses plain http)
nytimes_urls = [
    "https://www.nytimes.com/2019/11/09/us/politics/impeachment-state-department.html?action=click&module=Top%20Stories&pgtype=Homepage",
    "http://www.nytimes.com/2015/01/02/world/europe/turkey-police-thwart-attack-on-prime-ministers-office.html",
    "https://www.nytimes.com/2019/11/09/us/politics/michael-bloomberg-democrats.html?action=click&module=Top%20Stories&pgtype=Homepage",
    "https://www.nytimes.com/2019/11/04/science/space/nasa-boeing-starliner-tes.html",
]

<a id="scrape-articles"></a>

## 2. [Scrape articles](#scrape-articles)

In [None]:
# #  ========= GET SOUP FOR A SINGLE URL (DO NOT DELETE) =========
# link =
# page_response = requests.get(link, timeout=5)
# soup = BeautifulSoup(page_response.content, "lxml")
# print(soup.prettify())
# text, date = get_space_text_from_soup(soup, page_response)
# text

In [None]:
# #  ========= DATES FROM SPACE.com (DO NOT DELETE) =========
# dates = []
# for url in space_urls:
#     r = requests.get(url)
#     soup = BeautifulSoup(r.content, "lxml")
#     # print(soup.prettify())
#     published_datetime = soup.find("meta", {"name": "pub_date"}).get("content")
#     dates.append(published_datetime)
# # print(dates)
# df = pd.DataFrame(pd.Series(dates), columns=["publication_date"])
# df = append_datetime_attrs(df, date_col="publication_date")
# df

In [None]:
# # ========= APPEND SINGLE ARTICLE SCRAPE TO HDF5 FILE (DO NOT DELETE) =========
# l = []
# for site, links in urls.items():
#     # print(site)
#     for k, link in enumerate(links[:2]):
#         # print(f"{site}_{k+1}")
#         df_row = pd.DataFrame(np.random.rand(1, 9), columns=list("ABCDEFGHI"))
#         df_row["publication"] = site
#         df_row.to_hdf(h5_path, key=f"{site}_{k+1}", format="t", mode="a")
#         l.append(df_row)
# print(f"Scraped {len(l)} articles")
# pd.concat(
#     [
#         pd.read_hdf(h5_path, key=f"{site}_{k+1}")
#         for site in urls.keys()
#         for k, link in enumerate(links[:2])
#     ],
#     axis=0,
#     ignore_index=True,
# )

First, we will iterate over a Python dictionary of all the news publications and perform the following actions within each iteration:
1. Scrape page with `BeautifulSoup` and get soup
2. Get text and (optionally) date from soup
3. Store extracted contents from soup in a dictionary
4. Convert dictionary into single row `DataFrame`
5. Rename date column
6. Append publication name as column to `DataFrame`
7. Export single-row `DataFrame` to HDF file
   - this would be a single listing's details
8. Convert dictionary with all rows into `DataFrame`
   - this would be all listings' details
9. Append datetime attributes to full `DataFrame`
10. Append full `DataFrame` for publication to a list

In [None]:
cell_st = time()

# Index of the first url (per publication) to scrape in this run; all urls before it
# are skipped.
# NOTE(review): publications with fewer urls than this offset are not scraped at all,
# and the per-article HDF file numbering below restarts at 1 regardless of the offset,
# so files written by an earlier run over a different slice are overwritten
# (mode="w") - confirm this is intended.
first_url_index = 30000

# Datetime attribute columns, filled with NaN when no publication date was scraped
datetime_attr_cols = [
    "year",
    "month",
    "day",
    "dayofweek",
    "dayofyear",
    "weekofyear",
    "quarter",
]

# Main controller loop for scraping article text from url
l = []
for site, links in urls.items():
    (Path(data_dir) / site).mkdir(parents=True, exist_ok=True)
    # Slice once per publication instead of re-slicing on every iteration
    links_to_scrape = links[first_url_index:]
    num_links_to_scrape = len(links_to_scrape)
    # Maps url -> [text, publication date]; a repeated url keeps its latest scrape
    l_texts = {}
    for k, link in enumerate(links_to_scrape):
        # Reset per article, so that a failed scrape (or a publication without a
        # scraper branch below) never inherits the previous article's text or date
        text = np.nan
        date_published = np.nan
        print(f"Scraping article number {k+1} from {site}, Link: {link}")
        start_time = time()
        try:
            # 1. Get soup
            page_response = requests.get(link, timeout=5)
            soup = BeautifulSoup(page_response.content, "lxml")
            # 2. Get text (and optionally date)
            if site == "guardian":
                text = get_guardian_text_from_soup(soup)
            elif site == "hubble":
                text = get_hubble_text_from_soup(soup)
            elif site == "space":
                text, date_published = get_space_text_from_soup(soup, page_response)
            elif site == "nytimes":
                text, date_published = get_nytimes_text_from_soup(soup)
        except Exception as e:
            # Best-effort scraping: log the failure and keep the NaN placeholders
            print(f"Experienced error {str(e)} when scraping {link} from {site}")
        scrape_minutes, scrape_seconds = divmod(time() - start_time, 60)
        print(
            f"Scraping time: {int(scrape_minutes):d} minutes, {scrape_seconds:.2f} seconds"
        )

        # 3. Store text and date in dictionary
        l_texts[link] = [text, date_published]
        # 4. + 5. Convert text and date, for single listing, to a one-row DataFrame
        #         with named columns (url, text, publication_date)
        df_row = (
            pd.DataFrame.from_dict(
                {link: [text, date_published]},
                orient="index",
                columns=["text", "publication_date"],
            )
            .rename_axis("url")
            .reset_index()
        )
        # 6. Append publication name as column to DataFrame
        df_row["publication"] = site
        # 7. Store DataFrame in HDF file (one file per scraped listing)
        df_row.to_hdf(
            Path(data_dir) / site / f"scrapes_{site}_{k+1}.h5",
            key=f"{site}_{k+1}",
            format="t",
            mode="w",
        )
        # Delay between scraping urls (skipped after the last url)
        delay_between_scrapes = randint(
            min_delay_between_scraped, max_delay_between_scraped
        )
        if (k + 1) < num_links_to_scrape:
            print(f"Pausing for {delay_between_scrapes} seconds\n")
            sleep(delay_between_scrapes)

    # 8. Convert dictionary of text and date, for all listings, to DataFrame.
    #    Passing the column names explicitly keeps the "text" and "publication_date"
    #    columns present even when nothing was scraped for this publication.
    df = (
        pd.DataFrame.from_dict(
            l_texts, orient="index", columns=["text", "publication_date"]
        )
        .rename_axis("url")
        .reset_index()
    )
    df["publication"] = site
    # 9. (Optional) Append datetime attributes for space.com and nytimes publications.
    #    Skipped for an empty DataFrame, which gets the NaN placeholder columns instead.
    if site in ["space", "nytimes"] and not df.empty:
        df = append_datetime_attrs(df, date_col="publication_date", publication=site)
    else:
        for datetime_attr_col in datetime_attr_cols:
            df[datetime_attr_col] = np.nan
    # 10. Append DataFrame to list
    l.append(df)

total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

In [None]:
# # ========= LOAD SUBSET (NOT ALL) OF HDF5 FILES (DO NOT DELETE) =========
# pd.concat(
#     [
#         pd.read_hdf(Path(data_dir) / site / f"scrapes_{site}_{k+1}.h5", key=f"{site}_{k+1}")
#         for site, links in urls.items()
#         for k, link in enumerate(links[0:10])
#     ],
#     axis=0,
#     ignore_index=True,
# )

Finally, we'll concatenate the list of `DataFrame`s of scraped text data into a single `DataFrame` and export it to a separate `*.csv` file per publication

In [None]:
# Combine the per-publication DataFrames collected in the list `l`
dfs = pd.concat(l, axis=0, ignore_index=True).drop_duplicates()
# For space.com only, drop the listings for which no text was scraped. The filter is
# keyed on the "publication" column, not on a loop variable left over from another
# cell, so it no longer depends on which publication happened to be scraped last.
is_failed_space_scrape = (dfs["publication"] == "space") & dfs["text"].isnull()
dfs = dfs[~is_failed_space_scrape]
display(dfs)
print(dfs.shape)
# Export a separate csv per publication, named <publication>.csv. With a single
# publication this writes the same single file as before.
for publication, df_publication in dfs.groupby("publication", sort=False):
    df_publication.to_csv(Path(data_dir) / f"{publication}.csv", index=False)