# [Scrape article URLs](#scrape-article-urls)

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import math
import os
import re
from IPython.display import display
from time import sleep
from random import randint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
%aimport src.url_extraction_helpers
from src.url_extraction_helpers import (
    generate_space_archive_url,
    generate_nytimes_api_url,
)

<a id="toc"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Retrieve Space.com metadata from archive](#retrieve-space.com-metadata-from-archive)
3. [Retrieve Guardian newspaper metadata from API](#retrieve-guardian-newpaper-metadata-from-api)
4. [Retrieve Hubble telescope metadata from archive](#retrieve-hubble-telescope-metadata-from-archive)
5. [Retrieve New York Times newspaper metadata from API](#retrieve-new-york-times-newpaper-metadata-from-api)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will use archives (Hubble and Space.com) or APIs (Guardian, New York Times) to retrieve URLs for articles in the Space and Science sections of various publications, and export each listing to a CSV file at `data/raw/*_urls.csv`.

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

We'll define below the variables to be used throughout the code.

In [None]:
# Guardian
guardian_from_date = "1950-01-01"  # YYYY-MM-DD
guardian_to_date = "2019-11-01"  # YYYY-MM-DD
guardian_section = "science"
guardian_query = "space"
guardian_start_page_num = 1
# Number of result pages to retrieve; -1 means "all available pages"
guardian_num_pages_wanted = 49
# API key is read from the environment (None if the variable is not set)
guardian_api = os.getenv("GUARDIAN_API_KEY")
# Bounds (in seconds) of the random pause between consecutive API calls
guardian_query_min_delay = 2
guardian_query_max_delay = 4

# Hubble
hubble_article_fields_available = ["name", "news_id", "url"]

# Space.com
space_com_years = list(range(1999, 2019 + 1))

# NY Times
# API key is read from the environment (None if the variable is not set)
nytimes_api = os.getenv("NYTIMES_API_KEY")
nytimes_query = "space"
nytimes_begin_date = "19500101"  # YYYYMMDD
nytimes_end_date = "20191101"  # YYYYMMDD
nytimes_start_page_num = 0
# Number of result pages to retrieve; -1 means "all available pages"
nytimes_num_pages_wanted = -1
nytimes_newspaper_lang = "en"

# Other inputs
# NOTE: paths are resolved relative to the current working directory
PROJ_ROOT_DIR = os.getcwd()
data_dir = os.path.join(PROJ_ROOT_DIR, "data", "raw")
# Output *.csv file per publication
list_of_urls_file = {
    "space": os.path.join(data_dir, "space_com_urls.csv"),
    "guardian": os.path.join(data_dir, "guardian_urls.csv"),
    "hubble": os.path.join(data_dir, "hubble_urls.csv"),
    "nytimes": os.path.join(data_dir, "nytimes_urls.csv"),
}
# Base URL of the archive/API per publication
urls = {
    "guardian": "https://content.guardianapis.com/search",
    "hubble": "http://hubblesite.org/api/v3/news?page=all",
    "space": "https://www.space.com/archive",
}

# API Query inputs (sent as the query string of the GET request)
query_params = {
    "guardian": {
        "section": guardian_section,
        "from-date": guardian_from_date,
        "to-date": guardian_to_date,
        "order-by": "oldest",
        "page-size": 100,
        "q": guardian_query,
        "api-key": guardian_api,
    },
    "hubble": {},
}

In [None]:
# guardian_url = (
#     "https://content.guardianapis.com/search?"
#     f"section={section}&"
#     f"from-date={from_date}&"
#     f"to-date={to_date}&"
#     "order-by=oldest&"
#     "page={0}&"
#     "page-size=100&"
#     f"q='{query}'&"
#     f"api-key={api}"
# )

<a id="retrieve-space.com-metadata-from-archive"></a>

## 2. [Retrieve Space.com metadata from archive](#retrieve-space.com-metadata-from-archive)

We will retrieve the URL for articles and other metadata from the blog Space.com by performing the following for each year for which listings are required
1. Programmatically assemble the archive URL(s) for the year
2. Scrape the web links for articles from each archive page and store them
3. Convert the scraped URLs into a `DataFrame`
4. Append the archive URL to the `DataFrame`
5. Concatenate the `DataFrame`s for a single year into one `DataFrame`
6. Concatenate the yearly `DataFrame`s into a single `DataFrame` covering all years
7. Export the `DataFrame` of metadata to `*.csv`

The code below walks through each step enumerated above, with numbered explanatory comments along the way

In [None]:
# Space.com urls to file
dfs_space_urls = []
# Loop over all years to be queried, assemble archive url and retrieve article details
for year in space_com_years:
    dfs_space_urls_per_year = []
    # 1. Assemble archive url(s) for this year
    space_com_archive_urls = generate_space_archive_url(year, urls["space"])
    # Loop over all archive urls to be queried and retrieve article details
    for space_com_archive_url in space_com_archive_urls:
        print(
            "Retrieving all article URLs from space.com archive at: "
            f"{space_com_archive_url}"
        )
        page_response = requests.get(space_com_archive_url, timeout=5)
        soup = BeautifulSoup(page_response.content, "lxml")
        # 2. Scrape article urls; list items without a link are skipped instead
        # of aborting the whole scrape
        article_urls = []
        for li in soup.find_all("li", {"class": "day-article"}):
            anchor = li.find("a")
            href = anchor.get("href") if anchor is not None else None
            if href:
                article_urls.append(href)
        # 3. Convert list of urls to DataFrame of urls
        # 4. Append archive url to DataFrame
        df_space_urls_all_months = pd.DataFrame({"url": article_urls}).assign(
            archive_url=space_com_archive_url
        )
        dfs_space_urls_per_year.append(df_space_urls_all_months)
    # 5. Concatenate DataFrames of yearly (single-year) urls
    df_space_urls = pd.concat(
        dfs_space_urls_per_year, axis=0, ignore_index=True
    ).drop_duplicates()
    dfs_space_urls.append(df_space_urls)
# 6. Concatenate DataFrames across all years
df_all_space_urls = pd.concat(
    dfs_space_urls, axis=0, ignore_index=True
).drop_duplicates()
display(df_all_space_urls)
# 7. Export DataFrame of metadata to *.csv file
df_all_space_urls.to_csv(list_of_urls_file["space"], index=False)

<a id="retrieve-guardian-newpaper-metadata-from-api"></a>

## 3. [Retrieve Guardian newspaper metadata from API](#retrieve-guardian-newpaper-metadata-from-api)

We will now retrieve the URL for articles and other metadata from the Guardian newspaper [API](https://open-platform.theguardian.com/documentation/) ([data explorer](https://open-platform.theguardian.com/explore/)) by doing the following for each page of results to be retrieved
1. Find maximum number of pages of results available for query
2. Set the maximum page number to be queried
3. Retrieve query response per page
4. Convert jsonified response into dictionary
5. Extract various metadata, including article URL, of converted response and store in separate dictionary
6. Convert dictionary of urls and metadata into `DataFrame`
7. Add the page number as a column of the `DataFrame`
8. Concatenate `DataFrame`s of metadata into single `DataFrame`
9. Filter `DataFrame` to retain articles and remove blogs
10. Export `DataFrame` of metadata to `*.csv`

As before, the code below walks through each of the above enumerated steps, with numbered explanatory comments per step

In [None]:
%%time
# Guardian urls to file
dfs_guardian_details = []
# Metadata attributes to keep from every search result (missing ones become None)
guardian_fields = [
    "webUrl",
    "id",
    "webPublicationDate",
    "apiUrl",
    "webTitle",
    "document_type",
    "sectionId",
    "sectionName",
    "type",
    "isHosted",
    "pillarId",
    "pillarName",
]
# 1. Find maximum number of pages of results available
guardian_max_pages_returned = requests.get(
    urls["guardian"], params=query_params["guardian"]
).json()["response"]["pages"]
# 2. Set the maximum page number to be queried (exclusive upper bound of the loop)
if guardian_num_pages_wanted == -1:
    # Pages are numbered from 1, so add 1 to include the last available page
    guardian_max_page_num = guardian_max_pages_returned + 1
    guardian_pages_to_use = "all available"
else:
    guardian_max_page_num = guardian_start_page_num + guardian_num_pages_wanted
    guardian_pages_to_use = "requested"
print(
    f"Retrieving articles from {guardian_pages_to_use} pages, "
    f"number of requested pages = {guardian_num_pages_wanted}"
)
# Loop over all pages to be queried and retrieve article details
for page in range(guardian_start_page_num, guardian_max_page_num):
    # 3. Send GET request to API and retrieve response; the page number is added
    # to a copy of the query so the shared inputs are left unchanged
    r = requests.get(urls["guardian"], params={**query_params["guardian"], "page": page})
    # 4. Get results from response attribute of jsonified response
    response_json = r.json()
    response_body = response_json.get("response", {})
    if "results" not in response_body:
        message = response_body.get("message", response_json.get("message"))
        if message == "requested page is beyond the number of available pages":
            print(f"Page {page} exceeded number of available pages. Stopping.")
            break
        # Any other response without results is reported rather than silently dropped
        print(f"Page {page}: no results in response ({message!r}). Skipping.")
        continue
    rdocs = response_body["results"]
    print(f"Page: {page}, Found: {len(rdocs)} articles")
    # 5. Extract various attributes (metadata) of response json
    guardian_records = [
        {key: rr.get(key) for key in guardian_fields} for rr in rdocs
    ]
    print(f"Retrieved {len(rdocs)} article details from page {page}")
    # 6. Convert records to DataFrame (object dtype keeps the values as returned)
    df_guardian_article = pd.DataFrame(
        guardian_records, columns=guardian_fields, dtype=object
    )
    # 7. Append page number of DataFrame
    df_guardian_article["page"] = page
    dfs_guardian_details.append(df_guardian_article)
    # Pause between pages (not needed after the last page to be queried)
    if page != guardian_max_page_num - 1:
        random_sleep_time = randint(
            guardian_query_min_delay, guardian_query_max_delay
        )
        print(
            f"Pausing for {random_sleep_time} seconds before retrieving from page {page+1}\n"
        )
        sleep(random_sleep_time)
# 8. Concatenate DataFrames across all pages
df_guardian_details = pd.concat(
    dfs_guardian_details, axis=0, ignore_index=True
).drop_duplicates()
# 9. Filter DataFrame to retain articles and remove blogs
# (na=False treats a missing url as "not a blog" so that the mask stays boolean)
df_guardian_details = df_guardian_details.loc[
    (df_guardian_details["type"] == "article")
    & (~df_guardian_details["webUrl"].str.contains("blog", na=False))
]
print(df_guardian_details.shape[0])
display(df_guardian_details)
# 10. Export DataFrame of metadata to *.csv file
df_guardian_details.to_csv(list_of_urls_file["guardian"], index=False)

<a id="retrieve-hubble-telescope-metadata-from-archive"></a>

## 4. [Retrieve Hubble telescope metadata from archive](#retrieve-hubble-telescope-metadata-from-archive)

We will now retrieve the URL for articles and other metadata from the Hubble telescope news release [API](http://hubblesite.org/api/documentation#news). A single request returns the listing for all years, so the following steps are performed once
1. Retrieve query response per page
2. Convert jsonified response into dictionary
3. Extract various metadata, one at a time, including article URL, of converted response and store in separate dictionary
4. Convert dictionary of urls and metadata into `DataFrame`
5. Concatenate `DataFrame`s of metadata into single `DataFrame`
6. Rename `DataFrame` columns

Again, the code below goes through each of the above enumerated steps, with numbered comments detailing each step

In [None]:
# Hubble urls to file - 1/2
# 1. Send GET request to API and retrieve response
r = requests.get(urls["hubble"], params=query_params["hubble"])
# 2. Get results (one entry per news release) from jsonified response
rdocs = r.json()
# 3. Extract the wanted attributes of every entry; a missing attribute becomes None
hubble_records = []
for rr in rdocs:
    hubble_records.append({key: rr.get(key) for key in hubble_article_fields_available})
    # Reported once per news release; .get() keeps a missing attribute from raising
    print(
        f"Retrieved article details for news_id: {rr.get('news_id')}, from {rr.get('url')}"
    )
# 4. Convert records to DataFrame with one column per attribute
# 5. Drop duplicated rows
# 6. Columns are named after the requested attributes
dfs_hubble_articles_details = pd.DataFrame(
    hubble_records, columns=hubble_article_fields_available, dtype=object
).drop_duplicates()
print(dfs_hubble_articles_details.shape[0])
# Every news release returned by the API is expected to appear exactly once
assert dfs_hubble_articles_details.shape[0] == len(rdocs)
dfs_hubble_articles_details.head(10)

Next, we'll get the details of each news release from a separate API endpoint. To do this, we will do the following for each `news_id` listed in the `DataFrame` above
1. Retrieve query response per `id`
2. Convert jsonified response into dictionary
3. Extract mission, publication and abstract attributes of response json and store in new columns of above `DataFrame`
4. Export `DataFrame` of metadata to `*.csv`

The code below walks through these four steps

In [None]:
# Hubble urls to file - 2/2
# Attributes retrieved from the news_release endpoint, one new column each
hubble_news_release_fields = ["mission", "publication", "abstract"]
# Append the new columns as object columns (initialised to None) so that the
# text values assigned below do not require changing the dtype of a float column
for key in hubble_news_release_fields:
    dfs_hubble_articles_details[key] = None
# Populate the new columns, one news release at a time
for index, news_id in dfs_hubble_articles_details["news_id"].items():
    # 1. Send GET request to news_release API and retrieve response
    r_news_release = requests.get(
        f"http://hubblesite.org/api/v3/news_release/{news_id}"
    )
    # 2. Get results dict from jsonified response
    rdocs_news_release = r_news_release.json()
    # 3. Extract various attributes of response json and populate the newly
    # added columns; an attribute that is missing or cannot be stored becomes None
    for key in hubble_news_release_fields:
        try:
            dfs_hubble_articles_details.loc[index, key] = rdocs_news_release[key]
        except (KeyError, TypeError, ValueError):
            dfs_hubble_articles_details.loc[index, key] = None
    print(
        f"Retrieved article {index} details for publication: {dfs_hubble_articles_details.loc[index, 'publication']}, "
        f"for mission {dfs_hubble_articles_details.loc[index, 'mission']}"
    )
print(dfs_hubble_articles_details.shape[0])
display(dfs_hubble_articles_details)
# 4. Export DataFrame of all metadata (from both API endpoints) to *.csv
dfs_hubble_articles_details.to_csv(list_of_urls_file["hubble"], index=False)

<a id="retrieve-new-york-times-newpaper-metadata-from-api"></a>

## 5. [Retrieve New York Times newspaper metadata from API](#retrieve-new-york-times-newpaper-metadata-from-api)

When querying the NY Times API, there are 2 important considerations to take into account
- for how to navigate through pages, see the [API documentation](https://developer.nytimes.com/docs/articlesearch-product/1/overview), under **Filtering Your Search** > **Pagination**
  > The Article Search API returns a max of 10 results at a time. The meta node in the response contains the total number of matches ("hits") and the current offset. Use the page query parameter to paginate thru results (page=0 for results 1-10, page=1 for 11-20, ...). You can paginate thru up to 100 pages (1,000 results). If you get too many results try filtering by date range.
- for how to stay within the API's call limits, see the [FAQs](https://developer.nytimes.com/faq#a11), under **11. Is there an API call limit?**
  > Yes, there are two rate limits per API: 4,000 requests per day and 10 requests per minute. You should sleep 6 seconds between calls to avoid hitting the per minute rate limit. If you need a higher rate limit, please contact us at code@nytimes.com.

With this in mind, we will retrieve the URL for articles and other metadata from the New York Times newspaper [API](https://developer.nytimes.com/docs/articlesearch-product/1/overview) by doing the following for each page of results to be retrieved
1. Find maximum number of pages of results available for query
2. Set the maximum page number to be queried
3. Retrieve query response per page
4. Extract various metadata, including article URL, of converted response and store in separate dictionary
5. Convert dictionary of urls and metadata into `DataFrame`
6. Add the page number as a column of the `DataFrame`
7. Concatenate `DataFrame`s of metadata into single `DataFrame`
8. Filter `DataFrame` to retain only documents of type `article`
9. Export `DataFrame` of metadata to `*.csv`

The code below walks through each step above and comments are included to point to each step along the way

In [None]:
# NY Times urls to file
dfs_nytimes_article_details = []
# Metadata attributes to keep from every search result (missing ones become None)
nytimes_fields = [
    "web_url",
    "lead_paragraph",
    "abstract",
    "snippet",
    "source",
    "document_type",
    "news_desk",
    "section_name",
    "type_of_material",
    "subsection_name",
    "word_count",
]
# URL with a placeholder for the page number (filled in with .format(page));
# it only needs to be assembled once
nytimes_url = generate_nytimes_api_url(
    nytimes_query, nytimes_begin_date, nytimes_end_date, nytimes_api
)
# 1. Find maximum number of pages of results available (10 results per page)
nytimes_max_pages_returned = math.ceil(
    requests.get(nytimes_url.format(0)).json()["response"]["meta"]["hits"] / 10
)
# 2. Set the maximum page number to be queried (exclusive upper bound of the loop)
if nytimes_num_pages_wanted == -1:
    nytimes_max_page_num = nytimes_max_pages_returned
    nytimes_pages_to_use = "all available"
else:
    nytimes_max_page_num = nytimes_start_page_num + nytimes_num_pages_wanted
    nytimes_pages_to_use = "requested"
print(
    f"Retrieving articles from {nytimes_pages_to_use} pages, "
    f"number of requested pages ({nytimes_num_pages_wanted}), "
    f"number of available pages ({nytimes_max_pages_returned})"
)
# Loop over all pages to be queried and retrieve article details
# (the upper bound already accounts for the start page, so it is not added again)
for page in range(nytimes_start_page_num, nytimes_max_page_num):
    # 3. Send GET request to API and retrieve response
    response_json = {}
    try:
        r = requests.get(nytimes_url.format(page))
        response_json = r.json()
        rdocs = response_json["response"]["docs"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        if isinstance(response_json, dict) and response_json.get("errors"):
            print(f"Requested page number ({page}) exceeds returned number of pages")
        else:
            # Only the error type is shown: the message of a request error can
            # contain the full URL, which may include the API key
            print(f"Could not retrieve page number {page} ({type(e).__name__})")
    else:
        print(f"Page: {page}, Found: {len(rdocs)}")
        # 4. Extract various attributes of response json
        nytimes_records = [
            {key: rr.get(key) for key in nytimes_fields} for rr in rdocs
        ]
        print(f"Retrieved NY Times article details from page number {page}")
        # 5. Convert records to DataFrame (object dtype keeps the values as returned)
        df_nytimes_article_details = pd.DataFrame(
            nytimes_records, columns=nytimes_fields, dtype=object
        )
        # 6. Append page number of DataFrame
        df_nytimes_article_details["page"] = page
        dfs_nytimes_article_details.append(df_nytimes_article_details)
    # Pause between calls (also after a failed call) to stay within the rate
    # limit; not needed after the last page to be queried
    if page != nytimes_max_page_num - 1:
        print(f"Pausing for 7 seconds before retrieving details from page {page+1}\n")
        sleep(7)
# 7. Concatenate DataFrames across all pages
dfs_nytimes_article_details_all = pd.concat(
    dfs_nytimes_article_details, axis=0, ignore_index=True
).drop_duplicates()
# 8. Filter DataFrame to retain articles
dfs_nytimes_article_details_all = dfs_nytimes_article_details_all.query(
    'document_type == "article"'
)
print(dfs_nytimes_article_details_all.shape[0])
display(dfs_nytimes_article_details_all)
# 9. Export DataFrame of metadata to *.csv file
dfs_nytimes_article_details_all.to_csv(list_of_urls_file["nytimes"], index=False)