In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from time import time
from urllib3.util.retry import Retry

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

In [None]:
%aimport src.scraping_helpers
from src.scraping_helpers import get_guardian_text_from_soup

In [None]:
# Configuration for the Guardian search and for where the scraped text is saved.
base_url = "https://content.guardianapis.com/search"
path_to_save_unseen_data = "data/guardian_3.csv"
guardian_from_date = "2019-11-02"
guardian_to_date = "2020-02-28"
guardian_section = "science"
guardian_query = "space"
# Credentials must not be stored in the notebook: read the API key from the
# GUARDIAN_API_KEY environment variable (None when the variable is not set).
guardian_api = os.getenv("GUARDIAN_API_KEY")
if not guardian_api:
    print(
        "WARNING: GUARDIAN_API_KEY is not set; "
        "requests to the Guardian API will not be authenticated."
    )

In [None]:
def get_unseen_links(query_params, base_url):
    """Query a search endpoint and return the matching non-blog articles.

    Parameters
    ----------
    query_params : dict
        Query-string parameters sent with the GET request.
    base_url : str
        URL of the search endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per result with the columns ``webUrl``, ``id``,
        ``webPublicationDate``, ``apiUrl`` and ``type`` (``None`` where a
        result lacks a field), restricted to rows whose ``type`` is
        ``"article"`` and whose ``webUrl`` does not contain ``"blog"``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    columns = ["webUrl", "id", "webPublicationDate", "apiUrl", "type"]

    # A timeout prevents the cell from hanging forever on a stalled connection.
    r = requests.get(base_url, params=query_params, timeout=10)
    # Fail with the HTTP error itself instead of a KeyError on "response" below.
    r.raise_for_status()
    rdocs = r.json()["response"]["results"]
    print(f"Found: {len(rdocs)} articles")

    # Fields missing from a result are recorded as None.
    records = [{key: rr.get(key) for key in columns} for rr in rdocs]
    df_guardian_article = pd.DataFrame(records, columns=columns)

    is_article = df_guardian_article["type"] == "article"
    # na=False: a missing URL counts as "not a blog"; without it the NaN in the
    # mask makes the `~` negation raise.
    is_blog = df_guardian_article["webUrl"].str.contains("blog", na=False)
    df_guardian_article = df_guardian_article.loc[is_article & ~is_blog]
    display(df_guardian_article)
    return df_guardian_article

def get_article_text(df):
    """Download every page listed in ``df["webUrl"]`` and extract its text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``webUrl`` column holding the links to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns ``url`` and ``text``, one row per distinct link. ``text`` is
        the value returned by ``get_guardian_text_from_soup`` for the parsed
        page, or NaN when the page could not be fetched or parsed.
    """
    retries = Retry(
        total=2,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
    )
    texts = {}
    # One session for the whole run, closed on exit, instead of a new
    # never-closed session per link.
    with requests.Session() as r_session:
        # Mount the retry policy for both schemes; with only "http://" mounted
        # it is never applied to https links.
        for prefix in ("http://", "https://"):
            r_session.mount(prefix, HTTPAdapter(max_retries=retries))

        for k, link in enumerate(df["webUrl"].tolist()):
            print(f"Scraping article number {k+1}, Link: {link}")
            start_time = time()
            # Default for every failure path, so a failed link never raises
            # UnboundLocalError or inherits the previous article's text.
            text = float("nan")
            try:
                page_response = r_session.get(link, timeout=5)
            except Exception as ex:
                print(f"{ex} Error connecting to {link}")
            else:
                try:
                    soup = BeautifulSoup(page_response.content, "lxml")
                except Exception as e:
                    print(f"Experienced error {str(e)} when scraping {link}")
                else:
                    text = get_guardian_text_from_soup(soup)
            scrape_minutes, scrape_seconds = divmod(time() - start_time, 60)
            print(
                f"Scraping time: {int(scrape_minutes):d} minutes, {scrape_seconds:.2f} seconds"
            )
            # Keyed by link, so a repeated link keeps only its latest result.
            texts[link] = text

    # Explicit column names keep both columns present even for an empty input.
    df_texts = pd.DataFrame(list(texts.items()), columns=["url", "text"])
    display(df_texts)
    return df_texts

In [None]:
# Parameters for the search request, listed as (name, value) pairs; the dict
# keeps them in the order written here.
query_params = dict(
    [
        ("section", guardian_section),
        ("from-date", guardian_from_date),
        ("to-date", guardian_to_date),
        ("order-by", "oldest"),
        ("page-size", 100),
        ("q", guardian_query),
        ("api-key", guardian_api),
        ("page", 1),
    ]
)
df_guardian_article = get_unseen_links(query_params=query_params, base_url=base_url)

In [None]:
# Remove exact duplicate rows, then scrape the text behind each remaining link.
df = df_guardian_article.drop_duplicates().pipe(get_article_text)

We'll now save the retrieved article text to a CSV file.

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the
# destination directory exists before writing the scraped text.
output_dir = os.path.dirname(path_to_save_unseen_data)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(path_to_save_unseen_data, index=False)