# Guardian - FROM 01.01.2022


## Import Packages

In [12]:
# load .env File in Environment
from dotenv import load_dotenv
# operating system interfaces - get Environment-variables
import os
# Connect to API
import requests
# create Pandas Data Frame
import pandas as pd
import datetime as dt

In [13]:
# Base URL of the Guardian Content API search endpoint that the requests in
# this notebook are sent to.
content_url = "https://content.guardianapis.com/search"

## Load Variables from .env File

In [14]:
# Load the variables from the local .env file into the process environment so
# they can be read with os.getenv (the Guardian API key is fetched that way).
load_dotenv()

True

### Create DataFrame function


In [15]:
def _mask_secret(text: str, secret) -> str:
    """Return ``text`` with every occurrence of ``secret`` replaced by ``REDACTED``.

    Both the raw and the URL-encoded form of the secret are masked. A falsy
    ``secret`` (``None`` or empty string) leaves ``text`` unchanged.
    """
    from urllib.parse import quote_plus

    if not secret:
        return text
    raw = str(secret)
    # Mask the encoded form first so the raw replacement cannot split it.
    for variant in (quote_plus(raw), raw):
        text = text.replace(variant, "REDACTED")
    return text


def get_df(url: str, params: dict, timeout: float = 30):
    """Download every result page of a paginated API query into one DataFrame.

    The endpoint is requested page by page (``page=1, 2, ...``). The number of
    pages is taken from ``response.pages`` of the first reply, and the items in
    ``response.results`` of every page are collected in order.

    Args:
        url: Endpoint to query.
        params: Query parameters sent with every request. The dict is not
            modified; the ``page`` parameter is added to an internal copy.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per collected result item.

    Raises:
        SystemExit: If a request fails, returns an HTTP error status, or the
            reply does not have the expected JSON structure. The download is
            aborted instead of silently skipping the page.
    """
    # Work on a copy so the caller's dict does not end up with a "page" key.
    query = dict(params)
    api_key = query.get("api-key")

    all_results = []
    current_page = 1
    total_pages = 1
    while current_page <= total_pages:
        tic = dt.datetime.now()
        query["page"] = current_page
        try:
            r = requests.get(url, params=query, timeout=timeout)
            # Check the HTTP status before touching the body so an error page
            # is reported as such instead of as a JSON/key error.
            r.raise_for_status()
            response = r.json()["response"]  # parse the body only once
            page_results = response["results"]
        except Exception as err:
            # HTTP error messages embed the full request URL, so mask the key.
            # "from None" keeps the unmasked original out of the traceback.
            message = _mask_secret(
                f"Request for page {current_page} failed: {err}", api_key
            )
            raise SystemExit(message) from None

        all_results.extend(page_results)

        if current_page == 1:
            total_pages = response["pages"]
            print("---- API STATUS ---- ")
            # Never print the API key: cell output is saved with the notebook.
            print("URL: ", _mask_secret(r.url, api_key))
            print("status", response["status"])
            print("total", response["total"])
            print("startIndex", response["startIndex"])
            print("pageSize", response["pageSize"])
            print("pages", response["pages"])
            print("orderBy", response["orderBy"])
            print("---- RUNTIME STATUS ---- ")

        time_taken = str(dt.datetime.now() - tic)
        print(f"({current_page}/{total_pages}) in {time_taken}s")

        current_page += 1

    return pd.DataFrame(all_results)

### Set Params

In [16]:
# Query parameters sent with every search request. The order of the pairs is
# the order in which the parameters appear in the request URL.
all_content_params = dict(
    [
        ("page-size", "50"),  # results per page
        ("show-fields", ["all"]),
        ("show-tags", ["all"]),
        ("show-section", "true"),
        ("show-rights", ["all"]),
        ("from-date", "2022-01-01"),  # only content from this date onwards
        # Read from the environment (.env); None when the variable is unset.
        ("api-key", os.environ.get("GUARDIAN_API_KEY3")),
    ]
)

### API Request + create DF

In [17]:
# Download all result pages for the configured query (long-running: one HTTP
# request per page) and collect them in a single DataFrame.
articles = get_df(content_url, all_content_params)
# Persist the raw download as a zip-compressed CSV. The path is relative to
# the notebook's working directory.
# NOTE(review): the target directory presumably has to exist already — confirm.
articles.to_csv("../../data/raw/2022.csv.zip", compression="zip")

---- API STATUS ---- 
URL:  https://content.guardianapis.com/search?page-size=50&show-fields=all&show-tags=all&show-section=true&show-rights=all&from-date=2022-01-01&api-key=REDACTED&page=1
status ok
total 33055
startIndex 1
pageSize 50
pages 662
orderBy newest
---- RUNTIME STATUS ---- 
(1/662) in 0:00:02.813331s
(2/662) in 0:00:01.360675s
(3/662) in 0:00:01.382149s
(4/662) in 0:00:01.076423s
(5/662) in 0:00:01.155104s
(6/662) in 0:00:01.332178s
(7/662) in 0:00:01.342808s
(8/662) in 0:00:01.110197s
(9/662) in 0:00:01.502417s
(10/662) in 0:00:01.556040s
(11/662) in 0:00:01.420434s
(12/662) in 0:00:01.346152s
(13/662) in 0:00:01.260387s
(14/662) in 0:00:01.487219s
(15/662) in 0:00:01.377421s
(16/662) in 0:00:01.282030s
(17/662) in 0:00:01.198441s
(18/662) in 0:00:01.456923s
(19/662) in 0:00:01.564630s
(20/662) in 0:00:01.590076s
(21/662) in 0:00:01.503983s
(22/662) in 0:00:01.348167s
(23/662) in 0:00:01.078035s
(24/662) in 0:00:01.243198s
(25/662) in 0:00:01.0

In [18]:
# Print a summary of the downloaded frame: row count, columns, non-null counts
# and dtypes.
articles.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33057 entries, 0 to 33056
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  33057 non-null  object
 1   type                33057 non-null  object
 2   sectionId           33057 non-null  object
 3   sectionName         33057 non-null  object
 4   webPublicationDate  33057 non-null  object
 5   webTitle            33057 non-null  object
 6   webUrl              33057 non-null  object
 7   apiUrl              33057 non-null  object
 8   fields              33057 non-null  object
 9   tags                33057 non-null  object
 10  rights              33057 non-null  object
 11  section             33039 non-null  object
 12  isHosted            33057 non-null  bool  
 13  pillarId            32812 non-null  object
 14  pillarName          32812 non-null  object
dtypes: bool(1), object(14)
memory usage: 3.6+ MB
