# Guardian - Iteration 2


## Import Packages

In [63]:
# load .env File in Environment
from dotenv import load_dotenv
# operating system interfaces - get Environment-variables
import os
# Connect to API
import requests
# create Pandas Data Frame
import pandas as pd
import datetime as dt
from functools import wraps
load_dotenv()

True

### Set Params

In [64]:
# Suffix used by create_csv for the output file name
# (articles_partial_<iteration>.csv.zip).
# NOTE(review): the notebook title says "Iteration 2" but this value is 1 —
# confirm which is intended; an earlier export may be overwritten otherwise.
iteration = 1

# Guardian Content API search endpoint.
content_url = "https://content.guardianapis.com/search"
# Query parameters sent with every page request; get_df adds "page" per request.
all_content_params = {
    "page-size": "50",
    "show-fields": "all",
    "show-tags": "all",
    "from-date": "2022-01-01",
    "to-date": "2022-07-01",
    "order-by": "newest",
    # Read from the environment (the .env file is loaded by load_dotenv() in
    # the import cell). os.getenv returns None when the variable is unset.
    "api-key": os.getenv("GUARDIAN_API_KEY1"),
}


In [65]:
def log_step(func):
    """Decorator that reports the result shape and run time of a pipeline step.

    The decorated callable is invoked with its arguments untouched. Once it
    returns, a short report containing the function name, ``result.shape`` and
    the elapsed wall-clock time is printed, and the result is passed back to
    the caller. The result must therefore expose a ``shape`` attribute.
    """

    @wraps(func)
    def timed(*args, **kwargs):
        started = dt.datetime.now()
        outcome = func(*args, **kwargs)
        elapsed = dt.datetime.now() - started
        print(f"{func.__name__}:\n shape={outcome.shape} took {elapsed}s\n")
        return outcome

    return timed

### Create DataFrame function


In [66]:
def get_df(url: str, params: dict, timeout: float = 60):
    """Download every result page of a paginated API query into one DataFrame.

    The first response supplies the total page count; the remaining pages are
    then requested one by one. Progress (page number and time per request) is
    printed while the download runs.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters. The dict is not modified; the ``page`` parameter is
        added to a per-request copy.
    timeout : float, default 60
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``response.results`` across all pages.

    Raises
    ------
    SystemExit
        If a request fails, returns an HTTP error status, or the payload does
        not have the expected ``response`` structure. The original exception
        is attached as ``__cause__``.
    """
    # Local import: only needed to mask the API key in the printed URL.
    from urllib.parse import quote_plus

    api_key = params.get("api-key")
    all_results = []
    current_page = 1
    total_pages = 1  # replaced by the real page count after the first response
    while current_page <= total_pages:
        tic = dt.datetime.now()
        # Build the query on a copy so the caller's dict is left untouched.
        query = {**params, "page": current_page}
        try:
            r = requests.get(url, params=query, timeout=timeout)
            # Check the HTTP status before reading the payload, so an error
            # response is reported as such instead of as a missing JSON key.
            r.raise_for_status()
            response = r.json()["response"]
            all_results.extend(response["results"])
        except Exception as err:
            # Abort instead of silently continuing with missing pages.
            raise SystemExit(err) from err

        if current_page == 1:
            total_pages = response["pages"]
            # Never echo the API key: notebook outputs get saved and shared.
            shown_url = r.url
            if api_key:
                for secret in (str(api_key), quote_plus(str(api_key))):
                    shown_url = shown_url.replace(secret, "***")
            print("---- API STATUS ---- ")
            print("status",  response["status"])
            print("URL: ", shown_url)
            print("total",  response["total"])
            print("startIndex",  response["startIndex"])
            print("pageSize",  response["pageSize"])
            print("pages",  response["pages"])
            print("orderBy",  response["orderBy"])
            print("---- RUNTIME STATUS ---- ")

        time_taken = str(dt.datetime.now() - tic)
        print(f"({current_page}/{total_pages}) in {time_taken}s")

        current_page += 1

    return pd.DataFrame(all_results)

### API Request

In [67]:
# Download all result pages for the configured query into one raw DataFrame.
# get_df issues one HTTP request per page, so this cell is slow to re-run.
init_df = get_df(content_url, all_content_params)

---- API STATUS ---- 
status ok
URL:  https://content.guardianapis.com/search?page-size=50&show-fields=all&show-tags=all&from-date=2022-01-01&to-date=2022-07-01&order-by=newest&api-key=<REDACTED>&page=1
status ok
total 33630
startIndex 1
pageSize 50
pages 673
orderBy newest
---- RUNTIME STATUS ---- 
(1/673) in 0:00:03.051623s
(2/673) in 0:00:07.323041s
(3/673) in 0:00:02.567628s
(4/673) in 0:00:01.516966s
(5/673) in 0:00:01.878448s
(6/673) in 0:00:02.293652s
(7/673) in 0:00:01.660556s
(8/673) in 0:00:01.105341s
(9/673) in 0:00:05.733809s
(10/673) in 0:00:02.039132s
(11/673) in 0:00:01.389787s
(12/673) in 0:00:05.934949s
(13/673) in 0:00:01.890570s
(14/673) in 0:00:01.310731s
(15/673) in 0:00:02.599618s
(16/673) in 0:00:01.401670s
(17/673) in 0:00:02.219573s
(18/673) in 0:00:01.903518s
(19/673) in 0:00:18.007636s
(20/673) in 0:00:12.699922s
(21/673) in 0:00:01.456071s
(22/673) in 0:00:01.558830s
(23/673) in 0:00:01.108362s
(24/673) in 0:00:01.369928s
(25/673) i

In [68]:
@log_step
def init_pipeline(df):
    """Start the pipeline on a copy of ``df`` and return that copy."""
    working_copy = df.copy()
    return working_copy

@log_step
def unfold_columns(df):
    """Flatten the nested ``fields`` and ``tags`` columns.

    Each entry of ``fields`` is expanded with ``pd.json_normalize`` and its
    keys are appended as new columns. From ``tags`` (an iterable of dicts per
    row) the ``webTitle`` and ``id`` values are collected into the list
    columns ``tagWebTitle`` and ``tagId``. The source columns ``fields`` and
    ``tags`` are removed from the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``fields`` column and a ``tags`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the flattened columns.
    """
    dict_cols = ["fields"]
    for col in dict_cols:
        add_cols_df = pd.json_normalize(df[col].tolist())
        # The normalised rows are in the same order as df. Re-attach df's
        # index so the axis=1 concat (which aligns on index labels) pairs
        # each row with its own record even when df has no default RangeIndex.
        add_cols_df.index = df.index
        df = pd.concat([df, add_cols_df], axis=1)

    # Tags extraction: keep only the title and id of every tag of an article.
    tag_web_titles = df["tags"].map(lambda tags: [tag["webTitle"] for tag in tags])
    tag_ids = df["tags"].map(lambda tags: [tag["id"] for tag in tags])
    df = df.assign(tagWebTitle=tag_web_titles, tagId=tag_ids)
    return df.drop(columns=["tags", *dict_cols])
@log_step
def drop_columns(df):
    """Return ``df`` without a fixed list of columns.

    Every listed column must be present, otherwise ``DataFrame.drop`` raises
    a ``KeyError``.
    """
    unwanted = [
        "body",
        "webUrl",
        "apiUrl",
        "sectionId",
        "pillarId",
        "showAffiliateLinks",
        "bylineHtml",
        "standfirst",
        "shouldHideReaderRevenue",
        "isInappropriateForSponsorship",
        "shortUrl",
        "isPremoderated",
        "charCount",
        "shouldHideAdverts",
        "showInRelatedContent",
        "lang",
        "main",
        "thumbnail",
    ]
    return df.drop(columns=unwanted)

@log_step
def create_csv(df):
    """Write ``df`` to a zip-compressed CSV and return ``df`` unchanged.

    The file name contains the module-level ``iteration`` value; the path is
    relative to the current working directory.
    """
    target = f"../../data/partial/articles_partial_{iteration}.csv.zip"
    df.to_csv(target, compression="zip")
    return df

In [69]:
# Run the cleaning steps in order on the raw API frame. Every step is wrapped
# by log_step, so it prints its output shape and duration; the last step also
# writes the result to disk.
articles = (
    init_df
        .pipe(init_pipeline)
        .pipe(unfold_columns)
        .pipe(drop_columns)
        .pipe(create_csv)
)

init_pipeline:
 shape=(33650, 13) took 0:00:00.009978s

unfold_columns:
 shape=(33650, 48) took 0:00:08.963369s

drop_columns:
 shape=(33650, 30) took 0:00:00.019082s

create_csv:
 shape=(33650, 30) took 0:00:26.045041s



In [70]:
# Overview of the cleaned frame: column names, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33650 entries, 0 to 33649
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        33650 non-null  object
 1   type                      33650 non-null  object
 2   sectionName               33650 non-null  object
 3   webPublicationDate        33650 non-null  object
 4   webTitle                  33650 non-null  object
 5   isHosted                  33650 non-null  bool  
 6   pillarName                33404 non-null  object
 7   headline                  33650 non-null  object
 8   trailText                 33649 non-null  object
 9   byline                    33357 non-null  object
 10  wordcount                 33650 non-null  object
 11  firstPublicationDate      33646 non-null  object
 12  lastModified              33650 non-null  object
 13  liveBloggingNow           1167 non-null   object
 14  productionOffice      