Pulls all New York Times headlines from 2024. The pull can be refined by using specific topics as a key or by choosing a different time frame.

In [None]:
import os

# Configuration for the NYT archive ingestion.
#
# NOTE(review): an API key committed to source should be treated as leaked.
# Rotate it, supply the new key through the NYT_API_KEY environment variable,
# and then delete the literal fallback below. The fallback is kept only so
# that existing runs without the environment variable keep working.
API_KEY     = os.environ.get("NYT_API_KEY") or "9cLqd9jAufochxZTdf3XW0MVh4mvzGIO"
# Root of the archive endpoint; "/<year>/<month>.json" is appended per request.
BASE_URL    = "https://api.nytimes.com/svc/archive/v1"
# Delta table location that the monthly batches are appended to.
DELTA_PATH  = "/mnt/nyt/archive_yearly"
# Year whose twelve monthly archives are downloaded.
TARGET_YEAR = 2024

import requests
import json
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, hour

# Ingest one year of NYT archive data, one month at a time, into a Delta table.
#
# For each month of TARGET_YEAR the script downloads the monthly archive JSON,
# keeps a handful of article fields, derives partition columns from pub_date
# and appends the result to DELTA_PATH. After the loop the whole table is
# loaded back into `full_df`.
#
# NOTE(review): the write uses append mode, so re-running this cell for the
# same year appends the same articles again.
import time

# Upper bound on attempts for a single month when the API answers HTTP 429.
_MAX_ATTEMPTS = 5
# Fallback wait (seconds) after a 429 that carries no numeric Retry-After header.
_RATE_LIMIT_WAIT_S = 12
# (connect, read) timeouts in seconds so a stalled connection cannot hang the job.
_REQUEST_TIMEOUT_S = (10, 120)

# 1) Initialize Spark (on Databricks this is implicit)
spark = SparkSession.builder.getOrCreate()

# 2) Loop over each month of the target year
for m in range(1, 13):
    # Construct the URL for that month's archive
    url = f"{BASE_URL}/{TARGET_YEAR}/{m}.json"
    params = {"api-key": API_KEY}

    # 3) Fetch the monthly JSON (all articles for year/month).
    #    The API can answer 429 when requests arrive too quickly, so wait and
    #    retry a bounded number of times instead of failing on the first one.
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        resp = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT_S)
        if resp.status_code != 429 or attempt == _MAX_ATTEMPTS:
            break
        # Honour the server's hint when it is a plain number of seconds.
        retry_after = resp.headers.get("Retry-After", "")
        wait_s = int(retry_after) if retry_after.isdigit() else _RATE_LIMIT_WAIT_S
        time.sleep(wait_s)
    # Any remaining error status (including a final 429) is raised here.
    resp.raise_for_status()
    month_json = resp.json()

    # 4) Extract the docs array from the response envelope
    docs = month_json.get("response", {}).get("docs", [])
    if not docs:
        continue  # Skip empty months

    # 5) Parallelize and read JSON into a DataFrame
    #    (each article is re-serialized so Spark can infer the schema)
    rdd = spark.sparkContext.parallelize([json.dumps(d) for d in docs])
    df  = spark.read.json(rdd)

    # 6) Select & rename desired fields
    df_sel = df.select(
        col("headline.main").alias("headline"),
        col("pub_date").alias("pub_date"),
        col("web_url"),
        col("section_name"),
        col("news_desk"),
        col("byline.original").alias("byline")
    )

    # 7) Add partition columns (year/month/day/hour) derived from pub_date
    df_part = df_sel.withColumn("yr",  year("pub_date")) \
                    .withColumn("mo",  month("pub_date")) \
                    .withColumn("dy",  dayofmonth("pub_date")) \
                    .withColumn("hr",  hour("pub_date"))

    # 8) Write to Delta in append mode, partitioned by (yr,mo,dy,hr)
    df_part.write \
           .format("delta") \
           .mode("append") \
           .partitionBy("yr","mo","dy","hr") \
           .save(DELTA_PATH)

# 9) Load the full table (all months appended so far) for downstream use
full_df = spark.read.format("delta").load(DELTA_PATH)