WARNING: Don't use the API to download large quantities of data. Instead, use the bulk-download files by following these instructions: https://pro.dp.la/developers/bulk-download

## Import Libraries

In [None]:
import os
import time

from getpass import getpass
import httpx
import re
import yake
import matplotlib.pyplot as plt

## Setting up the API key variable

In [None]:
# Base endpoint of the DPLA API (v2); resource paths such as "items" are appended to it.
API_URL = "https://api.dp.la/v2/"
# Name of the environment variable the API key is read from / stored in.
ENV_VAR = "DPLA_API_KEY"
# Timeout handed to the httpx client for every request.
TIMEOUT = 30.0

In [None]:
# Prefer a key that is already present in the environment, so it never has to
# be hardcoded in the notebook.
key = os.environ.get(ENV_VAR)

if not key:
    # Fall back to a hidden prompt. The value is stored in os.environ, so it
    # stays available for the rest of the session.
    key = getpass("Enter your DPLA API key: ").strip()
    if key == "":
        raise ValueError("No API key provided.")
    os.environ[ENV_VAR] = key

print(f"API key set in environment variable {ENV_VAR}.")

## Helper Functions

In [None]:
def _join_list(x, sep="; ", keep_first_only=False):
    if isinstance(x, list):
        if keep_first_only and len(x) > 0:
            return str(x[0])
        return sep.join(str(v) for v in x if v is not None)
    return "" if x is None else str(x)

In [None]:
def top_n(d, n=10):
    """Return a new dict holding the ``n`` entries of ``d`` with the largest values.

    Entries are ordered from largest to smallest value; entries with equal
    values keep the relative order they had in ``d``.
    """
    ranked_keys = sorted(d, key=d.get, reverse=True)
    return {k: d[k] for k in ranked_keys[:n]}

In [None]:
def search_items(query, verbose=False, facets=None, **parameters):
    """
    Search DPLA items with the given query and parameters (one page of results).

    Args:
        query (str): The search query string. It's possible to use logical operators
            (AND, OR, NOT). Additionally, you can use wildcards (*) for partial matches.
        verbose (bool): If True, print the request URL. The API key is masked in the
            printed URL so it does not end up in saved notebook output.
        facets (str): Optional field name(s) to facet on, comma-separated
            (e.g., "sourceResource.date.begin,sourceResource.date.end").
        **parameters: Filter parameters, such as:
            - page_size (int): Number of results per page (default is 10, maximum is 100).
            - dataProvider (str): Filter results by one or multiple data provider.
                (e.g., "UC Santa Barbara, Library, Department of Special Research Collections")
            - provider_name (str): Filter results by a specific provider name.
                (e.g., "California Digital Library"). Sent as "provider.name".
            - resource_type (str): Filter results by item type. Available types are
                "text", "image", "sound", "moving image", "physical object".
                Lower-cased and sent as "sourceResource.type".
            - after (str): Filter results with sourceResource.date after the specified date (YYYY-MM-DD).
            - before (str): Filter results with sourceResource.date before the specified date (YYYY-MM-DD).
            - exact_field_match (str): true or false. "The exact_field_match behavior is applied to all
                specific-field parameters. It does not affect the behavior of the 'simple search' q parameter
                (which can be combined with fields, and with exact_field_match)."
            - Other facets and filters as documented in the DPLA API documentation.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status code.
    """
    params = {
        "q": query,
        "api_key": os.getenv(ENV_VAR),
    }

    if facets:
        params["facets"] = facets

    # Convenience aliases -> the parameter names the DPLA API expects.
    aliases = {
        "provider_name": "provider.name",
        "after": "sourceResource.date.after",
        "before": "sourceResource.date.before",
        "resource_type": "sourceResource.type",
    }
    for alias, api_name in aliases.items():
        value = parameters.pop(alias, None)
        if value:
            # Item types are sent lower-cased; other values are passed through as given.
            parameters[api_name] = value.lower() if alias == "resource_type" else value

    # Everything else is forwarded to the API unchanged.
    params.update(parameters)

    with httpx.Client(timeout=TIMEOUT) as client:
        response = client.get(f"{API_URL}items", params=params)

    if verbose:
        # The real URL carries the API key as a query parameter; mask it before printing.
        safe_url = response.url.copy_set_param("api_key", "REDACTED")
        print(f"Request URL: {safe_url}")

    response.raise_for_status()
    return response.json()

In [None]:
def search_all_items(query, max_items=100, sleep=0.5, verbose=False, facets=None, **parameters):
    """
    Collect up to max_items across pages.

    Args:
        query (str): The search query string. It's possible to use logical operators
            (AND, OR, NOT). Additionally, you can use wildcards (*) for partial matches.
        max_items (int): Maximum number of items to retrieve. For number of elements per page,
            use the page_size parameter in **parameters.
        sleep (float): Time to wait between requests to avoid hitting rate limits.
        verbose (bool): Passed through to search_items() for every page request.
        facets (str): Passed through to search_items() for every page request.
        **parameters: Additional facets and filter parameters as documented in the DPLA API documentation.
            page_size defaults to 100 and is capped at 100.

    Returns:
        list: The 'docs' entries of the fetched pages, truncated to max_items.
    """
    page_size = int(parameters.get("page_size", 100))
    if page_size > 100:
        page_size = 100
        print("page_size cannot exceed 100. Setting to 100.")
    # search_items() only sees what is in `parameters`, so the validated value
    # has to be written back for the default and the cap to take effect.
    parameters["page_size"] = page_size

    all_docs = []
    page = 1
    while len(all_docs) < max_items:
        parameters["page"] = page
        data = search_items(query, verbose=verbose, facets=facets, **parameters)
        docs = data.get("docs", [])
        if not docs:
            break  # No more results
        all_docs.extend(docs)

        # Stop before sleeping if the target has already been reached.
        if len(all_docs) >= max_items:
            break

        page += 1
        time.sleep(sleep)

    # The last page may overshoot max_items.
    return all_docs[:max_items]
    

Using the `search_items()` function, let's search for a page of results related to artificial intelligence using the following parameters:

- "artificial AND intelligence" as the search query
- `facets` of `"sourceResource.date.begin"` to get a breakdown of results by initial date.
- `page_size` of 25. This will return 25 results.
- `verbose` set to `True` to print out the URL being requested. (The number of results returned is printed separately by the cell below.)

In [None]:
# Exercise: fill in the blanks below (query string, facets field, page_size,
# verbose) with the values listed above before running this cell.
mysearch = search_items(
    "",
    facets="",
    page_size=,
    verbose=
    )

# This is going to help us understand the structure of the facets and how to access them in the next step.
# The response is expected to be a dict with a 'count' field; the fallback
# branch reports the length if something else (e.g. a list) comes back.
print(f"{mysearch.get('count')} results found.") if isinstance(mysearch, dict) else print(f"{len(mysearch)} results found.")
print(f"Facets:")
# Last expression of the cell: the facet entries requested above, shown as the cell output.
mysearch.get('facets').get("sourceResource.date.begin").get('entries') if isinstance(mysearch, dict) else None

## Let's play with the faceted elements

In [None]:
# Bar chart of the date facet: one bar per facet entry.

facets_list = mysearch.get('facets').get('sourceResource.date.begin').get('entries')

# Reverse the entries once so the bars are drawn in the opposite order
# from the one the API returned.
reversed_entries = facets_list[::-1]
years = [entry['time'] for entry in reversed_entries]
counts = [entry['count'] for entry in reversed_entries]

plt.bar(years, counts, color='skyblue', edgecolor='black')

plt.xlabel('Year')
plt.ylabel('Number of Items')
plt.title('Items about Artificial Intelligence by Year')

# Label only every fifth bar to keep the x axis readable.
tick_positions = range(0, len(years), 5)
plt.xticks(tick_positions, [years[pos] for pos in tick_positions], rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Periods to compare. Each entry is (period label, "START-END" year range).
# The label becomes the key in the per-period result dicts; the range is split
# on the ASCII hyphen "-" by the search loop, so keep that exact separator.
waves = [
    ("first_wave", "1955-1975"),
    ("second_wave", "1990-2005"),
    ("third_wave", "2018-2026"),
]

In [None]:
# Build one pool of search results per period, keyed by the period label.
ai_results = {}

for period_name, date_range in waves:
    print(f"Searching for AI items in the {period_name}: {date_range}")

    # "START-END" -> [START, END]
    year_bounds = date_range.split('-')
    period_items = search_all_items(
        "artificial AND intelligence",
        max_items=3000,
        facets="sourceResource.date.begin,sourceResource.date.end",
        page_size=50,
        after=year_bounds[0],
        before=year_bounds[1],
        verbose=True,
    )
    ai_results[period_name] = period_items

    print(f"Found {len(period_items)} items for period {period_name}.\n")

In [None]:
# Number of retrieved items per period.
ai_results_summary = {period: len(ai_results[period]) for period in ai_results}

print("AI Results Summary by Period:")
for period, count in ai_results_summary.items():
    print(f"{period}: {count} items")

In [None]:
# Peek at the 'sourceResource' metadata of a single record to see which fields are available.
# NOTE(review): index 400 is hard-coded; this raises IndexError when fewer than
# 401 items were retrieved for 'first_wave'.
ai_results['first_wave'][400].get('sourceResource')

In [None]:
# Drill into the same record: the 'name' of its second subject entry.
# NOTE(review): assumes the record has a 'subject' list with at least two
# dict entries — confirm; a record without them makes this line raise.
ai_results['first_wave'][400].get('sourceResource').get('subject')[1].get('name')

In [None]:
## subject pools

# For every period, count how often each subject term occurs. A subject name
# is split on runs of punctuation, and each piece is trimmed and lower-cased
# before it is counted.
ai_subjects = {}

for period, items in ai_results.items():
    subjects = {}
    for item in items:
        for subj in item.get('sourceResource', {}).get('subject', []):
            name = subj.get('name')
            if not name:
                continue  # subject entry without a usable name
            for term in re.split(r'[^\w\s]+', name):
                term = term.strip().lower()
                if term:
                    subjects[term] = subjects.get(term, 0) + 1
    ai_subjects[period] = subjects
    


In [None]:
# Most frequent subjects per period; `top` sets how many are listed.

top = 10

for period, subjects in ai_subjects.items():
    # top_n() already returns the entries ordered from most to least frequent.
    sorted_subjects = list(top_n(subjects, top).items())
    print(f"Top {top} subjects for {period}:")
    for subj, count in sorted_subjects:
        print(f"  {subj}: {count}")
    print()

In [None]:
# One horizontal bar chart per period, showing its 10 most frequent subject terms.
# squeeze=False always returns a 2-D grid of axes, so the same code works for
# any number of periods (including a single one); row 0 holds the panels.
fig, axes_grid = plt.subplots(1, len(waves), figsize=(18, 6), sharex=False, squeeze=False)
axes = axes_grid[0]

# The loop variable is deliberately not called `key`: at notebook level that
# name holds the API key and should not be overwritten by a plotting cell.
for ax, (period_name, title) in zip(axes, waves):
    data = top_n(ai_subjects[period_name], 10)
    # barh draws the first entry at the bottom, so reverse to put the most
    # frequent term at the top of the panel.
    terms = list(data.keys())[::-1]
    counts = list(data.values())[::-1]

    ax.barh(terms, counts)
    ax.set_title(title)
    ax.set_xlabel("Frequency")

plt.suptitle("How 'Artificial Intelligence' appears across time in DPLA", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# For every period, count how often each extracted keyword occurs across items.
ai_keywords = {}

# YAKE extractor: English text, n-grams of up to 2 words, 5 keywords per text.
kw_extractor = yake.KeywordExtractor(lan="en", n=2, top=5)

for period, items in ai_results.items():
    period_keywords = {}
    for item in items:
        source_res = item.get('sourceResource', {})
        # Use only the first title and the first description of each item.
        pieces = [
            _join_list(source_res.get(field, ''), keep_first_only=True)
            for field in ('title', 'description')
        ]
        text = " ".join(pieces).lower()

        # extract_keywords() yields (keyword, score) pairs; only the keyword is counted.
        for kw, _score in kw_extractor.extract_keywords(text):
            period_keywords[kw] = period_keywords.get(kw, 0) + 1

    ai_keywords[period] = period_keywords

In [None]:
# Most frequent keywords per period; `top` sets how many are listed.
top = 10

for period, keywords in ai_keywords.items():
    # top_n() already returns the entries ordered from most to least frequent.
    sorted_keywords = list(top_n(keywords, top).items())
    print(f"Top {top} keywords for {period}:")
    for kw, count in sorted_keywords:
        print(f"  {kw}: {count}")
    print()

In [None]:
# One horizontal bar chart per period, showing its 10 most frequent keywords.
# squeeze=False always returns a 2-D grid of axes, so the same code works for
# any number of periods (including a single one); row 0 holds the panels.
fig, axes_grid = plt.subplots(1, len(waves), figsize=(18, 6), sharex=False, squeeze=False)
axes = axes_grid[0]

# Panel titles show the year range with an en dash. They are derived into a
# separate list instead of reassigning `waves`: the search cell splits the
# ranges in `waves` on "-" and would fail on re-run if they were overwritten.
wave_titles = [(name, years.replace("-", "–")) for name, years in waves]

# The loop variable is deliberately not called `key`: at notebook level that
# name holds the API key and should not be overwritten by a plotting cell.
for ax, (period_name, title) in zip(axes, wave_titles):
    data = top_n(ai_keywords[period_name], 10)
    # barh draws the first entry at the bottom, so reverse to put the most
    # frequent keyword at the top of the panel.
    terms = list(data.keys())[::-1]
    counts = list(data.values())[::-1]

    ax.barh(terms, counts)
    ax.set_title(title)
    ax.set_xlabel("Frequency")

plt.suptitle("How 'Artificial Intelligence' appears across time in DPLA", fontsize=14)
plt.tight_layout()
plt.show()