Evaluate the differences between the Academic Analytics API and the Google Scholar API.

In [None]:
import requests
import os

import json
from dotenv import load_dotenv

In [None]:
# Identifiers used to look up one author in each of the sources compared below.
damon_aa_id = 279865  # Academic Analytics person id (passed to get_author)
damon_scholar_id = "MS85p6QAAAAJ"  # Google Scholar author id (sent to SerpApi as "author_id")
damon_orcid = "0000-0003-3436-3718"  # ORCID iD (used to build the ORCID works path)

## AA

In [None]:
def get_author(id: int, timeout: float = 30.0) -> dict:
    """Get an author from the academic analytics API.

    Args:
        id: Academic Analytics person id. The name shadows the builtin but is
            kept so existing keyword callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the ``/api/people/{id}`` response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """

    url = f"https://wisc.discovery.academicanalytics.com/api/people/{id}"
    # requests has no default timeout; without one an unresponsive server
    # blocks the notebook cell indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
# Fetch the Academic Analytics record, then keep only the articles that carry a DOI.
record_aa = get_author(damon_aa_id)
dois_aa = [
    doi
    for doi in (article["digitalObjectIdentifier"] for article in record_aa["articles"])
    if doi is not None
]

In [None]:
# Summarise the Academic Analytics record: author name and number of articles with a DOI.
print(f"AA: {record_aa['firstName']} {record_aa['lastName']}")
print(f"AA: Total number of articles with dois: {len(dois_aa)}")

## Scholar (via SERP)

In [None]:
# Load variables from a .env file into the environment, then read the SerpApi key from it.
load_dotenv()
SERP_API_KEY = os.getenv("SERP_API_KEY")  # None when the variable is not set

In [None]:
# Query parameters for the first page of the author's Google Scholar profile via SerpApi.
data = {
    "engine": "google_scholar_author",
    "author_id": damon_scholar_id,
    "api_key": SERP_API_KEY,
    "num": 100,  # articles requested per page
}

# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get("https://serpapi.com/search", params=data, timeout=30)
# Stop here on an HTTP error (bad key, quota exceeded, ...) instead of treating
# the error body as search results in the cells below.
response.raise_for_status()

results = response.json()

In [None]:
# open() does not create missing parent directories, so make sure the
# output folder exists before writing the first page.
os.makedirs("tmp/serp", exist_ok=True)
with open(f"tmp/serp/{damon_scholar_id}_0.json", "w") as f:
    json.dump(response.json(), f, indent=2)

Download other pages

In [None]:
# Follow SerpApi's "next" links until none is advertised, saving every page to
# tmp/serp/<scholar id>_<page index>.json (page 0 is written by the previous cell).
os.makedirs("tmp/serp", exist_ok=True)

i = 1
# .get() so an author whose results fit on one page ends the loop instead of
# raising KeyError on the missing pagination block.
next_page = results.get("serpapi_pagination", {}).get("next")
while next_page is not None:
    # Printed before the key is attached, so the secret never lands in the cell output.
    print(next_page)

    # Passing the key through `params` lets requests append and URL-encode it
    # rather than gluing it onto the URL by hand.
    response = requests.get(next_page, params={"api_key": SERP_API_KEY}, timeout=30)
    response.raise_for_status()
    results = response.json()
    # "w", not "a": appending on a re-run would leave two JSON documents in
    # the same file, which can no longer be parsed.
    with open(f"tmp/serp/{damon_scholar_id}_{i}.json", "w") as f:
        json.dump(results, f, indent=2)

    next_page = results.get("serpapi_pagination", {}).get("next")
    if next_page is not None:
        i += 1

There are 700+ articles from scholar

In [None]:
# Display the most recently fetched SerpApi payload.
results

## ORCID

ORCID
https://orcid.org/0000-0003-3436-3718

The profile has 54 works, 43 of which have proper DOIs.

In [None]:
from functools import cache


@cache
def get_oauth_token() -> str:
    """Get an OAuth token from ORCID.

    Reads ``ORCID_CLIENT_ID`` and ``ORCID_CLIENT_SECRET`` from the environment
    and exchanges them for a ``/read-public`` token using the
    client-credentials grant. The result is memoised with ``functools.cache``,
    so after one successful call no further token requests are made.

    Returns:
        The ``access_token`` string from ORCID's token response.

    Raises:
        Exception: If either credential is missing from the environment, or
            the token endpoint answers with a non-200 status.
    """

    client_id = os.getenv("ORCID_CLIENT_ID")
    client_secret = os.getenv("ORCID_CLIENT_SECRET")
    if not client_id or not client_secret:
        # Fail before touching the network: posting None credentials only
        # produces an opaque authentication failure from the server.
        raise Exception(
            "ORCID_CLIENT_ID and ORCID_CLIENT_SECRET must be set in the environment."
        )

    response = requests.post(
        "https://orcid.org/oauth/token",
        headers={"Accept": "application/json"},
        data={
            "client_id": client_id,
            "client_secret": client_secret,
            "grant_type": "client_credentials",
            "scope": "/read-public",
        },
        # requests has no default timeout; avoid hanging the cell forever.
        timeout=30,
    )

    if response.status_code != 200:
        raise Exception("Unable to get OAuth token from ORCID.")

    return response.json()["access_token"]

In [None]:
def pull(path: str, timeout: float = 30.0) -> dict:
    """Pull data from ORCID.

    Args:
        path: Resource path below ``https://pub.orcid.org/v3.0/``, for example
            ``"<orcid>/works"``. A leading ``/`` is tolerated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If ORCID answers with a non-200 status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """

    token = get_oauth_token()
    # Strip a leading slash so a path that already starts with "/" does not
    # produce a "//" in the request URL.
    endpoint = path.lstrip("/")
    response = requests.get(
        f"https://pub.orcid.org/v3.0/{endpoint}",
        headers={
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
        },
        # requests has no default timeout; avoid hanging the cell forever.
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(f"Unable to get {path} from ORCID.")
    return response.json()

In [None]:
# Fetch the author's ORCID works.
# NOTE(review): this rebinds `results`, which earlier cells use for the SerpApi payload.
results = pull(f"{damon_orcid}/works")

In [None]:
from typing import Optional
from pydantic import BaseModel, validator


class Article(BaseModel):
    """Publication metadata record produced by the parsers in this notebook.

    Every field is optional and defaults to ``None`` so that a record can be
    built from whatever subset of metadata a source provides.
    """

    author_id: Optional[str] = None
    doi: Optional[str] = None
    publication_year: Optional[int] = None
    title: Optional[str] = None
    abstract: Optional[str] = None
    cited_by: Optional[int] = None
    # Supplied by ORCIDWorkParser.parse_summary. They must be declared here:
    # pydantic ignores unknown keyword arguments by default, so without these
    # fields the values are dropped without any error.
    orcid_path: Optional[str] = None
    url: Optional[str] = None

    @property
    def text(self) -> str:
        """Text to embed.

        Returns:
            The title and abstract joined by a single space; just one of them
            when the other is missing or empty.

        Raises:
            ValueError: If both the title and the abstract are missing or empty.
        """

        # Joining only the non-empty parts avoids the stray leading space that
        # plain concatenation produced when only the abstract was present.
        parts = [part for part in (self.title, self.abstract) if part]
        if not parts:
            raise ValueError("No text to embed.")

        return " ".join(parts)


class ORCIDWorkParser:
    """Parse the raw JSON from ORCID into a list of Article objects.

    Only the first work summary of each group is considered, and only
    summaries whose type is ``"journal-article"`` are converted. The field
    extractors return ``None`` rather than raising when a node is missing or
    null in the payload.
    """

    def parse(self, works: dict) -> "list[Article]":
        """Convert a works payload into Article objects.

        Args:
            works: Decoded works response; must contain a ``"group"`` list.

        Returns:
            One Article per group whose first summary is a journal article.
        """
        outputs = []
        for group in works["group"]:
            summaries = group.get("work-summary") or []
            if not summaries:
                # Nothing to parse for this group; skip instead of raising IndexError.
                continue

            work_summary = summaries[0]
            if work_summary.get("type") == "journal-article":
                outputs.append(self.parse_summary(work_summary))
        return outputs

    def parse_summary(self, work_summary: dict) -> "Article":
        """Build an Article from a single work summary."""
        return Article(
            orcid_path=self._get_orcid_path(work_summary),
            doi=self._get_doi(work_summary),
            title=self._get_title(work_summary),
            url=self._get_url(work_summary),
            publication_year=self._get_publication_year(work_summary),
        )

    @staticmethod
    def _get_orcid_path(work_summary) -> Optional[str]:
        """Return the summary's ``path`` value, or None when absent."""
        return work_summary.get("path")

    @staticmethod
    def _get_doi(work_summary) -> Optional[str]:
        """Return the work's DOI, preferring the normalized form, or None."""
        external_ids = (work_summary.get("external-ids") or {}).get("external-id") or []
        doi_entries = [
            external_id
            for external_id in external_ids
            if external_id.get("external-id-type") == "doi"
        ]
        if not doi_entries:
            return None

        ext_doi = doi_entries[0]  # If several DOIs are listed, the first one wins

        try:
            return ext_doi["external-id-normalized"]["value"]
        except (KeyError, TypeError):
            # TypeError covers a normalized node that is present but null;
            # fall back to the raw value in both cases.
            return ext_doi.get("external-id-value")

    @staticmethod
    def _get_title(work_summary) -> Optional[str]:
        """Return the work's title, or None when any level of the title node is missing."""
        try:
            return work_summary["title"]["title"]["value"]
        except (KeyError, TypeError):
            return None

    @staticmethod
    def _get_url(work_summary) -> Optional[str]:
        """Return the work's URL, or None when the url node is missing or null."""
        try:
            return work_summary["url"]["value"]
        except (KeyError, TypeError):
            return None

    @staticmethod
    def _get_publication_year(work_summary) -> Optional[int]:
        """Return the publication year as an int, or None when unavailable."""
        try:
            # int() is inside the try so a null or non-numeric year yields None
            # instead of aborting the whole parse.
            return int(work_summary["publication-date"]["year"]["value"])
        except (KeyError, TypeError, ValueError):
            return None

In [None]:
# Convert the ORCID works payload into Article objects (the parser keeps journal articles only).
work_parser = ORCIDWorkParser()
articles = work_parser.parse(results)

In [None]:
# Number of journal articles parsed from the ORCID record.
len(articles)