Merge pull request #18 from OKN-CollabNext/fetch-custom-institutions

Refactoring API data pipeline for fetching institutions data and updating Readme
OKN-CollabNext · Apr 25, 2024 · dbea5f6 · dbea5f6
2 parents 0555e52 + 5c9036a
commit dbea5f6
Show file tree

Hide file tree

Showing 12 changed files with 152 additions and 78 deletions.
diff --git a/README.md b/README.md
@@ -14,16 +14,16 @@ pyenv install 3.11.4
 
 ### Node
 
-This code base is compatible with node 18 and above. Please use [the following instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
+This code base is compatible with node 18 and above. Please use [these instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
 to install node for your operating system if needed.
 
 ### Git
 
-Please [follow the instructions on GitHub](https://github.com/git-guides/install-git) to install git on your system.
+Please follow the [instructions on GitHub](https://github.com/git-guides/install-git) to install git on your system.
 
 ### Poetry
 
-Please [follow the instractions on the Poetry website](https://python-poetry.org/docs/#installation) to install poetry on your system.
+Please follow the [instructions on the Poetry website](https://python-poetry.org/docs/#installation) to install poetry on your system.
 
 ## Getting Started
 
@@ -45,14 +45,22 @@ You can then install project dependencies as follows:
 poetry install
 ```
 
-You need a `.env` file to store secrets as follows:
+You need a `.env` file to store secrets and other environment variables as follows:
 
 ```
 OPENALEX_EMAIL=mailto@example.com
+INSTITUTIONS_FETCH_FILTER=hbcus
+INSTITUTIONS_FETCH_COUNT=5
 ```
 
 The OPENALEX_EMAIL secret is used to [speed up calls](https://docs.openalex.org/how-to-use-the-api/api-overview) to the OpenAlex REST API.
 
+INSTITUTIONS_FETCH_FILTER (allowed values = `hbcus` or `howardu`) is used to configure which institutions will be fetched from the OpenAlex API and saved to `observable/docs/data/institutions.json`.
+
+INSTITUTIONS_FETCH_COUNT determines how many institutions will be loaded in the application.
+
+>**NOTE:** INSTITUTIONS_FETCH_FILTER and INSTITUTIONS_FETCH_COUNT are only used when running `fetch_custom_institutions.py` directly as a script. When using `invoke fetch`, the values `hbcus` and `5` are used respectively.
+
 ## Running
 
 This project uses [Observable Framework](https://observablehq.com/framework/). You can run the site locally in development mode as follows
@@ -72,13 +80,19 @@ Deployments to this project on the Observable Cloud take place through the **Dep
 
 You can run various other commands using `invoke` as follows.
 
-Deploy the site to Observable Cloud.
+Fetch data for the first 5 HBCU institutions from the OpenAlex API and save it to `observable/docs/data/institutions.json`:
+
+```bash
+invoke fetch
+```
+
+Deploy the site to Observable Cloud:
 
 ```bash
 invoke deploy
 ```
 
-Build the static web site locally.
+Build the static web site locally:
 
 ```bash
 invoke build
@@ -87,13 +101,13 @@ invoke build
 Manually cause a graph.json refresh. This is needed because currently
 observable framework doesn't notice if a dependent python module
 has been changed when developing. It only monitors changes to
-the particular page that is being displayed.
+the particular page that is being displayed:
 
 ```bash
 invoke touch
 ```
 
-Delete local git branches that have already been merged.
+Delete local git branches that have already been merged:
 
 ```bash
 invoke clean-branches

diff --git a/collabnext/custom.py b/collabnext/custom.py
diff --git a/collabnext/openalex/institutions.py b/collabnext/openalex/institutions.py
@@ -1,6 +1,20 @@
 from pyalex import Institution, Institutions
+import json
+import sys
 
 
-def get_institutions() -> list[Institution]:
-    # Get 5 random institutions for now
-    return [Institutions().random() for _ in range(5)]
+def get_institutions(
+    institutions_file_path: str = "docs/data/institutions.json",
+) -> list[Institution]:
+    """
+    Load the institutions used to build the graph.
+
+    Reads the JSON file at ``institutions_file_path``. When the file cannot be
+    read or parsed, or does not hold a non-empty JSON array, 5 random
+    institutions are fetched from the OpenAlex API instead.
+
+    Args:
+        institutions_file_path (str): JSON file path to load the institutions from
+
+    Returns:
+        list[Institution]: The institutions from the file, or 5 random ones
+    """
+    institutions = []
+
+    # Load institutions from JSON file
+    try:
+        # The context manager closes the file even when parsing fails
+        with open(institutions_file_path, encoding="utf-8") as institutions_file:
+            loaded = json.load(institutions_file)
+        # Anything other than a JSON array cannot be used as a list of institutions
+        if not isinstance(loaded, list):
+            raise ValueError("expected a JSON array of institutions")
+        institutions = loaded
+    except Exception as e:
+        # Deliberate best effort: report the problem and fall back to random institutions
+        print("\nError loading institutions from JSON file", institutions_file_path, ":", e, "\n", file=sys.stderr)
+
+    # Get 5 random institutions in case of error
+    if not institutions:
+        print("No institutions found in JSON file, fetching random institutions\n", file=sys.stderr)
+        institutions = [Institutions().random() for _ in range(5)]
+
+    return institutions
diff --git a/collabnext/settings.py b/collabnext/settings.py
diff --git a/data/institutions_hbcus.json b/data/institutions_hbcus.json
diff --git a/data/institutions_howardu.json b/data/institutions_howardu.json
diff --git a/observable/docs/data/graph.sqlite.py b/observable/docs/data/graph.sqlite.py
@@ -23,20 +23,8 @@
 from collabnext.openalex.topics import get_work_topics
 from collabnext.openalex.works import get_works_by_authors
 
-from collabnext import settings, custom
-
 # Get institutions
-institutions = []
-try:
-    if settings.INSTITUTION_FILTER == "howardu":
-        institutions = custom.get_institutions_howardu()
-    elif settings.INSTITUTION_FILTER == "hbcus":
-        institutions = custom.get_institutions_hbcus(settings.DATA_LOAD_TYPE)
-except Exception as e:
-    print("\nError getting custom institutions:", e, "\n")
-
-if institutions is None or len(institutions) == 0:
-    institutions = get_institutions()
+institutions = get_institutions()
 
 # Create nodes
 institution_nodes = make_institution_nodes(institutions)

diff --git a/observable/docs/data/institutions.json b/observable/docs/data/institutions.json
diff --git a/requirements.txt b/requirements.txt
diff --git a/scripts/fetch_custom_institutions.py b/scripts/fetch_custom_institutions.py
@@ -0,0 +1,100 @@
+
+from pyalex import Institutions, Institution
+import pandas as pd
+import json
+import os
+import sys
+
+
+def fetch_institutions_from_api(
+        institutions_fetch_filter: str = "hbcus",
+        institutions_fetch_count: int = 5,
+        institutions_names_list_path: str = "scripts/hbcus_names_list.csv",
+        save_to_file: bool = True,
+        institutions_save_path: str = "observable/docs/data/institutions.json"
+) -> list[Institution]:
+    """
+    Fetch institutions from the OpenAlex API based on the specified filter and save the data to a JSON file
+
+    With the ``howardu`` filter, Howard University is fetched by its OpenAlex ID.
+    With the ``hbcus`` filter, every name in the CSV file is searched in turn and
+    only results whose display name exactly matches a name in the CSV are kept,
+    until ``institutions_fetch_count`` institutions have been collected.
+
+    Any error is reported on stderr and not raised; whatever was collected before
+    the error is returned (but not saved).
+
+    Args:
+        institutions_fetch_filter (str): The filter to determine which institutions to fetch from the API ('hbcus' or 'howardu')
+        institutions_fetch_count (int): The number of institutions for which to fetch data (only used by the 'hbcus' filter)
+        institutions_names_list_path (str): CSV file path containing the list of HBCUs names (needs a 'name' column)
+        save_to_file (bool): Whether to save the institutions data to a JSON file
+        institutions_save_path (str): JSON file path to save the institutions data to
+
+    Returns:
+        list[Institution]: The list of institutions fetched from the API (empty if nothing was fetched)
+    """
+
+    institutions = []
+
+    try:
+        if institutions_fetch_filter == "howardu":
+            # Fetch Howard University based on OpenAlex ID
+            institutions = Institutions().filter(openalex="I137853757").get()
+            print("\nFetched institution data for Howard University")
+
+        elif institutions_fetch_filter == "hbcus":
+
+            # Read list of HBCUs Names from Eligibility Data
+            inst_df = pd.read_csv(institutions_names_list_path)
+            print("\nLoaded list of HBCUs names from:", institutions_names_list_path, "\n")
+
+            # Search queries are the lower-cased names without " &";
+            # regex=False makes the literal replacement independent of the pandas version
+            queries = inst_df["name"].str.lower().str.replace(" &", "", regex=False).tolist()
+
+            # Sets built once give constant-time membership tests inside the loops
+            hbcu_names = set(inst_df["name"].tolist())
+            hbcu_inst_ids = set()
+
+            # Run API search for each HBCU name
+            for query in queries:
+                # Break if the required number of institutions have been fetched, else proceed with search query
+                if len(institutions) >= institutions_fetch_count:
+                    break
+                institutions_query = Institutions().filter(display_name={"search": query}).get()
+
+                # Check search results for name matches and add to institutions list if not already present
+                for inst in institutions_query:
+                    if len(institutions) >= institutions_fetch_count:
+                        break
+                    if inst["display_name"] in hbcu_names and inst["id"] not in hbcu_inst_ids:
+                        print("Adding institution:", inst["display_name"])
+                        institutions.append(inst)
+                        hbcu_inst_ids.add(inst["id"])
+
+            print("\nFetched data for", len(institutions), "out of", inst_df.shape[0], "institutions\n")
+
+        else:
+            print("Invalid value of institutions_fetch_filter, make sure to set it to 'hbcus' or 'howardu' (without the quotes) in your .env file", file=sys.stderr)
+
+        # Save institutions data to JSON file if required
+        if save_to_file and len(institutions) > 0:
+            with open(institutions_save_path, "w", encoding="utf-8") as f:
+                json.dump(institutions, f)
+            print("Institutions data saved to", institutions_save_path, "\n")
+
+    except Exception as e:
+        # Deliberate best effort: report the failure and return what was collected so far
+        print("\nError fetching institutions from the API:", e, "\n", file=sys.stderr)
+
+    return institutions
+
+
+if __name__ == "__main__":
+
+    # Resolve the filter and the count independently of each other, each in this
+    # order of precedence: command-line argument, environment variable, default.
+    # NOTE(review): nothing in this script loads a .env file, so the environment
+    # variables must already be set in the process environment - confirm.
+    cli_args = sys.argv[1:]
+
+    institutions_fetch_filter = (
+        (cli_args[0] if cli_args else None)
+        or os.getenv("INSTITUTIONS_FETCH_FILTER")
+        or "hbcus"
+    )
+
+    institutions_fetch_count = 5
+    count_candidates = (
+        cli_args[1] if len(cli_args) > 1 else None,
+        os.getenv("INSTITUTIONS_FETCH_COUNT"),
+    )
+    for raw_count in count_candidates:
+        # A source that is simply not provided is not an error
+        if raw_count is None:
+            continue
+        try:
+            institutions_fetch_count = int(raw_count)
+            break
+        except ValueError:
+            print("\nIgnoring non-integer institutions fetch count:", raw_count, "\n", file=sys.stderr)
+
+    # A zero or negative count cannot be satisfied, so use the default instead
+    if institutions_fetch_count <= 0:
+        institutions_fetch_count = 5
+
+    # Make the API call to fetch data
+    institutions = fetch_institutions_from_api(institutions_fetch_filter, institutions_fetch_count)
+    print("Completed fetching institutions data from the OpenAlex API\n")
diff --git a/data/institutions_hbcus.csv → scripts/hbcus_names_list.csv b/data/institutions_hbcus.csv → scripts/hbcus_names_list.csv
@@ -1,5 +1,9 @@
 name
-Alabama A & M University
+Alabama Agricultural and Mechanical University
+Fisk University
+Howard University
+Morehouse College
+Texas Southern University
 Alabama State University
 Albany State University
 Alcorn State University
@@ -26,7 +30,6 @@ Dillard University
 Edward Waters College
 Elizabeth City State University
 Fayetteville State University
-Fisk University
 Florida Agricultural and Mechanical University
 Florida Memorial University
 Fort Valley State University
@@ -36,7 +39,6 @@ H Councill Trenholm State Community College
 Hampton University
 Harris-Stowe State University
 Hinds Community College
-Howard University
 Huston-Tillotson University
 J. F. Drake State Community and Technical College
 Jackson State University
@@ -53,7 +55,6 @@ Livingstone College
 Meharry Medical College
 Miles College
 Mississippi Valley State University
-Morehouse College
 Morehouse School of Medicine
 Morgan State University
 Morris Brown College
@@ -84,7 +85,6 @@ Stillman College
 Talladega College
 Tennessee State University
 Texas College
-Texas Southern University
 Tougaloo College
 Tuskegee University
 University of Arkansas at Pine Bluff

diff --git a/tasks.py b/tasks.py
@@ -55,3 +55,9 @@ def clean_branches(c):
 def touch(c):
     with cwd("observable/docs/data"):
         c.run("touch graph.sqlite.py")
+
+
+@task
+def fetch(c):
+    # Run the institutions fetch script from the current directory,
+    # passing the "hbcus" filter and a count of 5 as its arguments
+    fetch_command = "python scripts/fetch_custom_institutions.py hbcus 5"
+    with cwd("."):
+        c.run(fetch_command)