Skip to content

Commit

Permalink
Merge pull request #16 from OKN-CollabNext/fetch-custom-institutions
Browse files Browse the repository at this point in the history
Adding custom institutions pipelines and data for HowardU and HBCUs
  • Loading branch information
whymath committed Apr 20, 2024
2 parents c4cb4c0 + ffffb7d commit d00083a
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
yarn.lock
node_modules/
bkp/

# Ignore python related files (cache, virtual environment, etc.)
*.pyc
*.pyo
Expand Down
37 changes: 37 additions & 0 deletions collabnext/custom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

from pyalex import Institutions, Institution
import pandas as pd
import json

def get_institutions_howardu() -> list[Institution]:
    """Fetch the institution record(s) matching the Howard University OpenAlex ID.

    Returns:
        The list produced by the pyalex ``Institutions`` query filtered on
        the hard-coded OpenAlex identifier below.
    """
    # OpenAlex identifier used for the "howardu" institution filter.
    howardu_openalex_id = "I137853757"
    howardu_query = Institutions().filter(openalex=howardu_openalex_id)
    return howardu_query.get()

def get_institutions_hbcus(dataloadtype) -> list[Institution]:
    """Return institution records for the HBCUs listed in the project data.

    Args:
        dataloadtype: ``"local"`` to read the cached records from
            ``data/institutions_hbcus.json``; ``"api"`` to search OpenAlex
            for every name in ``data/institutions_hbcus.csv``. Any other
            value, or a local load that yields nothing, also falls through
            to the API search.

    Returns:
        A list of institution records. The list is empty when both the
        local load and the API search fail; errors are printed rather than
        raised so the caller can fall back to another source.
    """
    institutions_hbcus = []

    if dataloadtype == "local":
        try:
            # Context manager closes the file even if parsing fails.
            with open("data/institutions_hbcus.json", encoding="utf-8") as json_file:
                institutions_hbcus = json.load(json_file)
        except Exception as e:
            print("Error loading HBCUs JSON data:", e)

    if dataloadtype == "api" or not institutions_hbcus:
        try:
            # Read list of HBCUs Names from Eligibility Data
            inst_df = pd.read_csv("data/institutions_hbcus.csv")
            hbcu_names = set(inst_df["name"])

            # Build search queries: lower-cased names with " &" stripped.
            # Duplicate names would only repeat an identical search, so
            # they are dropped (first-seen order is preserved).
            queries = (
                inst_df["name"]
                .str.lower()
                .str.replace(" &", "", regex=False)
                .drop_duplicates()
                .tolist()
            )

            # Track IDs already collected so each institution is added once.
            seen_ids = {inst["id"] for inst in institutions_hbcus}

            # Run API search for HBCUs and add filtered results
            for query in queries:
                institutions_query = Institutions().filter(display_name={"search": query}).get()

                for inst in institutions_query:
                    # Keep only exact display-name matches not yet collected.
                    if inst["display_name"] in hbcu_names and inst["id"] not in seen_ids:
                        print("Adding institution:", inst["display_name"])
                        institutions_hbcus.append(inst)
                        seen_ids.add(inst["id"])
        except Exception as e:
            print("Error reading HBCUs names from CSV and fetching API data:", e)

    return institutions_hbcus
2 changes: 1 addition & 1 deletion collabnext/openalex/institutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@


def get_institutions() -> list[Institution]:
    """Return five institutions, each obtained from its own ``Institutions().random()`` call."""
    # Get 5 random institutions for now
    return [Institutions().random() for _ in range(5)]
5 changes: 5 additions & 0 deletions collabnext/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

# How custom.get_institutions_hbcus obtains its data:
#   "local" - read the cached records from data/institutions_hbcus.json
#   "api"   - search OpenAlex for each name in data/institutions_hbcus.csv
DATA_LOAD_TYPE = "local"
# DATA_LOAD_TYPE = "api"
# Which custom institution set graph.sqlite.py loads:
#   "howardu" - custom.get_institutions_howardu()
#   "hbcus"   - custom.get_institutions_hbcus(DATA_LOAD_TYPE)
INSTITUTION_FILTER = "howardu"
# INSTITUTION_FILTER = "hbcus"
102 changes: 102 additions & 0 deletions data/institutions_hbcus.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name
Alabama A & M University
Alabama State University
Albany State University
Alcorn State University
Allen University
American Baptist College
Arkansas Baptist College
Benedict College
Bennett College
Bethune-Cookman University
Bishop State Community College
Bluefield State College
Bowie State University
Central State University
Charles R Drew University of Medicine and Science
Cheyney University of Pennsylvania
Claflin University
Clark Atlanta University
Clinton College
Coahoma Community College
Coppin State University
Delaware State University
Denmark Technical College
Dillard University
Edward Waters College
Elizabeth City State University
Fayetteville State University
Fisk University
Florida Agricultural and Mechanical University
Florida Memorial University
Fort Valley State University
Gadsden State Community College
Grambling State University
H Councill Trenholm State Community College
Hampton University
Harris-Stowe State University
Hinds Community College
Howard University
Huston-Tillotson University
J. F. Drake State Community and Technical College
Jackson State University
Jarvis Christian College
Johnson C Smith University
Kentucky State University
Lane College
Langston University
Lawson State Community College
Le Moyne-Owen College
Lincoln University
Lincoln University
Livingstone College
Meharry Medical College
Miles College
Mississippi Valley State University
Morehouse College
Morehouse School of Medicine
Morgan State University
Morris Brown College
Morris College
Norfolk State University
North Carolina A & T State University
North Carolina Central University
Oakwood University
Paine College
Paul Quinn College
Philander Smith College
Prairie View A & M University
Rust College
Saint Augustine's University
Savannah State University
Shaw University
Shelton State Community College
Shorter College
Simmons College of Kentucky
South Carolina State University
Southern University and A & M College
Southern University at New Orleans
Southern University at Shreveport
Southwestern Christian College
Spelman College
St Philip's College
Stillman College
Talladega College
Tennessee State University
Texas College
Texas Southern University
Tougaloo College
Tuskegee University
University of Arkansas at Pine Bluff
University of Maryland Eastern Shore
University of the District of Columbia
University of the Virgin Islands
Virginia State University
Virginia Union University
Virginia University of Lynchburg
Voorhees College
West Virginia State University
Wilberforce University
Wiley College
Winston-Salem State University
Xavier University of Louisiana
1 change: 1 addition & 0 deletions data/institutions_hbcus.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/institutions_howardu.json

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion observable/docs/data/graph.sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,20 @@
from collabnext.openalex.topics import get_work_topics
from collabnext.openalex.works import get_works_by_authors

from collabnext import settings, custom

# Get institutions: use the custom set selected in settings, and fall back to
# random institutions when no custom set is selected or the lookup fails.
institutions = []
try:
    if settings.INSTITUTION_FILTER == "howardu":
        institutions = custom.get_institutions_howardu()
    elif settings.INSTITUTION_FILTER == "hbcus":
        institutions = custom.get_institutions_hbcus(settings.DATA_LOAD_TYPE)
except Exception as e:
    # Best-effort: report the failure and let the fallback below take over.
    print("\nError getting custom institutions:", e, "\n")

# Only hit the random-institutions endpoint when nothing was loaded above.
if not institutions:
    institutions = get_institutions()

# Create nodes
institution_nodes = make_institution_nodes(institutions)
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
invoke
pyalex
python-dotenv
poetry
pandas

0 comments on commit d00083a

Please sign in to comment.