Skip to content

Commit

Permalink
Merge pull request #20 from OKN-CollabNext/authors-without-topics
Browse files Browse the repository at this point in the history
Clamp nodes and edges
  • Loading branch information
kaaloo committed Apr 26, 2024
2 parents d18f1fa + d8b2d38 commit f1a1507
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 67 deletions.
61 changes: 39 additions & 22 deletions collabnext/openalex/edges.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,10 @@
from pyalex import Author, Institution, Work
from pyalex import Author, Institution, Topic, Work


def make_associated_institution_edges(institutions: list[Institution]) -> list[dict]:
    """Build ASSOCIATED edges between institutions.

    One edge is produced for every entry in each institution's
    ``"associated_institutions"`` list, pointing from the owning institution
    to the associated one.

    Args:
        institutions: Institution records; each must provide ``"id"`` and
            ``"associated_institutions"`` (a list of records with ``"id"``).

    Returns:
        A list of edge dicts with the keys ``id``, ``start``, ``end``,
        ``label``, ``start_type`` and ``end_type``.
    """
    edges = []
    for institution in institutions:
        source_id = institution["id"]
        for associated in institution["associated_institutions"]:
            target_id = associated["id"]
            edges.append(
                {
                    "id": f"{source_id}-{target_id}",
                    "start": source_id,
                    "end": target_id,
                    "label": "ASSOCIATED",
                    "start_type": "INSTITUTION",
                    "end_type": "INSTITUTION",
                }
            )
    return edges


def make_affiliated_author_edges(authors: list[Author]) -> list[dict]:
return [
def make_author_institution_edges(
authors: list[Author], institutions: list[Institution]
) -> list[dict]:
edges = [
{
"id": f"""{x["id"]}-{y["institution"]["id"]}""",
"start": x["id"],
Expand All @@ -30,9 +17,19 @@ def make_affiliated_author_edges(authors: list[Author]) -> list[dict]:
for y in x["affiliations"]
]

# Clamp authors
author_ids = [x["id"] for x in authors]
edges = [x for x in edges if x["start"] in author_ids]

# Clamp institutions
institution_ids = [x["id"] for x in institutions]
edges = [x for x in edges if x["end"] in institution_ids]

def make_author_work_edges(works: list[Work]) -> list[dict]:
return [
return edges


def make_author_work_edges(authors: list[Author], works: list[Work]) -> list[dict]:
edges = [
{
"id": f"{work['id']}-{authorship['author']['id']}",
"start": authorship["author"]["id"],
Expand All @@ -45,9 +42,19 @@ def make_author_work_edges(works: list[Work]) -> list[dict]:
for authorship in work.get("authorships", [])
]

# Clamp authors
author_ids = [x["id"] for x in authors]
edges = [x for x in edges if x["start"] in author_ids]

# Clamp works
work_ids = [x["id"] for x in works]
edges = [x for x in edges if x["end"] in work_ids]

return edges

def make_work_topic_edges(works: list[Work]) -> list[dict]:
return [

def make_work_topic_edges(works: list[Work], topics: list[Topic]) -> list[dict]:
edges = [
{
"id": f"{work['id']}-{topic['id']}",
"start": work["id"],
Expand All @@ -59,3 +66,13 @@ def make_work_topic_edges(works: list[Work]) -> list[dict]:
for work in works
for topic in work["topics"]
]

# Clamp works
work_ids = [x["id"] for x in works]
edges = [x for x in edges if x["start"] in work_ids]

# Clamp topics
topic_ids = [x["id"] for x in topics]
edges = [x for x in edges if x["end"] in topic_ids]

return edges
30 changes: 17 additions & 13 deletions collabnext/openalex/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
def make_institution_nodes(institutions: list[Institution]) -> list[dict]:
return [
{
"id": x["id"],
"id": x["id"],
"name": x["display_name"],
"institution_type": x["type"],
"homepage": x["homepage_url"],
Expand All @@ -14,8 +14,8 @@ def make_institution_nodes(institutions: list[Institution]) -> list[dict]:
"description": None,
"subfield": None,
"domain": None,
"label": x["display_name"],
"type": "INSTITUTION"
"label": x["display_name"],
"type": "INSTITUTION",
}
for x in institutions
]
Expand All @@ -27,37 +27,38 @@ def make_author_nodes(authors: list[Author]) -> list[dict]:
"id": x["id"],
"name": x["display_name"],
"institution_type": None,
"homepage": None,
"homepage": None,
"works_count": x["works_count"],
"cited_by_count": x["cited_by_count"],
"field": None,
"description": None,
"subfield": None,
"domain": None,
"label": x["display_name"],
"type": "AUTHOR"
}
"label": x["display_name"],
"type": "AUTHOR",
}
for x in authors
]


def make_work_nodes(works: list[Work]) -> list[dict]:
    """Build one WORK node per work.

    Each node takes its ``id`` from the work's ``"id"`` and its ``label`` from
    the work's ``"title"``. Every other attribute is set to ``None``.

    Args:
        works: Work records; each must provide ``"id"`` and ``"title"``.

    Returns:
        A list of node dicts, one per input work, in input order.
    """
    return [
        {
            "id": x["id"],
            "name": None,
            "institution_type": None,
            "homepage": None,
            "works_count": None,
            "cited_by_count": None,
            "field": None,
            "description": None,
            "subfield": None,
            "domain": None,
            "label": x["title"],
            "type": "WORK",
        }
        for x in works
    ]


def make_topic_nodes(topics: list[Topic]) -> list[dict]:
Expand All @@ -81,3 +82,6 @@ def make_topic_nodes(topics: list[Topic]) -> list[dict]:
# Note that topics are grouped by field
if not (x["field"]["id"] in seen or seen.add(x["field"]["id"]))
]



11 changes: 11 additions & 0 deletions collabnext/openalex/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def clamp_author_nodes_to_edges(
    author_nodes: list[dict], edges: list[dict]
) -> list[dict]:
    """Keep only the author nodes that appear as the start of some edge.

    Args:
        author_nodes: Node dicts, each with an ``"id"`` key.
        edges: Edge dicts, each with a ``"start"`` key.

    Returns:
        The nodes from ``author_nodes`` (original order preserved) whose
        ``"id"`` equals the ``"start"`` of at least one edge.
    """
    # Collect every edge origin first so each node needs only one lookup.
    edge_sources = set()
    for edge in edges:
        edge_sources.add(edge["start"])

    kept = []
    for node in author_nodes:
        if node["id"] in edge_sources:
            kept.append(node)
    return kept

def clamp_author_edges_to_nodes(
    author_edges: list[dict], nodes: list[dict]
) -> list[dict]:
    """Keep only the edges whose start is one of the given nodes.

    Args:
        author_edges: Edge dicts, each with a ``"start"`` key.
        nodes: Node dicts, each with an ``"id"`` key.

    Returns:
        The edges from ``author_edges`` (original order preserved) whose
        ``"start"`` equals the ``"id"`` of at least one node.
    """
    # Collect the known node ids first so each edge needs only one lookup.
    known_ids = set()
    for node in nodes:
        known_ids.add(node["id"])

    kept = []
    for edge in author_edges:
        if edge["start"] in known_ids:
            kept.append(edge)
    return kept
29 changes: 27 additions & 2 deletions collabnext/openalex/works.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,33 @@
from pyalex import Works, Work, Author
from pyalex import Author, Institution, Work, Works


def get_works_by_authors(authors: list[Author]) -> list[Work]:
    """Fetch the works of the given authors, without duplicates.

    For each author, works are queried through ``Works().filter(...).get()``
    using the author's ``"id"``. A work returned for several authors is kept
    only the first time it is seen (matched on the work's ``"id"``).

    Args:
        authors: Author records; each must provide ``"id"``.

    Returns:
        The de-duplicated works, in the order they were first returned.
    """
    seen = set()
    works_by_authors = []
    for author in authors:
        # NOTE(review): `.get()` presumably returns a single page of results
        # rather than every work by the author - confirm against the pyalex
        # documentation whether pagination is wanted here.
        works = Works().filter(authorships={"author": {"id": author["id"]}}).get()
        for work in works:
            work_id = work["id"]
            if work_id in seen:
                continue
            seen.add(work_id)
            works_by_authors.append(work)
    return works_by_authors


def get_work_institutions(works: "Work | list[Work]") -> list[Institution]:
    """Collect the unique institutions listed in the authorships of works.

    Accepts either a single work (any ``dict`` instance) or a list of works.
    Institutions are de-duplicated on their ``"id"``; the first occurrence
    is kept.

    Args:
        works: A single work, or a list of works. Each work must provide
            ``"authorships"``, a list of records that each provide
            ``"institutions"`` (a list of records with ``"id"``).

    Returns:
        The unique institution records, in order of first appearance.
    """
    # A single work is a dict and must be wrapped; indexing a list with
    # "authorships" would raise TypeError.
    work_list = [works] if isinstance(works, dict) else works

    seen = set()
    institutions = []
    for work in work_list:
        for authorship in work["authorships"]:
            for institution in authorship["institutions"]:
                institution_id = institution["id"]
                if institution_id in seen:
                    continue
                seen.add(institution_id)
                institutions.append(institution)
    return institutions


def clamp_works_to_institutions(
    works: list[Work], institutions: list[Institution]
) -> list[Work]:
    """Keep only the works that involve at least one of the given institutions.

    A work is kept when the set of institution ids found through
    ``get_work_institutions(work)`` shares at least one id with
    ``institutions``.

    Args:
        works: Work records to filter.
        institutions: Institution records; each must provide ``"id"``.

    Returns:
        The matching works, in their original order.
    """
    institution_ids = {x["id"] for x in institutions}
    result = []
    for work in works:
        work_institution_ids = {x["id"] for x in get_work_institutions(work)}
        # Keep the work as soon as the two id sets overlap.
        if not institution_ids.isdisjoint(work_institution_ids):
            result.append(work)
    return result
Empty file removed observable/docs/data/graph.json.py
Empty file.
96 changes: 66 additions & 30 deletions observable/docs/data/graph.sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from collabnext.openalex.authors import get_affiliated_authors
from collabnext.openalex.edges import (
make_affiliated_author_edges,
make_author_institution_edges,
make_author_work_edges,
make_work_topic_edges,
)
Expand All @@ -21,68 +21,104 @@
make_work_nodes,
)
from collabnext.openalex.topics import get_work_topics
from collabnext.openalex.works import get_works_by_authors
from collabnext.openalex.utils import (
clamp_author_edges_to_nodes,
clamp_author_nodes_to_edges,
)
from collabnext.openalex.works import (
clamp_works_to_institutions,
get_works_by_authors,
)

#
# Get and filter data from OpenAlex
#

# Get institutions
institutions = get_institutions()

# Create nodes
institution_nodes = make_institution_nodes(institutions)

# Get unique affiliated authors
authors = get_affiliated_authors(institutions)

# Get works by authors in these institutions
works = get_works_by_authors(authors)

# Clamp works to institutions
works = clamp_works_to_institutions(works, institutions)

# Get topics from works
topics = get_work_topics(works)

#
# Convert into nodes and edges
#

# Create nodes
institution_nodes = make_institution_nodes(institutions)

# Create author nodes
author_nodes = make_author_nodes(authors)

# Create institution edges
affiliated_author_edges = make_affiliated_author_edges(authors)

# Get works by authors
works = get_works_by_authors(authors)
# Create author -> institution edges
author_institution_edges = make_author_institution_edges(authors, institutions)

# Create work nodes
work_nodes = make_work_nodes(works)

# Create author-work edges
author_work_edges = make_author_work_edges(works)
# Create author -> work edges
author_work_edges = make_author_work_edges(authors, works)

# Get topics from works
topics = get_work_topics(works)
# Clamp author nodes to those with works
author_nodes = clamp_author_nodes_to_edges(author_nodes, author_work_edges)

# Create topic nodes
topic_nodes = make_topic_nodes(topics)

# Create work-topic edges
work_topic_edges = make_work_topic_edges(works)
work_topic_edges = make_work_topic_edges(works, topics)

# Infer author-topic edges
author_topic_edges = infer_author_topic_edges(author_work_edges, work_topic_edges)

# Clamp author nodes to author topic edges
author_nodes = clamp_author_nodes_to_edges(author_nodes, author_topic_edges)

# Clamp author -> institution edges to those with topics
author_institution_edges = clamp_author_edges_to_nodes(
author_institution_edges, author_nodes
)

# Group all nodes and edges together
nodes = [*institution_nodes, *author_nodes, *work_nodes, *topic_nodes]
edges = [
*affiliated_author_edges,
*author_institution_edges,
*author_work_edges,
*author_topic_edges,
*work_topic_edges,
]

#
# Create SQLite database
#

# Create nodes dataframe
df_nodes = pd.DataFrame(nodes, columns=[
"id",
"label",
"type",
"name",
"institution_type",
"homepage",
"works_count",
"cited_by_count",
"field",
"description",
"subfield",
"domain",
])
df_nodes = pd.DataFrame(
nodes,
columns=[
"id",
"label",
"type",
"name",
"institution_type",
"homepage",
"works_count",
"cited_by_count",
"field",
"description",
"subfield",
"domain",
],
)

# Create edges dataframe
df_edges = pd.DataFrame(
Expand Down

0 comments on commit f1a1507

Please sign in to comment.