In [1]:
import pandas as pd

In [2]:
# Load the base paper universe (the filtered papers).
filtered_df = pd.read_csv("../outputs/filtered_papers.csv")
print("Total filtered papers:", len(filtered_df))

# Load the OpenAlex metadata.
oa_df = pd.read_csv("../outputs/openalex_metadata_full.csv")
print("OpenAlex matched papers:", len(oa_df))

# Each global_paper_id may appear at most once in the OpenAlex table;
# a repeated key would duplicate paper rows in the merge below.
assert oa_df["global_paper_id"].is_unique, "Duplicate global IDs in OpenAlex metadata!"

# Left merge: every row of filtered_df is kept, and papers without an
# OpenAlex record get NaN in the OpenAlex columns. `validate` makes pandas
# itself raise a MergeError if the right-hand keys are not unique.
paper_nodes = filtered_df.merge(
    oa_df,
    on="global_paper_id",
    how="left",
    validate="many_to_one",
)

# Check the "keep all papers" invariant against the actual input size
# instead of relying on a hard-coded count in a comment.
assert len(paper_nodes) == len(filtered_df), (
    "Left merge changed the number of paper rows!"
)

Total filtered papers: 2628
OpenAlex matched papers: 2529


In [3]:
## Rename + clean columns

# Build the node table in a single chain:
#   1. rename the paper key to the generic "node_id",
#   2. tag every row with the constant node type "Paper",
#   3. keep only the listed columns, in this order.
paper_nodes = (
    paper_nodes
    .rename(columns={"global_paper_id": "node_id"})
    .assign(node_type="Paper")
    .loc[
        :,
        [
            "node_id",
            "node_type",
            "title",
            "domain",
            "split",
            "year",
            "openalex_id",
            "cited_by_count",
            "score",   # original mapping score
            "method",  # original mapping method
        ],
    ]
)

In [4]:
## Sanity Checks

print("\n===== SANITY CHECKS =====")

# Row count and number of repeated node ids.
print("Total paper nodes:", len(paper_nodes))
print("Duplicate node_ids:",
      paper_nodes["node_id"].duplicated().sum())

# Number of missing values in each column of interest.
for label, column in (
    ("Missing OpenAlex IDs:", "openalex_id"),
    ("Missing year values:", "year"),
):
    print(label, paper_nodes[column].isna().sum())

# Stop before writing the file if any node id is repeated.
assert paper_nodes["node_id"].is_unique

# Write the node table to disk without the pandas index column.
paper_nodes.to_csv("../outputs/paper_nodes.csv", index=False)

print("\nSaved: paper_nodes.csv")


===== SANITY CHECKS =====
Total paper nodes: 2628
Duplicate node_ids: 0
Missing OpenAlex IDs: 99
Missing year values: 99

Saved: paper_nodes.csv
