In [3]:
import ast

import pandas as pd

In [4]:
## Load
paper_nodes = pd.read_csv("../outputs/paper_nodes.csv")
oa_df = pd.read_csv("../outputs/openalex_metadata_full.csv")

print("Total papers:", len(paper_nodes))
print("OpenAlex papers:", len(oa_df))

# Build OpenAlex ID → global ID map
# Rows without an OpenAlex ID are excluded so that a NaN never ends up as a
# dictionary key.
oa_with_id = oa_df.dropna(subset=["openalex_id"])

# A repeated OpenAlex ID overwrites the earlier entry (last row wins), which
# is why the map can be smaller than the table. Surface the counts instead of
# letting the collision pass unnoticed.
n_missing_ids = len(oa_df) - len(oa_with_id)
n_repeated_ids = int(oa_with_id["openalex_id"].duplicated().sum())
if n_missing_ids or n_repeated_ids:
    print(
        "WARNING: OpenAlex rows without an ID:", n_missing_ids,
        "| rows repeating an earlier ID:", n_repeated_ids,
    )

oa_id_map = dict(
    zip(oa_with_id["openalex_id"], oa_with_id["global_paper_id"])
)

print("OpenAlex ID map size:", len(oa_id_map))

Total papers: 2628
OpenAlex papers: 2529
OpenAlex ID map size: 2436


In [5]:
# Build citation edges
# One record per (citing paper, cited paper) pair where both papers belong to
# our paper universe; "year" is the publication year of the citing paper.
edges = []

# Iterating the three needed columns directly avoids building a Series per
# row, which is what iterrows() does.
for source_global, source_year, referenced in zip(
    oa_df["global_paper_id"], oa_df["year"], oa_df["referenced_works"]
):

    # referenced_works is stored as the string form of a list → parse it.
    # ast.literal_eval accepts only Python literals, so a crafted CSV cell
    # cannot execute code the way eval() would.
    if isinstance(referenced, str):
        try:
            referenced = ast.literal_eval(referenced)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Malformed cell: skip this paper's references, as before.
            continue

    # Missing values (NaN) and anything that did not parse to a collection
    # carry no usable references.
    if not isinstance(referenced, (list, tuple, set)):
        continue

    for ref_oa_id in referenced:

        # Keep only citations within our paper universe
        if ref_oa_id not in oa_id_map:
            continue

        target_global = oa_id_map[ref_oa_id]

        # Drop self-citations
        if source_global == target_global:
            continue

        edges.append({
            "source": source_global,
            "target": target_global,
            "year": source_year
        })

In [6]:
# Create DataFrame
# Explicit columns keep the schema stable even when no edges were found;
# without them an empty edge list yields a frame with no columns at all, and
# any later lookup such as citation_edges["year"] raises KeyError.
citation_edges = pd.DataFrame(edges, columns=["source", "target", "year"])

print("Raw citation edges:", len(citation_edges))

# Remove duplicates (rows identical in source, target and year)
citation_edges = citation_edges.drop_duplicates()

print("After deduplication:", len(citation_edges))

Raw citation edges: 7425
After deduplication: 7425


In [10]:
# Final Sanity Checks
print("\nSANITY CHECKS")

print("Duplicate edges:",
      citation_edges.duplicated().sum())

print("Missing year values:",
      citation_edges["year"].isnull().sum())

# Every edge endpoint must be a known paper node. The membership masks are
# computed once so a failing check can report how many edges are affected.
known_ids = paper_nodes["node_id"]
source_known = citation_edges["source"].isin(known_ids)
target_known = citation_edges["target"].isin(known_ids)

assert source_known.all(), (
    f"{(~source_known).sum()} edges have a source missing from paper_nodes"
)

assert target_known.all(), (
    f"{(~target_known).sum()} edges have a target missing from paper_nodes"
)


SANITY CHECKS
Duplicate edges: 0
Missing year values: 0


In [8]:
# Persist the edge list as CSV; the DataFrame index is not written.
citation_edges.to_csv(
    path_or_buf="../outputs/citation_edges.csv",
    index=False,
)

# Confirmation message for the notebook output
print("\nSaved: citation_edges.csv")


Saved: citation_edges.csv
