In [1]:
import pandas as pd
import json

In [3]:
# Load the paper node table (CSV) and the OpenAlex metadata store (JSON)
# from the shared outputs directory, then report how many entries each holds.
paper_nodes = pd.read_csv("../outputs/paper_nodes_dia.csv")

with open("../outputs/openalex_metadata.json", "r", encoding="utf-8") as metadata_file:
    metadata_store = json.loads(metadata_file.read())

print("Loaded paper_nodes:", len(paper_nodes))
print("Loaded metadata entries:", len(metadata_store))

Loaded paper_nodes: 2628
Loaded metadata entries: 273


In [4]:
## FILTER DIA ONLY
# Keep rows whose domain is "DIA" and that carry an OpenAlex ID. The copy makes
# dia_nodes independent of paper_nodes.
is_dia = paper_nodes["domain"] == "DIA"
has_openalex_id = paper_nodes["openalex_id"].notna()
dia_nodes = paper_nodes.loc[is_dia & has_openalex_id].copy()

print("DIA papers with OpenAlex ID:", len(dia_nodes))

DIA papers with OpenAlex ID: 273


In [5]:
## BUILD OpenAlex → node_id MAP
# Pair each row's openalex_id with its node_id. If an openalex_id appears more
# than once, the node_id of the later row wins.

oa_to_global = {
    openalex_id: node_id
    for openalex_id, node_id in zip(dia_nodes["openalex_id"], dia_nodes["node_id"])
}

print("OpenAlex ID mapping size:", len(oa_to_global))

OpenAlex ID mapping size: 273


In [6]:
## BUILD CITATION EDGES
# For every row of dia_nodes, look up its metadata record (metadata_store is
# keyed by title here) and emit one edge per referenced work that also appears
# in oa_to_global. Each edge is a dict holding the citing node id ("source"),
# the cited node id ("target") and the citing paper's year ("year").
# NOTE(review): the lookup is by title, so rows sharing a title would share one
# metadata record — confirm titles are unique in metadata_store.

citation_edges = []

# Iterate only over the three columns that are actually used.
for source_global, source_year, title in zip(
    dia_nodes["node_id"], dia_nodes["year"], dia_nodes["title"]
):

    meta = metadata_store.get(title)

    if not meta:
        # Missing or empty metadata record: no references to inspect.
        continue

    # `or []` also covers a record whose "referenced_works" value is null,
    # which would otherwise make the loop below raise TypeError.
    references = meta.get("referenced_works") or []

    for ref in references:

        # Only keep citations within DIA
        if ref not in oa_to_global:
            continue

        target_global = oa_to_global[ref]

        # Skip self-citations (a paper listing itself as a reference)
        if source_global == target_global:
            continue

        citation_edges.append({
            "source": source_global,
            "target": target_global,
            "year": source_year
        })


In [7]:
# Build the edge table with explicit columns so that the summary below still
# works (and reports zeros) when no edges were collected; an empty list without
# `columns` yields a frame with no columns and the lookups raise KeyError.
citation_df = pd.DataFrame(citation_edges, columns=["source", "target", "year"])

# Sanity summary: edge count, distinct citing/cited nodes, and a check that no
# edge points from a node to itself.
print("Total DIA citation edges:", len(citation_df))
print("Unique sources:", citation_df["source"].nunique())
print("Unique targets:", citation_df["target"].nunique())
print("Self citations:", (citation_df["source"] == citation_df["target"]).sum())

Total DIA citation edges: 526
Unique sources: 188
Unique targets: 115
Self citations: 0


In [8]:
# Persist the edge list as CSV, omitting the positional index column.
edges_csv_path = "../outputs/citation_edges_dia.csv"
citation_df.to_csv(edges_csv_path, index=False)

print("Saved citation_edges_dia.csv")

Saved citation_edges_dia.csv


In [9]:
# Round-trip check: reload the file that was just written and display it.
saved_edges = pd.read_csv('../outputs/citation_edges_dia.csv')
saved_edges

Unnamed: 0,source,target,year
0,NOVEL_DIA_1,SKG_DIA_72,2021.0
1,NOVEL_DIA_2,SKG_DIA_97,2021.0
2,NOVEL_DIA_3,SKG_DIA_216,2021.0
3,NOVEL_DIA_4,SKG_DIA_150,2021.0
4,NOVEL_DIA_4,SKG_DIA_49,2021.0
...,...,...,...
521,SKG_DIA_315,SKG_DIA_308,2020.0
522,SKG_DIA_315,SKG_DIA_276,2020.0
523,SKG_DIA_315,SKG_DIA_186,2020.0
524,SKG_DIA_315,SKG_DIA_102,2020.0


In [10]:
# Count edges whose "year" value is missing.
missing_year_mask = citation_df["year"].isna()
missing_year_mask.sum()

np.int64(0)