In [1]:
import pandas as pd
import os
from glob import glob

In [2]:
# Read the paper and entity node tables from the shared outputs folder.
paper_nodes = pd.read_csv(
    filepath_or_buffer="../outputs/paper_nodes.csv",
)
entity_nodes = pd.read_csv(
    filepath_or_buffer="../outputs/entity_nodes.csv",
)

# Row counts as a quick check that both tables loaded.
print("Paper nodes:", len(paper_nodes))
print("Entity nodes:", len(entity_nodes))

Paper nodes: 2628
Entity nodes: 108816


In [3]:
## Lookup table: entity name -> entity node_id
# Built pairwise from the two columns; if a name occurred more than once,
# the last row's node_id would be the one kept.
entity_map = {
    name: node_id
    for name, node_id in zip(entity_nodes["name"], entity_nodes["node_id"])
}

# If this equals the entity node count, every name is unique.
print("Entity map size:", len(entity_map))

Entity map size: 108816


In [4]:
## Triplet files

# Root folder that is searched recursively for "*_triplets.csv" files.
TRIPLETS_FOLDER = "../../Scientific_Novelty_Detection/Triplets/"

# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the processing order, and therefore the order of the collected edges,
# reproducible across machines and runs.
triplet_files = sorted(
    glob(os.path.join(TRIPLETS_FOLDER, "**", "*_triplets.csv"), recursive=True)
)

print("Triplet files found:", len(triplet_files))

# Accumulator for edge records (one dict per paper -> entity edge).
edges = []

Triplet files found: 17


In [5]:
## Normalize entity text so it can be used as a key into entity_map
def normalize_entity(text):
    """Return a canonical lowercase form of an entity string.

    Missing values (anything ``pd.isna`` reports as NA) yield ``None``.
    Otherwise the value is converted to ``str``, stripped, lowercased,
    triple double-quotes are collapsed to one double-quote, doubled single
    quotes are collapsed to one single quote, and every run of whitespace
    becomes a single space.

    NOTE(review): this is meant to mirror the normalization applied when
    entity_nodes was built; that code is not in this notebook -- confirm
    the two stay in sync.
    """
    if pd.isna(text):
        return None

    lowered = str(text).strip().lower()
    unquoted = lowered.replace('"""', '"').replace("''", "'")
    return " ".join(unquoted.split())

In [37]:
# Build paper -> entity edges from every triplet file.
#
# For each triplet row whose paper is present in paper_nodes, one edge is
# emitted per endpoint (sub and obj) that resolves to a known entity in
# entity_map. Each edge carries the triplet's predicate and the paper's year.

# Start from an empty list on every run of this cell; otherwise re-executing
# the cell would append a second copy of every edge to the existing list.
edges = []

paper_id_set = set(paper_nodes["node_id"])

# node_id -> year lookup, built once instead of scanning paper_nodes for every
# triplet row. setdefault keeps the first row's year for a repeated node_id,
# which matches taking the first match of a boolean-mask selection.
paper_year = {}
for node_id, year in zip(paper_nodes["node_id"], paper_nodes["year"]):
    paper_year.setdefault(node_id, year)

required_cols = ["sub", "obj", "pred", "paper_ID"]

for file in triplet_files:

    # Split is the parent folder name, domain is the filename prefix before
    # the first underscore; both are upper-cased to form the global paper id.
    split = os.path.basename(os.path.dirname(file)).upper()
    filename = os.path.basename(file)
    domain = filename.split("_")[0].upper()

    df = pd.read_csv(file)

    # Skip files that do not have the expected schema, but say so rather than
    # dropping them silently.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Skipping {file}: missing columns {missing_cols}")
        continue

    # Iterate the four needed columns directly; this avoids building a Series
    # object per row as iterrows does.
    for local_id, raw_sub, raw_obj, predicate in zip(
        df["paper_ID"], df["sub"], df["obj"], df["pred"]
    ):

        global_id = f"{split}_{domain}_{local_id}"

        # Triplets for papers that are not in the paper node table are ignored.
        if global_id not in paper_id_set:
            continue

        source_year = paper_year[global_id]

        sub = normalize_entity(raw_sub)
        obj = normalize_entity(raw_obj)

        # Endpoints that normalize to None or are unknown to entity_map
        # produce no edge.
        for entity in (sub, obj):
            if entity in entity_map:
                edges.append({
                    "source": global_id,
                    "target": entity_map[entity],
                    "predicate": predicate,
                    "year": source_year
                })

In [39]:
# Number of edge records collected so far (duplicates not yet removed).
print(len(edges))

278530


In [40]:
# Create the edge table from the collected records.
# The column list is passed explicitly so the frame has the expected schema
# even when `edges` is empty; without it an empty list yields a frame with no
# columns and later column lookups such as knowledge_edges["year"] fail.
knowledge_edges = pd.DataFrame(
    edges,
    columns=["source", "target", "predicate", "year"],
)

print("Raw knowledge edges:", len(knowledge_edges))

# Remove exact duplicate rows (same source, target, predicate and year),
# e.g. when the same triplet appears more than once for a paper.
knowledge_edges = knowledge_edges.drop_duplicates()

print("After deduplication:", len(knowledge_edges))

Raw knowledge edges: 278530
After deduplication: 238273


In [41]:
# Sanity checks on the deduplicated edge table
print("\nSANITY CHECKS")

# Expected to be 0 after drop_duplicates.
print("Duplicate edges:", knowledge_edges.duplicated().sum())

# Edges whose year value is missing.
print("Missing year values:", knowledge_edges["year"].isna().sum())

# Referential integrity: every edge endpoint must exist in its node table.
assert knowledge_edges["source"].isin(paper_nodes["node_id"]).all()
assert knowledge_edges["target"].isin(entity_nodes["node_id"]).all()


SANITY CHECKS
Duplicate edges: 0
Missing year values: 3925


In [42]:
## Write the edge table next to the node tables (row index is not written)
knowledge_edges.to_csv(
    path_or_buf="../outputs/knowledge_edges.csv",
    index=False,
)

print("\nSaved: knowledge_edges.csv")


Saved: knowledge_edges.csv


In [43]:
# Distinct paper ids that never appear as the source of any edge.
print(
    "Papers with no edges:",
    len(set(paper_nodes["node_id"]).difference(knowledge_edges["source"])),
)

Papers with no edges: 470


In [44]:
# Edge counts per paper domain: attach each edge's source paper domain via an
# inner join on the paper id, then count edges per domain.
(
    knowledge_edges
    .merge(
        paper_nodes[["node_id", "domain"]],
        how="inner",
        left_on="source",
        right_on="node_id",
    )["domain"]
    .value_counts()
)

domain
MT     104301
SA      39307
QA      35644
DIA     25089
SUM     18420
PAR     13835
NLI      1677
Name: count, dtype: int64