In [1]:
import pandas as pd

In [2]:
## Load data
triplets_dia = pd.concat([
    pd.read_csv("../../Scientific_Novelty_Detection/Triplets/Blogs/Dia_Blogs_triplets.csv"),
    pd.read_csv("../../Scientific_Novelty_Detection/Triplets/Novel_Papers/Dia2021_triplets.csv"),
    pd.read_csv("../../Scientific_Novelty_Detection/Triplets/SKG/Dia_triplets.csv")
], ignore_index=True)
filtered_df = pd.read_csv("../outputs/filtered_papers.csv")
paper_nodes = pd.read_csv("../outputs/paper_nodes_dia.csv")
entity_nodes = pd.read_csv("../outputs/entity_nodes.csv")

print("Triplets:", len(triplets_dia))
print("Filtered papers:", len(filtered_df))
print("Paper nodes:", len(paper_nodes))
print("Entities:", len(entity_nodes))

Triplets: 24619
Filtered papers: 2628
Paper nodes: 2628
Entities: 108816


In [3]:
triplets_dia.rename({"paper_ID" : "paper_id"}, axis=1, inplace=True)

In [4]:
triplets_dia


Unnamed: 0,topic,paper_id,sentence_ID,info-unit,sub,pred,obj,triplets,pred_weights
0,Blogs,0,23,ablation-analysis,hyper-parameters,discovered that,more powerful decoder,hyper-parameters discovered that more powerful...,0.709181
1,Blogs,0,23,ablation-analysis,more powerful decoder,key to,higher conversational quality,more powerful decoder key to higher conversati...,0.592724
2,Blogs,0,23,ablation-analysis,ablation analysis,tuning,hyper-parameters,ablation analysis tuning hyper-parameters,0.820923
3,Blogs,0,59,ablation-analysis,tuned decoding,advances,ssa score,tuned decoding advances ssa score,0.693804
4,Blogs,0,59,ablation-analysis,ssa score,to,79 %,ssa score to 79 %,0.526003
...,...,...,...,...,...,...,...,...,...
24614,Blogs,7,181,results,top,in,dstc1 and dstc3 evaluations,top in dstc1 and dstc3 evaluations,0.557671
24615,Blogs,7,181,results,results,has,machine - learned methods,results has machine - learned methods,0.492934
24616,Blogs,7,189,results,dstc1,has,sequential crf model,dstc1 has sequential crf model,0.571273
24617,Blogs,7,189,results,outperformed,has,static classifier approaches,outperformed has static classifier approaches,0.631077


In [5]:
triplets_dia.columns

Index(['topic', 'paper_id', 'sentence_ID', 'info-unit', 'sub', 'pred', 'obj',
       'triplets', 'pred_weights'],
      dtype='object')

In [6]:
filtered_dia = filtered_df[filtered_df["domain"] == "DIA"].copy()

paper_nodes_dia = paper_nodes[
    (paper_nodes["domain"] == "DIA")
].copy()

print("Filtered DIA papers:", len(filtered_dia))
print("Paper nodes DIA:", len(paper_nodes_dia))

Filtered DIA papers: 274
Paper nodes DIA: 274


In [7]:
## BUILD local_paper_id → global_paper_id MAP
filtered_dia = filtered_df[filtered_df["domain"] == "DIA"].copy()

paper_nodes_dia = paper_nodes[
    (paper_nodes["domain"] == "DIA")
].copy()

print("Filtered DIA papers:", len(filtered_dia))
print("Paper nodes DIA:", len(paper_nodes_dia))

Filtered DIA papers: 274
Paper nodes DIA: 274


In [8]:
# BUILD local_paper_id → global_paper_id MAP
local_to_global = dict(
    zip(filtered_dia["local_paper_id"], filtered_dia["global_paper_id"])
)

print("Local → Global mapping size:", len(local_to_global))

Local → Global mapping size: 231


In [9]:
# BUILD entity name → entity_id MAP
entity_map = dict(
    zip(entity_nodes["name"], entity_nodes["node_id"])
)

print("Entity mapping size:", len(entity_map))

Entity mapping size: 108816


In [10]:
# BUILD entity name → entity_id MAP
paper_year_map = dict(
    zip(paper_nodes_dia["node_id"], paper_nodes_dia["year"])
)

In [17]:
# CONSTRUCT KNOWLEDGE EDGES
knowledge_edges = []

missing_entities = 0
missing_papers = 0

for _, row in triplets_dia.iterrows():

    local_id = (row["paper_id"])   # ensure string match

    if local_id not in local_to_global:
        missing_papers += 1
        continue

    global_id = local_to_global[local_id]

    sub = str(row["sub"]).strip()
    pred = str(row["pred"]).strip()
    obj = str(row["obj"]).strip()

    # Normalize (must match entity_nodes cleaning)
    sub = sub.lower()
    obj = obj.lower()

    # SUB edge
    if sub in entity_map:
        knowledge_edges.append({
            "source": global_id,
            "target": entity_map[sub],
            "predicate": pred,
            "year": paper_year_map.get(global_id)
        })
    else:
        missing_entities += 1

    # OBJ edge
    if obj in entity_map:
        knowledge_edges.append({
            "source": global_id,
            "target": entity_map[obj],
            "predicate": pred,
            "year": paper_year_map.get(global_id)
        })
    else:
        missing_entities += 1

In [18]:
knowledge_edges

[{'source': 'SKG_DIA_0',
  'target': 'E_4616a006d0',
  'predicate': 'discovered that',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_ed84d47e33',
  'predicate': 'discovered that',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_ed84d47e33',
  'predicate': 'key to',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_250a0b27d0',
  'predicate': 'key to',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_b12965637d',
  'predicate': 'tuning',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_4616a006d0',
  'predicate': 'tuning',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_68a81f19c9',
  'predicate': 'advances',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_0221cdbe2a',
  'predicate': 'advances',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_0221cdbe2a',
  'predicate': 'to',
  'year': 2019.0},
 {'source': 'SKG_DIA_0',
  'target': 'E_b12965637d',
  'predicate': 'has',
  'year': 2019.0},
 {'source': 'SK

In [19]:
# Create DataFrame
knowledge_df = pd.DataFrame(knowledge_edges)

print("Total DIA knowledge edges:", len(knowledge_df))
print("Missing paper mappings:", missing_papers)
print("Missing entity mappings:", missing_entities)

print("Unique sources:", knowledge_df["source"].nunique())
print("Unique targets:", knowledge_df["target"].nunique())

Total DIA knowledge edges: 41561
Missing paper mappings: 2470
Missing entity mappings: 2737
Unique sources: 231
Unique targets: 13317


In [21]:
knowledge_df.to_csv("../outputs/knowledge_edges_dia.csv", index=False)

print("Saved knowledge_edges_dia.csv")

Saved knowledge_edges_dia.csv
