In [1]:
import pandas as pd
import hashlib
import os
from glob import glob
import re

In [2]:
# Change this to your actual triplet folder path
TRIPLETS_FOLDER = "../../Scientific_Novelty_Detection/Triplets/"

# Recursively find all *_triplets.csv files
triplet_files = glob(os.path.join(TRIPLETS_FOLDER, "**", "*_triplets.csv"), recursive=True)

print("Found", len(triplet_files), "triplet files")
for f in triplet_files:
    print(f)

Found 17 triplet files
../../Scientific_Novelty_Detection/Triplets\Blogs\Dia_Blogs_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Blogs\MT_Blogs_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Blogs\QA_Blogs_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Blogs\SA_Blogs_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Blogs\Sum_Blogs_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Novel_Papers\Dia2021_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Novel_Papers\MT2021_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Novel_Papers\QA2021_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Novel_Papers\SA2021_triplets.csv
../../Scientific_Novelty_Detection/Triplets\Novel_Papers\Sum2021_triplets.csv
../../Scientific_Novelty_Detection/Triplets\SKG\Dia_triplets.csv
../../Scientific_Novelty_Detection/Triplets\SKG\MT_triplets.csv
../../Scientific_Novelty_Detection/Triplets\SKG\NLI_triplets.csv
../../Scientific_Novelty_Detection

In [3]:
## Merge all triplets
all_dfs = []

for file in triplet_files:
    df = pd.read_csv(file)

    # Ensure required columns exist
    if "sub" not in df.columns or "obj" not in df.columns:
        raise ValueError(f"Missing 'sub' or 'obj' in {file}")

    all_dfs.append(df[["sub", "obj"]])

triplets_df = pd.concat(all_dfs, ignore_index=True)

print("Total triplet rows:", len(triplets_df))

Total triplet rows: 238088


In [4]:
triplets_df

Unnamed: 0,sub,obj
0,hyper-parameters,more powerful decoder
1,more powerful decoder,higher conversational quality
2,ablation analysis,hyper-parameters
3,tuned decoding,ssa score
4,ssa score,79 %
...,...,...
238083,results,proposed model
238084,our model,longer sequences
238085,longer sequences,introduction and the sentences
238086,introduction and the sentences,extractor


In [5]:
## Normalise Entity Strings
def normalize_entity(text):
    if pd.isna(text):
        return None

    text = str(text).strip().lower()

    # Remove excessive quotes
    text = text.replace('"""', '"')
    text = text.replace("''", "'")

    # Remove extra spaces
    text = " ".join(text.split())

    return text

triplets_df["sub"] = triplets_df["sub"].apply(normalize_entity)
triplets_df["obj"] = triplets_df["obj"].apply(normalize_entity)

In [6]:
raw_entities = set(triplets_df["sub"].dropna()) | set(triplets_df["obj"].dropna())
print("Raw unique entities:", len(raw_entities))

Raw unique entities: 119379


In [7]:
def is_valid_entity(text):
    if not text:
        return False

    text = text.strip()

    # Minimum length
    if len(text) < 3:
        return False

    # Remove parenthesis-dominated patterns
    if text.startswith("("):
        return False

    if re.fullmatch(r"\(.*\)", text):
        return False

    # Remove set notation / math blocks
    if text.startswith("{") or text.startswith("|"):
        return False

    if re.search(r"\{.*\}", text):
        return False

    # Remove scientific notation
    if re.search(r"\d+e[-\s]?\d+", text):
        return False

    # Remove heavy numeric ratio
    digit_ratio = sum(c.isdigit() for c in text) / len(text)
    if digit_ratio > 0.4:
        return False

    # Remove hardware/config patterns
    if "gpu" in text:
        return False

    if "units" in text:
        return False

    if "encoder" in text and "decoder" in text:
        return False

    # Remove statistical fragments
    if text.startswith("~"):
        return False

    if "%" in text:
        return False

    stat_patterns = [
        " more ",
        " lower",
        " lines",
        " questions",
        " entities",
        " pp",
        " ratio"
    ]

    for p in stat_patterns:
        if p in text:
            return False

    if re.search(r"\d+\s?[km]", text):
        return False

    if re.search(r"\d+\.?\d*x", text):
        return False

    # Must contain alphabet
    if not re.search(r"[a-zA-Z]", text):
        return False

    # Remove pure punctuation
    if re.fullmatch(r"[\W_]+", text):
        return False

    return True

In [8]:
## Extract unique Entities
clean_entities = {e for e in raw_entities if is_valid_entity(e)}
print("Clean unique entities:", len(clean_entities))

Clean unique entities: 108816


In [9]:
## Assign entity IDs
def generate_entity_id(entity_string):
    return "E_" + hashlib.md5(entity_string.encode("utf-8")).hexdigest()[:10]

entity_data = []

for entity in sorted(clean_entities):
    entity_id = generate_entity_id(entity)
    entity_data.append({
        "node_id": entity_id,
        "node_type": "Entity",
        "name": entity
    })

entity_nodes_df = pd.DataFrame(entity_data)

In [10]:
entity_nodes_df.to_csv("../outputs/entity_nodes.csv", index=False)

print("entity_nodes.csv created successfully.")

entity_nodes.csv created successfully.


In [13]:
entity_nodes_df.sample(50)

Unnamed: 0,node_id,node_type,name
19457,E_daf9566046,Entity,ce and afl
54869,E_70a82ed313,Entity,manual pyramid annotations
62566,E_0f04cf5ba5,Entity,"negators ( e.g. , cannot )"
19288,E_4b3ddb1ee9,Entity,catalan - spanish pair
84794,E_6276ef647a,Entity,sense embeddings
5408,E_0062ef963f,Entity,69.58 bleu score
100483,E_9fbc7e50ce,Entity,translationese from non-translated english
60646,E_dec8ae0a7e,Entity,multi-task learning methods
88951,E_eeb5763e42,Entity,skewed prediction loss distributions
59032,E_e551d0b620,Entity,moses default value weights
