In [2]:
import pandas as pd
import requests
import time
import hashlib
from tqdm import tqdm

In [3]:
OPENALEX_API_KEY = "gk3pilAfGAop5QnB0IWhRy"

HEADERS = {
    "User-Agent": "Novelty-Detection-Project",
    "mailto": "ishantk250705@gmail.com"
}

BASE_URL = "https://api.openalex.org/works/"


In [4]:
def create_global_paper_id(split, domain, paper_id):
    return f"{split}_{domain}_{paper_id}"

def normalize_entity(entity):
    return str(entity).strip().lower()

def create_entity_id(entity_name):
    h = hashlib.md5(entity_name.encode()).hexdigest()[:12]
    return f"E_{h}"

def fetch_openalex_metadata(openalex_id):
    url = BASE_URL + openalex_id
    params = {"api_key": OPENALEX_API_KEY}
    response = requests.get(url, headers=HEADERS, params=params)

    if response.status_code != 200:
        return None

    data = response.json()
    return {
        "year": data.get("publication_year"),
        "abstract": data.get("abstract"),
        "cited_works": data.get("referenced_works", [])
    }


In [5]:
SKG_FILES = {
    "Dia": "Scientific_Novelty_Detection/Triplets/SKG/Dia_triplets.csv",
    "MT": "Scientific_Novelty_Detection/Triplets/SKG/MT_triplets.csv",
    "NLI": "Scientific_Novelty_Detection/Triplets/SKG/NLI_triplets.csv",
    "Par": "Scientific_Novelty_Detection/Triplets/SKG/Par_triplets.csv",
    "QA": "Scientific_Novelty_Detection/Triplets/SKG/QA_triplets.csv",
    "SA": "Scientific_Novelty_Detection/Triplets/SKG/SA_triplets.csv",
    "Sum": "Scientific_Novelty_Detection/Triplets/SKG/Sum_triplets.csv",
}

NOVEL_FILES = {
    "Dia": "Scientific_Novelty_Detection/Triplets/Novel_Papers/Dia2021_triplets.csv",
    "MT": "Scientific_Novelty_Detection/Triplets/Novel_Papers/MT2021_triplets.csv",
    "QA": "Scientific_Novelty_Detection/Triplets/Novel_Papers/QA2021_triplets.csv",
    "SA": "Scientific_Novelty_Detection/Triplets/Novel_Papers/SA2021_triplets.csv",
    "Sum": "Scientific_Novelty_Detection/Triplets/Novel_Papers/Sum2021_triplets.csv",
}

BLOG_FILES = {
    "Dia": "Scientific_Novelty_Detection/Triplets/Blogs/Dia_Blogs_triplets.csv",
    "MT": "Scientific_Novelty_Detection/Triplets/Blogs/MT_Blogs_triplets.csv",
    "QA": "Scientific_Novelty_Detection/Triplets/Blogs/QA_Blogs_triplets.csv",
    "SA": "Scientific_Novelty_Detection/Triplets/Blogs/SA_Blogs_triplets.csv",
    "Sum": "Scientific_Novelty_Detection/Triplets/Blogs/Sum_Blogs_triplets.csv",
}


In [8]:
def load_triplets(file_dict, split_name):
    all_rows = []

    for domain, path in file_dict.items():
        df = pd.read_csv(path)

        # Ensure required columns exist
        required_cols = ["paper_ID", "pred", "obj"]
        for col in required_cols:
            if col not in df.columns:
                raise ValueError(f"{col} not found in {path}")

        for _, row in df.iterrows():
            global_id = create_global_paper_id(
                split_name,
                domain,
                row["paper_ID"]
            )

            entity = normalize_entity(row["obj"])
            entity_id = create_entity_id(entity)

            all_rows.append({
                "paper_id": global_id,
                "domain": domain,
                "split": split_name,
                "predicate": row["pred"],
                "entity_name": entity,
                "entity_id": entity_id
            })

    return pd.DataFrame(all_rows)


print("Loading SKG...")
skg_df = load_triplets(SKG_FILES, "SKG")

print("Loading NOVEL...")
novel_df = load_triplets(NOVEL_FILES, "NOVEL")

print("Loading BLOG...")
blog_df = load_triplets(BLOG_FILES, "BLOG")

triplets_df = pd.concat([skg_df, novel_df, blog_df], ignore_index=True)

print("Total triples:", len(triplets_df))


Loading SKG...
Loading NOVEL...
Loading BLOG...
Total triples: 238088


In [9]:
paper_nodes = triplets_df[["paper_id", "domain", "split"]].drop_duplicates()

paper_nodes["node_type"] = "Paper"
paper_nodes["year"] = None
paper_nodes["name"] = None

paper_nodes = paper_nodes.rename(columns={"paper_id": "node_id"})

print("Total papers:", len(paper_nodes))


Total papers: 3965


In [10]:
entity_nodes = triplets_df[["entity_id", "entity_name"]].drop_duplicates()

entity_nodes["node_type"] = "Entity"
entity_nodes["year"] = None
entity_nodes["domain"] = None
entity_nodes["split"] = None

entity_nodes = entity_nodes.rename(columns={
    "entity_id": "node_id",
    "entity_name": "name"
})

print("Total entities:", len(entity_nodes))


Total entities: 108362


In [11]:
metadata_store = {}

for pid in tqdm(paper_nodes["node_id"].tolist()):
    # Extract OpenAlex ID from your ID if available
    # Modify this depending on your mapping format
    openalex_id = pid  # <-- FIX THIS IF NEEDED

    meta = fetch_openalex_metadata(openalex_id)

    if meta:
        metadata_store[pid] = meta
        time.sleep(0.2)


 38%|███▊      | 1494/3965 [09:51<16:17,  2.53it/s] 


KeyboardInterrupt: 

In [14]:
import os

print("Triplets Dia unique papers:")
df = pd.read_csv("Scientific_Novelty_Detection/Triplets/SKG/Dia_triplets.csv")
print(df["paper_ID"].nunique())

print("GROBID Dia files:")
print(len(os.listdir("Scientific_Novelty_Detection/grobid_files/SKG_Papers/Dia")))


Triplets Dia unique papers:
304
GROBID Dia files:
319


In [15]:
import os

files = os.listdir("Scientific_Novelty_Detection/grobid_files/SKG_Papers/Dia")
print(files[:20])

['10.IIT-UHH at SemEval-2017 Task 3 Exploring Multiple Features for Community Question Answering and Implicit Dialogue Identification-Grobid-out.txt', '10.PLATO Pre-trained Dialogue Generation Model with Discrete Latent Variable-Grobid-out.txt', '10.Text-based Speaker Identification on Multiparty Dialogues Using Multi-document Convolutional Neural Networks-Grobid-out.txt', '10.The Impact of Interpretation Problems on Tutorial Dialogue-Grobid-out.txt', '100.Learning the Information Status of Noun Phrases in Spoken Dialogues-Grobid-out.txt', '103.Recognizing Authority in Dialogue with an Integer Linear Programming Constrained Model-Grobid-out.txt', '104.Towards an Automatic Turing Test Learning to Evaluate Dialogue Responses-Grobid-out.txt', '107.Semantic Information and Derivation Rules for Robust Dialogue Act Detection in a Spoken Dialogue System-Grobid-out.txt', '11.A Statistical Spoken Dialogue System using Complex User Goals and Value Directed Compression-Grobid-out.txt', '11.Intrin

In [16]:
import pandas as pd

df_dia = pd.read_csv("Scientific_Novelty_Detection/Triplets/SKG/Dia_triplets.csv")

In [17]:
paper_id = 1

entities = df_dia[df_dia["paper_ID"] == paper_id]["obj"].unique()
entities = [e.lower() for e in entities if isinstance(e, str)]

print(entities[:10])


['150k human-human dialogues', 'recently introduced guesswhat ?!', 'pre-trained models', 'reinforce', 'plain stochastic gradient descent ( sgd )', '80 epochs', 'learning rate', 'batch size', '0.001', '64']


In [18]:
import os

grobid_path = "Scientific_Novelty_Detection/grobid_files/SKG_Papers/Dia"
files = os.listdir(grobid_path)

def find_best_match(entities):
    best_file = None
    best_score = 0

    for file in files:
        with open(os.path.join(grobid_path, file), "r", encoding="utf-8") as f:
            text = f.read().lower()

        score = sum(1 for e in entities[:5] if e in text)

        if score > best_score:
            best_score = score
            best_file = file

    return best_file, best_score


In [19]:
match, score = find_best_match(entities)
print(match, score)


10.PLATO Pre-trained Dialogue Generation Model with Discrete Latent Variable-Grobid-out.txt 2


In [20]:
def find_best_match_strong(entities, grobid_path, files):
    best_file = None
    best_score = 0

    # Filter entities (keep meaningful ones)
    filtered = [
        e.lower() for e in entities
        if isinstance(e, str) and len(e.split()) > 1 and len(e) > 6
    ]

    for file in files:
        with open(os.path.join(grobid_path, file), "r", encoding="utf-8") as f:
            text = f.read().lower()

        score = 0
        for e in filtered:
            if e in text:
                score += 1

        if score > best_score:
            best_score = score
            best_file = file

    return best_file, best_score


In [21]:
match, score = find_best_match_strong(entities, grobid_path, files)
print(match, score)


260.Deal or No Deal End-to-End Learning of Negotiation Dialogues-Grobid-out.txt 5


In [22]:
from collections import defaultdict

def build_triplet_documents(df):
    docs = defaultdict(str)

    for _, row in df.iterrows():
        pid = row["paper_ID"]
        text = " ".join([
            str(row["sub"]),
            str(row["pred"]),
            str(row["obj"])
        ])
        docs[pid] += " " + text.lower()

    return docs

triplet_docs = build_triplet_documents(df_dia)


In [23]:
grobid_docs = {}
for file in files:
    with open(os.path.join(grobid_path, file), "r", encoding="utf-8") as f:
        grobid_docs[file] = f.read().lower()


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def match_paper(pid):
    trip_text = triplet_docs[pid]

    corpus = [trip_text] + list(grobid_docs.values())

    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3,5))
    tfidf = vectorizer.fit_transform(corpus)

    sims = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

    best_idx = sims.argmax()
    best_score = sims[best_idx]
    best_file = list(grobid_docs.keys())[best_idx]

    return best_file, best_score


In [30]:
match, score = match_paper(12)
print(match, score)


123.A Compare Aggregate Transformer for Understanding Document-grounded Dialogue-Grobid-out.txt 0.4338368957213103


In [26]:
mapping = {}

for pid in triplet_docs.keys():
    best_file, best_score = match_paper(pid)
    mapping[pid] = {
        "filename": best_file,
        "similarity": best_score
    }

print("Done mapping.")


Done mapping.


In [32]:
low_conf = {k: v for k, v in mapping.items() if v["similarity"] < 0.25}
print("Low confidence matches:", len(low_conf))


Low confidence matches: 16


In [33]:
def match_paper_topk(pid, k=3):
    trip_text = triplet_docs[pid]
    corpus = [trip_text] + list(grobid_docs.values())

    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5))
    tfidf = vectorizer.fit_transform(corpus)

    sims = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

    top_idx = sims.argsort()[-k:][::-1]

    results = []
    for idx in top_idx:
        results.append((list(grobid_docs.keys())[idx], sims[idx]))

    return results


In [34]:
for pid, info in mapping.items():
    if info["similarity"] < 0.25:
        print(pid, match_paper_topk(pid))
        break


2 [('544.Reading Turn by Turn Hierarchical Attention Architecture for Spoken Dialogue Comprehension-Grobid-out.txt', np.float64(0.43302192358235825)), ('652.doc2dial A Goal-Oriented Document-Grounded Dialogue Dataset-Grobid-out.txt', np.float64(0.42365534019603673)), ('429.Don’t Say That! Making Inconsistent Dialogue Unlikely with Unlikelihood Training-Grobid-out.txt', np.float64(0.41445582261615527))]


In [36]:
from sentence_transformers import SentenceTransformer, util
import torch

model = SentenceTransformer("all-MiniLM-L6-v2")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 838.69it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [37]:
# Precompute grobid embeddings
grobid_embeddings = {}
for file, text in grobid_docs.items():
    grobid_embeddings[file] = model.encode(text, convert_to_tensor=True)


In [38]:
def semantic_match(pid):
    trip_text = triplet_docs[pid]
    trip_emb = model.encode(trip_text, convert_to_tensor=True)

    scores = {}

    for file, emb in grobid_embeddings.items():
        sim = util.cos_sim(trip_emb, emb).item()
        scores[file] = sim

    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_scores[:3]

semantic_match(2)


[('652.doc2dial A Goal-Oriented Document-Grounded Dialogue Dataset-Grobid-out.txt',
  0.7339877486228943),
 ('233.Modeling Dialogue Acts with Content Word Filtering and Speaker Preferences-Grobid-out.txt',
  0.6888933777809143),
 ('123.A Compare Aggregate Transformer for Understanding Document-grounded Dialogue-Grobid-out.txt',
  0.6872419118881226)]

In [42]:
def resolve_mapping(pid):
    # First try TF-IDF
    best_file, best_score = match_paper(pid)

    if best_score >= 0.30:
        return best_file, best_score, "tfidf"

    # Otherwise use semantic matching
    top3 = semantic_match(pid)

    top_file, top_score = top3[0]
    second_score = top3[1][1]

    if top_score >= 0.60 and (top_score - second_score) >= 0.03:
        return top_file, top_score, "semantic"

    return None, None, "manual"


In [43]:
final_mapping = {}

for pid in triplet_docs.keys():
    file, score, method = resolve_mapping(pid)
    final_mapping[pid] = {
        "filename": file,
        "score": score,
        "method": method
    }

manual_cases = {k:v for k,v in final_mapping.items() if v["method"] == "manual"}
print("Manual review needed:", len(manual_cases))


Manual review needed: 12
