In [1]:
import os
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer, util
import torch


  from .autonotebook import tqdm as notebook_tqdm


Load the sentence-embedding model exactly once, so every later cell reuses the same instance.

In [2]:
# Shared sentence-embedding model. semantic_match and run_mapping_for_domain
# read this module-level name, so this cell must run before they are called.
model = SentenceTransformer("all-MiniLM-L6-v2")


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 830.23it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [3]:
def extract_title(filename):
    """Derive a title string from a Grobid output filename.

    The "-Grobid-out.txt" marker is removed, then everything up to and
    including the first remaining "." is dropped as a prefix, and the result
    is stripped of surrounding whitespace.

    Args:
        filename: File name, e.g. "12.Some Title-Grobid-out.txt".

    Returns:
        The cleaned title. A name without any "." prefix is returned with only
        the marker removed (instead of raising IndexError or collapsing to the
        bare "txt" extension).
    """
    # Strip the marker first: it contains a "." of its own, which must not be
    # mistaken for the prefix separator when the name has no prefix.
    stem = filename.replace("-Grobid-out.txt", "")

    _prefix, separator, remainder = stem.partition(".")
    title = remainder if separator else stem
    return title.strip()

def build_triplet_documents(df):
    """Concatenate every triplet of a paper into one lowercase text document.

    Args:
        df: DataFrame with the columns "paper_ID", "sub", "pred" and "obj".

    Returns:
        A ``defaultdict(str)`` mapping each paper ID to its document. Each row
        contributes " <sub> <pred> <obj>" in lowercase (note the leading
        space, kept so documents built from complete rows are unchanged).
        Missing cells are skipped rather than being rendered as the literal
        word "nan", which would otherwise pollute the matching vocabulary.
        A paper whose rows are all empty still gets an (empty) entry.
    """
    fragments_by_paper = defaultdict(list)

    # Iterating the columns directly avoids the per-row Series that iterrows()
    # builds.
    columns = (df["paper_ID"], df["sub"], df["pred"], df["obj"])
    for pid, sub, pred, obj in zip(*columns):
        fragments = fragments_by_paper[pid]  # registers the paper even if the row is empty
        words = [str(value) for value in (sub, pred, obj) if not pd.isna(value)]
        if words:
            fragments.append(" ".join(words).lower())

    docs = defaultdict(str)
    for pid, fragments in fragments_by_paper.items():
        # Single join per paper instead of repeated string concatenation.
        docs[pid] = "".join(" " + fragment for fragment in fragments)

    return docs

def load_grobid_documents(folder_path):
    """Read every regular file in a folder into a lowercase string.

    Args:
        folder_path: Directory containing the Grobid text outputs.

    Returns:
        Dict mapping file name (not full path) to the file's lowercased
        UTF-8 content, inserted in sorted file-name order.
    """
    docs = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the dict
    # order (and therefore any downstream tie-breaking) reproducible.
    for file in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, file)

        # Skip sub-directories such as ".ipynb_checkpoints": open() on a
        # directory raises instead of returning text.
        if not os.path.isfile(path):
            continue

        with open(path, "r", encoding="utf-8") as f:
            docs[file] = f.read().lower()

    return docs

def tfidf_match(pid, triplet_docs, grobid_docs):
    """Find the Grobid document most similar to a paper's triplet text by TF-IDF.

    A fresh TfidfVectorizer (English stop words) is fitted on the triplet
    document plus all Grobid documents on every call, and the triplet row is
    compared to each Grobid row with cosine similarity.

    Args:
        pid: Key into ``triplet_docs``.
        triplet_docs: Mapping of paper ID to triplet text.
        grobid_docs: Mapping of file name to document text.

    Returns:
        ``(best_file, best_score)`` for the highest-scoring Grobid document.
    """
    filenames = list(grobid_docs)
    # Row 0 is the query; rows 1.. follow the iteration order of grobid_docs.
    corpus = [triplet_docs[pid], *grobid_docs.values()]

    matrix = TfidfVectorizer(stop_words="english").fit_transform(corpus)
    query_row, candidate_rows = matrix[:1], matrix[1:]
    sims = cosine_similarity(query_row, candidate_rows).flatten()

    winner = sims.argmax()
    return filenames[winner], sims[winner]

def semantic_match(pid, triplet_docs, grobid_embeddings):
    """Rank Grobid documents by embedding similarity to a paper's triplet text.

    The triplet text is encoded with the module-level ``model`` and compared
    against each precomputed Grobid embedding using ``util.cos_sim``.

    Args:
        pid: Key into ``triplet_docs``.
        triplet_docs: Mapping of paper ID to triplet text.
        grobid_embeddings: Mapping of file name to its embedding tensor.

    Returns:
        Up to three ``(file, similarity)`` pairs, highest similarity first.
    """
    query_emb = model.encode(triplet_docs[pid], convert_to_tensor=True)

    ranked = [
        (name, util.cos_sim(query_emb, doc_emb).item())
        for name, doc_emb in grobid_embeddings.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:3]




In [4]:
def run_mapping_for_domain(
    triplet_path,
    grobid_folder,
    tfidf_threshold=0.30,
    semantic_threshold=0.60,
    semantic_margin=0.02,
):
    """Map every paper ID in a triplet CSV to a Grobid file of one domain.

    For each paper, TF-IDF matching is tried first; if its best score is below
    ``tfidf_threshold`` the embedding-based match is used instead. A semantic
    match is accepted only when it is both strong enough and clearly ahead of
    the runner-up; otherwise the paper is flagged for manual mapping.

    Args:
        triplet_path: CSV file read with ``pd.read_csv`` and passed to
            ``build_triplet_documents``.
        grobid_folder: Folder passed to ``load_grobid_documents``.
        tfidf_threshold: Minimum TF-IDF cosine score to accept the TF-IDF match.
        semantic_threshold: Minimum embedding cosine score to accept the
            semantic match.
        semantic_margin: Minimum gap between the best and second-best
            semantic scores.

    Returns:
        Dict mapping paper ID to a dict with the keys "filename", "title",
        "score" and "method" ("tfidf", "semantic" or "manual"). For "manual"
        entries "filename" and "title" are None and "score" is the best
        semantic score.

    Raises:
        ValueError: If there are papers to map but the Grobid folder yielded
            no documents.
    """
    print("Loading triplets...")
    df = pd.read_csv(triplet_path)

    triplet_docs = build_triplet_documents(df)

    print("Loading grobid files...")
    grobid_docs = load_grobid_documents(grobid_folder)

    # Fail early with a clear message instead of letting the matching step
    # fail later on an empty candidate set.
    if triplet_docs and not grobid_docs:
        raise ValueError(f"No grobid documents found in {grobid_folder!r}")

    print("Computing grobid embeddings (once)...")
    grobid_embeddings = {
        file: model.encode(text, convert_to_tensor=True)
        for file, text in grobid_docs.items()
    }

    final_mapping = {}

    for pid in tqdm(triplet_docs.keys()):

        # Try TF-IDF first
        best_file, best_score = tfidf_match(pid, triplet_docs, grobid_docs)

        if best_score >= tfidf_threshold:
            final_mapping[pid] = {
                "filename": best_file,
                "title": extract_title(best_file),
                # Plain float keeps the mapping free of numpy scalar types.
                "score": float(best_score),
                "method": "tfidf"
            }
            continue

        # Otherwise use semantic matching
        top3 = semantic_match(pid, triplet_docs, grobid_embeddings)

        top_file, top_score = top3[0]
        # With a single candidate there is no runner-up, so the margin check
        # is trivially satisfied (previously this raised IndexError).
        second_score = top3[1][1] if len(top3) > 1 else float("-inf")

        is_confident = (
            top_score >= semantic_threshold
            and (top_score - second_score) >= semantic_margin
        )

        if is_confident:
            final_mapping[pid] = {
                "filename": top_file,
                "title": extract_title(top_file),
                "score": top_score,
                "method": "semantic"
            }
        else:
            final_mapping[pid] = {
                "filename": None,
                "title": None,
                "score": top_score,
                "method": "manual"
            }

    return final_mapping


In [5]:

# --- SKG corpora (a.k.a. the historical domains) ---
mapping_dia_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/Dia_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/Dia",
)
mapping_mt_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/MT_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/MT",
)
mapping_nli_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/NLI_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/NLI",
)
mapping_par_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/PAR_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/PAR",
)
mapping_qa_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/QA_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/QA",
)
mapping_sa_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/SA_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/SA",
)
mapping_sum_skg = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/SKG/SUM_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/SKG_Papers/SUM",
)

# --- Blogs ---
mapping_dia_blog = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Blogs/Dia_Blogs_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Blogs/Dia",
)
mapping_mt_blog = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Blogs/MT_Blogs_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Blogs/MT",
)
mapping_qa_blog = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Blogs/QA_Blogs_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Blogs/QA",
)
mapping_sa_blog = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Blogs/SA_Blogs_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Blogs/SA",
)
mapping_sum_blog = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Blogs/SUM_Blogs_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Blogs/SUM",
)

# --- Novel papers ---
mapping_dia_novel = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Novel_Papers/Dia2021_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Novel_Papers/Dia2021",
)
mapping_mt_novel = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Novel_Papers/MT2021_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Novel_Papers/MT2021",
)
mapping_qa_novel = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Novel_Papers/QA2021_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Novel_Papers/QA2021",
)
mapping_sa_novel = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Novel_Papers/SA2021_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Novel_Papers/SA2021",
)
mapping_sum_novel = run_mapping_for_domain(
    triplet_path="Scientific_Novelty_Detection/Triplets/Novel_Papers/SUM2021_triplets.csv",
    grobid_folder="Scientific_Novelty_Detection/grobid_files/Novel_Papers/SUM2021",
)

Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 304/304 [02:46<00:00,  1.83it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 1424/1424 [54:03<00:00,  2.28s/it]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 20/20 [00:00<00:00, 20.83it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 222/222 [01:23<00:00,  2.67it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 481/481 [06:43<00:00,  1.19it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 613/613 [09:04<00:00,  1.13it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 216/216 [01:26<00:00,  2.50it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 11/11 [00:00<00:00, 61.44it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 12/12 [00:00<00:00, 83.18it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 20/20 [00:00<00:00, 66.47it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 16/16 [00:00<00:00, 79.78it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 25/25 [00:00<00:00, 68.99it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 71/71 [00:09<00:00,  7.38it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 299/299 [02:30<00:00,  1.98it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 86/86 [00:14<00:00,  6.06it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 70/70 [00:09<00:00,  7.37it/s]


Loading triplets...
Loading grobid files...
Computing grobid embeddings (once)...


100%|██████████| 75/75 [00:10<00:00,  6.83it/s]


In [6]:
import json

# Destination folder for the exported mappings (relative to the working directory).
OUTPUT_DIR = "../mapped_json"

# Output file stem -> mapping produced by the domain runs above.
# Each entry is written to "<OUTPUT_DIR>/<stem>.json".
mappings_to_save = {
    "mapping_dia_skg": mapping_dia_skg,
    "mapping_mt_skg": mapping_mt_skg,
    "mapping_nli_skg": mapping_nli_skg,
    "mapping_par_skg": mapping_par_skg,
    "mapping_qa_skg": mapping_qa_skg,
    "mapping_sa_skg": mapping_sa_skg,
    "mapping_sum_skg": mapping_sum_skg,
    "mapping_dia_blog": mapping_dia_blog,
    "mapping_mt_blog": mapping_mt_blog,
    "mapping_qa_blog": mapping_qa_blog,
    "mapping_sa_blog": mapping_sa_blog,
    "mapping_sum_blog": mapping_sum_blog,
    "mapping_dia_novel": mapping_dia_novel,
    "mapping_mt_novel": mapping_mt_novel,
    "mapping_qa_novel": mapping_qa_novel,
    "mapping_sa_novel": mapping_sa_novel,
    "mapping_sum_novel": mapping_sum_novel,
}

# Create the folder up front: without it the first open() fails and none of
# the computed mappings get written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for stem, mapping in mappings_to_save.items():
    with open(f"{OUTPUT_DIR}/{stem}.json", "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)

