In [1]:
from glob import glob
import json
import pandas as pd
import os

In [2]:
## Locate All Mapping JSON Files

MAPPING_FOLDER = "../mapped_json/"
# glob yields matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of everything built from this list stable across re-runs.
mapping_files = sorted(glob(os.path.join(MAPPING_FOLDER, "mapping_*.json")))
print(f"Found {len(mapping_files)} mapping files.")

Found 17 mapping files.


In [3]:
# clean_titles = []
#
# for path in mapping_files:
#     with open(path, 'r') as f:
#         data = json.load(f)
#
#     for entry in data.values():
#         title = entry.get("title")
#         score = entry.get("score", 0)
#
#         # Keep only valid mappings
#         if (
#             isinstance(title, str)
#             and len(title) > 15
#             and score >= 0.5
#         ):
#             clean_titles.append(title)
#
# print("Clean titles:", len(set(clean_titles)))

In [4]:
## Load and merge all mappings
# Mappings scoring below this threshold are discarded.
MIN_MAPPING_SCORE = 0.5

all_papers = []
for file in mapping_files:
    filename = os.path.basename(file)

    ## Extract domain + split from the filename.
    ## Expected format: mapping_<domain>_<split>.json
    parts = os.path.splitext(filename)[0].split("_")

    # Require exactly three parts: the unpacking below would raise ValueError
    # on a name with extra underscore-separated fields.
    if len(parts) != 3:
        print("Skipping file: ", filename)
        continue

    _, domain, split = parts
    domain_upper = domain.upper()
    split_upper = split.upper()

    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for local_paper_id, info in data.items():
        # Keys that cannot be parsed as integer ids are skipped.
        try:
            local_paper_id = int(local_paper_id)
        except ValueError:
            continue

        ## Drop unresolved papers
        if info["filename"] is None:
            continue

        ## Drop manually mapped entries
        if info["method"] == "manual":
            continue

        ## Drop low-confidence mappings; an entry without a score cannot
        ## meet the threshold, so it is skipped instead of raising TypeError.
        score = info["score"]
        if score is None or score < MIN_MAPPING_SCORE:
            continue

        all_papers.append({
            "global_paper_id": f"{split_upper}_{domain_upper}_{local_paper_id}",
            "local_paper_id": local_paper_id,
            "title": info["title"],
            "domain": domain_upper,
            "split": split_upper,
            "score": score,
            "method": info["method"]
        })

# Naming the columns explicitly keeps the frame's schema intact even when no
# entry survived the filters, so later column lookups do not raise KeyError.
papers_df = pd.DataFrame(
    all_papers,
    columns=[
        "global_paper_id",
        "local_paper_id",
        "title",
        "domain",
        "split",
        "score",
        "method",
    ],
)

print("Before title cleaning:", len(papers_df))

Before title cleaning: 2716


In [5]:
## Clean titles, then remove duplicates
def is_valid_title(title):
    """Return True when ``title`` passes every title sanity check.

    A title is rejected when it is not a string, when it has fewer than
    15 characters after surrounding whitespace is stripped, when its
    lowercased form contains any of the markers "pdf", "txt", ".md", "@"
    or "·", or when it contains no alphabetic character.
    """
    if not isinstance(title, str):
        return False

    stripped = title.strip()
    if len(stripped) < 15:
        return False

    lowered = stripped.lower()
    has_bad_marker = any(
        marker in lowered for marker in ("pdf", "txt", ".md", "@", "·")
    )
    has_letter = any(ch.isalpha() for ch in stripped)

    return has_letter and not has_bad_marker


# Keep only the rows whose title passes is_valid_title.
papers_df = papers_df.loc[papers_df["title"].apply(is_valid_title)]

print("After title cleaning:", len(papers_df))

# Build the (id, title) table that is printed as the OpenAlex query list.
# NOTE(review): duplicates are dropped on the (global_paper_id, title) pair,
# so rows that share a title but carry different ids are all kept — confirm
# whether title-level deduplication was intended.
unique_titles_df = (
    papers_df[["global_paper_id", "title"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("Unique titles to query in OpenAlex:", len(unique_titles_df))

After title cleaning: 2628
Unique titles to query in OpenAlex: 2628


In [6]:
## CHECK — Duplicate titles across different global IDs
# Count distinct global ids per title and keep the titles with more than one.
dup_titles = (
    papers_df.groupby("title")["global_paper_id"]
    .nunique()
    .loc[lambda id_counts: id_counts > 1]
)

print("Titles appearing in multiple global IDs:", len(dup_titles))

# Show the affected rows, ordered by title, only when there is something to show.
if not dup_titles.empty:
    display(
        papers_df.loc[papers_df["title"].isin(dup_titles.index)]
        .sort_values("title")
    )

Titles appearing in multiple global IDs: 91


Unnamed: 0,global_paper_id,local_paper_id,title,domain,split,score,method
2237,SKG_SA_200,200,A Challenge Dataset and Effective Models for A...,SA,SKG,0.683342,tfidf
2143,SKG_SA_11,11,A Challenge Dataset and Effective Models for A...,SA,SKG,0.660404,tfidf
677,SKG_MT_222,222,A High-Quality Multilingual Dataset for Struct...,MT,SKG,0.588162,tfidf
900,SKG_MT_544,544,A High-Quality Multilingual Dataset for Struct...,MT,SKG,0.598976,tfidf
2568,SKG_SUM_13,13,A Transformer-based Approach for Source Code S...,SUM,SKG,0.503664,tfidf
...,...,...,...,...,...,...,...
1476,SKG_MT_1422,1422,Unsupervised Question Answering by Cloze Trans...,MT,SKG,0.738462,tfidf
1778,SKG_QA_60,60,Using Paraphrasing and Memory-Augmented Models...,QA,SKG,0.644838,tfidf
90,SKG_DIA_32,32,Using Paraphrasing and Memory-Augmented Models...,DIA,SKG,0.668987,tfidf
1386,SKG_MT_1276,1276,Using Syntactic Head Information in Hierarchic...,MT,SKG,0.873173,tfidf


In [7]:
## Save
# Create the output folder if it is missing so the writes below cannot fail
# on a fresh checkout.
os.makedirs("../outputs", exist_ok=True)
papers_df.to_csv("../outputs/filtered_papers.csv", index=False)
unique_titles_df.to_csv("../outputs/unique_papers.csv", index=False)

# "\n" (not "n\") gives the intended blank line; the file names listed match
# the files actually written above.
print("\nSaved: ")
print("- filtered_papers.csv")
print("- unique_papers.csv")

n\Saved: 
- filtered_papers.csv
- unique_titles.csv


In [21]:
# Read the saved file back and show its first five rows.
pd.read_csv("../outputs/filtered_papers.csv").head(5)

Unnamed: 0,global_paper_id,local_paper_id,title,domain,split,score,method
0,NOVEL_DIA_0,0,MRF-Chat Improving Dialogue with Markov Random...,DIA,NOVEL,0.606481,tfidf
1,NOVEL_DIA_1,1,Towards Making the Most of Dialogue Characteri...,DIA,NOVEL,0.66398,tfidf
2,NOVEL_DIA_2,2,Domain-Adaptive Pretraining Methods for Dialog...,DIA,NOVEL,0.668112,tfidf
3,NOVEL_DIA_3,3,Adaptive Bridge between Training and Inference...,DIA,NOVEL,0.636847,tfidf
4,NOVEL_DIA_4,4,Controlling Dialogue Generation with Semantic ...,DIA,NOVEL,0.811583,tfidf


In [22]:
# Read the saved file back and show its first five rows.
pd.read_csv("../outputs/unique_papers.csv").head(5)

Unnamed: 0,global_paper_id,title
0,NOVEL_DIA_0,MRF-Chat Improving Dialogue with Markov Random...
1,NOVEL_DIA_1,Towards Making the Most of Dialogue Characteri...
2,NOVEL_DIA_2,Domain-Adaptive Pretraining Methods for Dialog...
3,NOVEL_DIA_3,Adaptive Bridge between Training and Inference...
4,NOVEL_DIA_4,Controlling Dialogue Generation with Semantic ...


In [10]:
# Row counts of the filtered table and of the (id, title) table built from it.
print("Total filtered papers:", papers_df.shape[0])
print("Total unique papers:", unique_titles_df.shape[0])

Total filtered papers: 2628
Total unique papers: 2628


In [11]:
# Report the dtype and the missing-value count of the local id column.
print(papers_df.dtypes["local_paper_id"])
print(papers_df["local_paper_id"].isna().sum())

int64
0


In [13]:
# Count rows whose global id repeats the id of an earlier row.
print("Duplicate global IDs:", papers_df.duplicated(subset="global_paper_id").sum())

Duplicate global IDs: 0


In [15]:
# First five global ids, shown with their original row labels.
papers_df["global_paper_id"].iloc[:5]

8     NOVEL_DIA_0
9     NOVEL_DIA_1
10    NOVEL_DIA_2
11    NOVEL_DIA_3
12    NOVEL_DIA_4
Name: global_paper_id, dtype: object

In [16]:
# Shortest title length, and how many titles are shorter than 15 characters.
print("Min title length:", papers_df["title"].str.len().min())

print("Any titles under 15 chars:", papers_df["title"].str.len().lt(15).sum())

Min title length: 18
Any titles under 15 chars: 0


In [17]:
# Number of distinct global ids attached to each title.
dup_titles = papers_df.groupby("title")["global_paper_id"].nunique()

# Titles that map to more than one global id.
print("Titles reused across papers:", dup_titles.gt(1).sum())

Titles reused across papers: 91


In [18]:
# First five rows of the (id, title) table.
unique_titles_df.head(5)

Unnamed: 0,global_paper_id,title
0,NOVEL_DIA_0,MRF-Chat Improving Dialogue with Markov Random...
1,NOVEL_DIA_1,Towards Making the Most of Dialogue Characteri...
2,NOVEL_DIA_2,Domain-Adaptive Pretraining Methods for Dialog...
3,NOVEL_DIA_3,Adaptive Bridge between Training and Inference...
4,NOVEL_DIA_4,Controlling Dialogue Generation with Semantic ...


In [19]:
# Count titles that are empty once surrounding whitespace is removed.
print("Empty titles:", papers_df["title"].str.strip().eq("").sum())

Empty titles: 0


In [20]:
# Structural invariants of the filtered table: unique global ids, int64 local
# ids, no missing titles, and no title shorter than 15 characters.
assert not papers_df["global_paper_id"].duplicated().any()
assert papers_df["local_paper_id"].dtype == "int64"
assert papers_df["title"].notna().all()
assert not (papers_df["title"].str.len() < 15).any()

print("All structural checks passed.")

All structural checks passed.
