In [1]:
import pandas as pd
import requests
import time
import json
import difflib

In [2]:
## Config
# Optional contact e-mail for OpenAlex requests (currently disabled and unused):
# OPENALEX_EMAIL = "you@example.com"

# Endpoint that every title search is sent to.
BASE_URL = "https://api.openalex.org/works"

# Minimum difflib similarity ratio for a search hit to count as a match.
SIMILARITY_THRESHOLD = 0.8

# Pause between consecutive requests, in seconds, to avoid rate limits.
SLEEP_TIME = 2.5

In [3]:
## Load titles
# Read the paper list and keep only the rows that actually carry a title;
# rows with a missing title cannot be searched for.
titles_df = (
    pd.read_csv('../outputs/unique_papers.csv')
    .dropna(subset=["title"])
)
print("Total papers to query:", len(titles_df))

Total papers to query: 2628


In [28]:
# ## Title Validation
# def is_valid_title(title):
#     if not isinstance(title, str):
#         return False
#
#     title = title.strip()
#
#     if len(title) < 15:
#         return False
#
#     bad_keywords = ["pdf", "txt", "md", "@", "·"]
#     if any(k in title.lower() for k in bad_keywords):
#         return False
#
#     if not any(c.isalpha() for c in title):
#         return False
#
#     return True
#
#
# filtered_titles = [t for t in titles if is_valid_title(t)]
#
# print("Valid titles:", len(filtered_titles))

In [9]:
# print("Before:", len(titles))
# print("After:", len(filtered_titles))

In [4]:
# ABSTRACT RECONSTRUCTION
def reconstruct_abstract(inv_index):
    """Rebuild abstract text from an inverted index.

    Parameters
    ----------
    inv_index : dict or None
        Mapping from each word to the list of integer positions at which the
        word occurs in the abstract.

    Returns
    -------
    str or None
        The words placed at their positions and joined with single spaces.
        Positions that no word claims are left as empty strings, so gaps in
        the index show up as consecutive spaces. Returns None when
        `inv_index` is None/empty or when it contains no positions at all.
    """
    if not inv_index:
        return None

    # `default=-1` covers an index whose position lists are all empty;
    # a plain max() over the empty generator would raise ValueError.
    max_pos = max(
        (pos for positions in inv_index.values() for pos in positions),
        default=-1,
    )
    if max_pos < 0:
        return None

    abstract_words = [""] * (max_pos + 1)

    for word, positions in inv_index.items():
        for pos in positions:
            abstract_words[pos] = word

    return " ".join(abstract_words)

In [6]:
## Query OpenAlex
import os

checkpoint_path = "../outputs/openalex_metadata_partial.csv"

# Back-off policy for HTTP 429 responses and checkpoint cadence.
RATE_LIMIT_SLEEP = 60         # seconds to wait after a 429 before retrying
MAX_RATE_LIMIT_RETRIES = 5    # attempts per paper before it is recorded as missed
CHECKPOINT_EVERY = 50         # write the checkpoint after this many new matches


def _search_openalex(title):
    """Search OpenAlex for `title` and return the list of result records.

    The same query is retried after a back-off whenever the API answers with
    HTTP 429, so a rate-limited paper is not silently skipped.

    Returns
    -------
    list or None
        The "results" list of the JSON response (possibly empty), or None
        when the request raised, the status was not 200, the body was not
        valid JSON, or the rate limit persisted for MAX_RATE_LIMIT_RETRIES
        attempts.
    """
    params = {
        "search": title,
        "per_page": 1,
    }

    for _attempt in range(MAX_RATE_LIMIT_RETRIES):
        try:
            response = requests.get(BASE_URL, params=params, timeout=15)
        except Exception as e:
            # Best effort: one failing request must not abort the long-running loop.
            print("Request failed:", e)
            return None

        if response.status_code == 429:
            print(f"Rate limited. Sleeping {RATE_LIMIT_SLEEP} seconds...")
            time.sleep(RATE_LIMIT_SLEEP)
            continue  # retry the SAME paper instead of dropping it

        if response.status_code != 200:
            print("Bad status:", response.status_code)
            return None

        try:
            data = response.json()
        except Exception as e:
            print("JSON parse failed:", e)
            return None

        return data.get("results", [])

    print("Still rate limited after", MAX_RATE_LIMIT_RETRIES, "attempts. Giving up on this paper.")
    return None


def _save_checkpoint():
    """Write every record collected so far to the checkpoint CSV."""
    pd.DataFrame(metadata_store).to_csv(checkpoint_path, index=False)
    print("Checkpoint saved.")


# Resume from an earlier run when a checkpoint exists.
# NOTE(review): records restored from the CSV come back with CSV-inferred
# types (e.g. `referenced_works` presumably as a string rather than a list),
# while records added below hold native Python objects -- confirm that
# downstream consumers tolerate the mix.
if os.path.exists(checkpoint_path):
    existing_df = pd.read_csv(checkpoint_path)
    processed_ids = set(existing_df["global_paper_id"])
    metadata_store = existing_df.to_dict("records")
    print("Resuming from checkpoint:", len(processed_ids), "already done")
else:
    processed_ids = set()
    metadata_store = []

missed_titles = []          # global_paper_id of papers with no usable response
low_similarity_titles = []  # (global_paper_id, best_score) below the threshold

total_papers = len(titles_df)
unsaved_matches = 0         # matches appended since the last checkpoint write

# enumerate() gives a true 1-based position; the DataFrame index label can
# have gaps after dropna() and is therefore not a reliable progress counter.
for position, (_, row) in enumerate(titles_df.iterrows(), start=1):

    global_id = row["global_paper_id"]
    title = row["title"]

    # Skip already processed
    if global_id in processed_ids:
        continue

    print(f"[{position}/{total_papers}] Processing {global_id}")

    results = _search_openalex(title)

    # None (request problem) and [] (no hits) are both recorded as missed.
    if not results:
        missed_titles.append(global_id)
        time.sleep(SLEEP_TIME)
        continue

    best_match = None
    best_score = 0
    title_lower = title.lower()

    for r in results:
        # The key can be present with a null value, so `.get("title", "")`
        # is not enough to guarantee a string here.
        oa_title = r.get("title") or ""
        score = difflib.SequenceMatcher(
            None,
            title_lower,
            oa_title.lower()
        ).ratio()

        if score > best_score:
            best_score = score
            best_match = r

    if best_score >= SIMILARITY_THRESHOLD and best_match:

        metadata_store.append({
            "global_paper_id": global_id,
            "openalex_id": best_match["id"],
            "year": best_match.get("publication_year"),
            "cited_by_count": best_match.get("cited_by_count"),
            "referenced_works": best_match.get("referenced_works", []),
            "abstract": reconstruct_abstract(
                best_match.get("abstract_inverted_index")
            ),
            "match_score": best_score
        })
        # Guards against the same id appearing twice in titles_df within one run.
        processed_ids.add(global_id)
        unsaved_matches += 1

        print("✔ Matched")

        # Checkpoint only when new records exist. Testing
        # `len(metadata_store) % 50 == 0` on every iteration re-saved on each
        # non-matching paper and, at length 0, wrote an empty CSV that cannot
        # be read back on resume.
        if unsaved_matches >= CHECKPOINT_EVERY:
            _save_checkpoint()
            unsaved_matches = 0

    else:
        print("⚠ Low similarity:", best_score)
        low_similarity_titles.append((global_id, best_score))

    time.sleep(SLEEP_TIME)

# Flush the matches collected since the last periodic checkpoint.
if unsaved_matches:
    _save_checkpoint()
    unsaved_matches = 0

Resuming from checkpoint: 1312 already done
[221/2628] Processing SKG_DIA_249
Rate limited. Sleeping 60 seconds...
[276/2628] Processing NOVEL_MT_3
Rate limited. Sleeping 60 seconds...
[282/2628] Processing NOVEL_MT_12
Rate limited. Sleeping 60 seconds...
[288/2628] Processing NOVEL_MT_22
Rate limited. Sleeping 60 seconds...
[294/2628] Processing NOVEL_MT_30
Rate limited. Sleeping 60 seconds...
[296/2628] Processing NOVEL_MT_33
Rate limited. Sleeping 60 seconds...
[297/2628] Processing NOVEL_MT_34
Rate limited. Sleeping 60 seconds...
[299/2628] Processing NOVEL_MT_36
Rate limited. Sleeping 60 seconds...
[307/2628] Processing NOVEL_MT_51
Rate limited. Sleeping 60 seconds...


KeyboardInterrupt: 

In [31]:
# Persist whatever has been collected so far to the partial-results CSV.
metadata_df = pd.DataFrame(data=metadata_store)
metadata_df.to_csv(
    path_or_buf="../outputs/openalex_metadata_partial.csv",
    index=False,
)
print("Partial save complete.")

Partial save complete.


In [8]:
## SAVE JSON METADATA

# Dump every collected record as indented JSON (UTF-8 encoded file).
json_path = "../outputs/openalex_metadata_full.json"
with open(json_path, mode="w", encoding="utf-8") as out_file:
    json.dump(metadata_store, out_file, indent=2)

print("Saved openalex_metadata_full.json")

Saved openalex_metadata.json


In [12]:
## Saving csv summary
# Tabulate the collected records and write them out without the index column.
full_csv_path = "../outputs/openalex_metadata_full.csv"
metadata_df = pd.DataFrame(metadata_store)
metadata_df.to_csv(full_csv_path, index=False)

print("Saved openalex_metadata_full.csv")

Saved openalex_metadata.csv


In [13]:
 # SAVE MATCH REPORT
# Summarise the query run and write the unmatched papers to diagnostic CSVs.
total_titles = len(titles_df)
matched_count = len(metadata_df)

# Guard the ratio: an empty input table would otherwise raise ZeroDivisionError.
match_rate = round(matched_count / total_titles, 3) if total_titles else 0.0

print("\n==== MATCH REPORT ====")
print("Total valid titles:", total_titles)
print("Successful matches:", matched_count)
print("Missed titles:", len(missed_titles))
print("Low similarity titles:", len(low_similarity_titles))
print("Match rate:", match_rate)


# NOTE(review): the query loop fills `missed_titles` and
# `low_similarity_titles` with `global_paper_id` values, so the "title"
# column in both files actually holds paper ids. The header is kept as-is so
# the file schema does not change for existing readers.
pd.DataFrame(missed_titles, columns=["title"]).to_csv(
    "../outputs/openalex_missed_titles.csv", index=False
)

pd.DataFrame(low_similarity_titles,
             columns=["title", "similarity_score"]).to_csv(
    "../outputs/openalex_low_similarity.csv", index=False
)

print("Saved diagnostic files.")


==== MATCH REPORT ====
Total valid titles: 274
Successful matches: 273
Missed titles: 1
Low similarity titles: 0
Match rate: 0.996
Saved diagnostic files.


In [14]:
# Read the exported CSV back and show its first five rows as a sanity check.
pd.read_csv('../outputs/openalex_metadata_full.csv').head(n=5)

Unnamed: 0,title,openalex_id,publication_year,cited_by_count,referenced_works,abstract,match_score
0,MRF-Chat Improving Dialogue with Markov Random...,https://openalex.org/W3214342458,2021,0,"['https://openalex.org/W242376439', 'https://o...",Recent state-of-the-art approaches in open-dom...,0.990654
1,Towards Making the Most of Dialogue Characteri...,https://openalex.org/W3196896228,2021,12,"['https://openalex.org/W222053410', 'https://o...",Neural Chat Translation (NCT) aims to translat...,1.000000
2,Domain-Adaptive Pretraining Methods for Dialog...,https://openalex.org/W3173606101,2021,19,"['https://openalex.org/W1522301498', 'https://...","Han Wu, Kun Xu, Linfeng Song, Lifeng Jin, Hais...",1.000000
3,Adaptive Bridge between Training and Inference...,https://openalex.org/W3214623240,2021,5,"['https://openalex.org/W648786980', 'https://o...",Although exposure bias has been widely studied...,1.000000
4,Controlling Dialogue Generation with Semantic ...,https://openalex.org/W3074476581,2021,6,"['https://openalex.org/W2037789405', 'https://...",Dialogue systems pretrained with large languag...,1.000000
...,...,...,...,...,...,...,...
268,Improving Limited Labeled Dialogue State Track...,https://openalex.org/W3105480731,2020,11,"['https://openalex.org/W1522301498', 'https://...",Existing dialogue state tracking (DST) models ...,1.000000
269,ScoutBot A Dialogue System for Collaborative N...,https://openalex.org/W2803503442,2018,22,"['https://openalex.org/W175385064', 'https://o...","Stephanie M. Lukin, Felix Gervits, Cory J. Hay...",0.990991
270,Bridging the Gap between Prior and Posterior K...,https://openalex.org/W3104123491,2020,78,"['https://openalex.org/W1522301498', 'https://...",Knowledge selection plays an important role in...,1.000000
271,Evaluating Coherence in Dialogue Systems using...,https://openalex.org/W2929767294,2019,17,"['https://openalex.org/W119047706', 'https://o...",Evaluating open-domain dialogue systems is dif...,1.000000


In [None]:
# Every paper must appear at most once in the collected metadata.
paper_ids = metadata_df["global_paper_id"]
duplicate_count = paper_ids.duplicated().sum()
print("Duplicate global IDs:", duplicate_count)

assert paper_ids.is_unique