In [2]:
import pandas as pd
import requests
import time
import difflib
import json

In [10]:
BASE_URL = "https://api.openalex.org/works"
SIMILARITY_THRESHOLD = 0.80
SLEEP_TIME = 2.0

In [11]:
# Load full titles
titles_df = pd.read_csv("../outputs/unique_papers.csv")

# Load existing metadata
existing_df = pd.read_csv("../outputs/openalex_metadata_full.csv")
processed_ids = set(existing_df["global_paper_id"])

print("Already processed:", len(processed_ids))

Already processed: 2529


In [12]:
# Find missing ones
remaining_df = titles_df[
    ~titles_df["global_paper_id"].isin(processed_ids)
].reset_index(drop=True)

print("Remaining to retry:", len(remaining_df))

Remaining to retry: 99


In [13]:
new_records = []

def reconstruct_abstract(inv_index):
    if not inv_index:
        return None

    max_pos = max(pos for positions in inv_index.values() for pos in positions)
    words = [""] * (max_pos + 1)

    for word, positions in inv_index.items():
        for pos in positions:
            words[pos] = word

    return " ".join(words)

for i, row in remaining_df.iterrows():

    global_id = row["global_paper_id"]
    title = row["title"]

    print(f"[{i}/{len(remaining_df)}] Retrying {global_id}")

    params = {
        "search": title,
        "per_page": 1,
    }

    try:
        response = requests.get(BASE_URL, params=params, timeout=15)
    except Exception as e:
        print("Request failed:", e)
        continue

    if response.status_code != 200:
        print("Bad status:", response.status_code)
        time.sleep(60)
        continue

    results = response.json().get("results", [])

    if not results:
        continue

    best_match = None
    best_score = 0

    for r in results:
        oa_title = r.get("title", "")
        score = difflib.SequenceMatcher(
            None,
            title.lower(),
            oa_title.lower()
        ).ratio()

        if score > best_score:
            best_score = score
            best_match = r

    if best_score >= SIMILARITY_THRESHOLD and best_match:

        new_records.append({
            "global_paper_id": global_id,
            "openalex_id": best_match["id"],
            "year": best_match.get("publication_year"),
            "cited_by_count": best_match.get("cited_by_count"),
            "referenced_works": best_match.get("referenced_works", []),
            "abstract": reconstruct_abstract(
                best_match.get("abstract_inverted_index")
            ),
            "match_score": best_score
        })

        print("âœ” Matched")

    else:
        print("Low similarity:", best_score)

    time.sleep(SLEEP_TIME)

[0/99] Retrying SKG_DIA_249
[1/99] Retrying NOVEL_MT_3
[2/99] Retrying NOVEL_MT_12
[3/99] Retrying NOVEL_MT_22
[4/99] Retrying NOVEL_MT_30
Low similarity: 0.40930232558139534
[5/99] Retrying NOVEL_MT_33
[6/99] Retrying NOVEL_MT_34
Low similarity: 0.16923076923076924
[7/99] Retrying NOVEL_MT_36
Low similarity: 0.2875
[8/99] Retrying NOVEL_MT_51
Low similarity: 0.6761363636363636
[9/99] Retrying NOVEL_MT_55
[10/99] Retrying NOVEL_MT_58
Low similarity: 0.27450980392156865
[11/99] Retrying NOVEL_MT_59
[12/99] Retrying NOVEL_MT_68
[13/99] Retrying NOVEL_MT_69
Low similarity: 0.4090909090909091
[14/99] Retrying NOVEL_MT_72
Low similarity: 0.3469387755102041
[15/99] Retrying NOVEL_MT_73
Low similarity: 0.3417085427135678
[16/99] Retrying NOVEL_MT_79
[17/99] Retrying NOVEL_MT_80
Low similarity: 0.33613445378151263
[18/99] Retrying NOVEL_MT_94
Low similarity: 0.2777777777777778
[19/99] Retrying NOVEL_MT_97
[20/99] Retrying NOVEL_MT_98
[21/99] Retrying NOVEL_MT_102
Low similarity: 0.36
[22/99] R

In [14]:

# Merge new matches
if new_records:
    new_df = pd.DataFrame(new_records)
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    combined_df = combined_df.drop_duplicates("global_paper_id")
    combined_df.to_csv("../outputs/openalex_metadata_full.csv", index=False)

    print("Saved updated metadata file.")
else:
    print("No new matches found.")

No new matches found.
