In [2]:
!pip install pyalex networkx ijson

Collecting pyalex
  Downloading pyalex-0.16-py3-none-any.whl.metadata (14 kB)
Collecting ijson
  Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading pyalex-0.16-py3-none-any.whl (11 kB)
Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson, pyalex
Successfully installed ijson-3.3.0 pyalex-0.16


In [9]:
import pyalex
import networkx as nx
import matplotlib.pyplot as plt
import json
import os
from pyalex import Works
from tqdm import tqdm

# Set OpenAlex API email
# NOTE(review): "email here" is a placeholder and must be replaced with a real
# contact address before running. Presumably pyalex forwards it to OpenAlex to
# identify the caller -- confirm against the pyalex documentation.
pyalex.config.email = "email here"

# Paths to existing dataset files
# Inputs: load_local_data() only reads from these two absolute paths.
SEED_WORKS_PATH = "/kaggle/input/social-resilience-data/seed_works.json"
REFERENCE_METADATA_PATH = "/kaggle/input/social-resilience-data/reference_metadata.json"
# Outputs: save_local_data() writes to these relative paths (resolved against
# the current working directory). They differ from the input paths above, so a
# later run does not read back what this run saved unless the files are moved.
local_seed_path = 'seed_works.json'
local_metadata_path = 'reference_metadata.json'

def load_local_data():
    """Read the cached seed works and reference metadata from disk.

    Each input file is optional: when ``SEED_WORKS_PATH`` does not exist the
    seed works default to an empty list, and when ``REFERENCE_METADATA_PATH``
    does not exist the reference metadata defaults to an empty dict.

    Returns:
        tuple: ``(seed_works, reference_metadata)`` as decoded from the JSON
        files (or the empty defaults described above).
    """
    def _read_json_or(source_path, fallback):
        # A missing file is not an error; the caller simply starts empty.
        if not os.path.exists(source_path):
            return fallback
        with open(source_path, "r") as handle:
            return json.load(handle)

    cached_seeds = _read_json_or(SEED_WORKS_PATH, [])
    cached_refs = _read_json_or(REFERENCE_METADATA_PATH, {})
    return cached_seeds, cached_refs

def save_local_data(seed_works, reference_metadata):
    """Persist the seed works and reference metadata as local JSON files.

    ``seed_works`` is converted to a list before being serialised to
    ``local_seed_path``; ``reference_metadata`` is serialised as-is to
    ``local_metadata_path``. Both files are overwritten and written with a
    four-space indent.
    """
    def _dump_pretty(target_path, make_payload):
        # The payload is produced only once the file is open, and the seed
        # file is fully written before the metadata file is touched.
        with open(target_path, "w") as handle:
            json.dump(make_payload(), handle, indent=4)

    _dump_pretty(local_seed_path, lambda: list(seed_works))
    _dump_pretty(local_metadata_path, lambda: reference_metadata)

def fetch_seed_works(query="social resilience", per_page=100, max_pages=100,
                     existing_seed_works=None):
    """Fetch seed works from OpenAlex that are not already known.

    Runs a title search filter for ``query`` through ``pyalex.Works`` and walks
    the result pages, collecting every work whose ``"id"`` has not been seen
    yet -- neither among the already-known works nor earlier in this fetch.

    Args:
        query: Text passed to ``Works().search_filter(title=...)``.
        per_page: Page size passed to ``paginate``.
        max_pages: Maximum number of result pages to read. As before, a value
            that never equals the page counter (e.g. ``None``) means no cap.
        existing_seed_works: Iterable of already-known works (dicts with an
            ``"id"`` key). When omitted, the notebook-level ``seed_works``
            variable is used, which keeps existing calls working unchanged.

    Returns:
        list: The newly discovered works, in the order they were returned,
        with each work ID appearing at most once.

    Raises:
        KeyError: If a known or fetched work has no ``"id"`` key.
        NameError: If ``existing_seed_works`` is omitted and the notebook-level
            ``seed_works`` has not been defined yet.
    """
    if existing_seed_works is None:
        # Backward-compatible default: fall back to the notebook global.
        existing_seed_works = seed_works

    seen_ids = {work["id"] for work in existing_seed_works}
    new_seed_works = []

    pager = (
        Works()
        .search_filter(title=query)
        .paginate(method="page", per_page=per_page)
    )
    page_iter = iter(pager)

    # The cap is checked *before* asking the pager for another page, so no
    # extra request is issued just to discover that the cap was reached.
    pages_read = 0
    while pages_read != max_pages:
        page = next(page_iter, None)
        if not page:
            # Pager exhausted, or the API returned an empty page.
            break

        for work in page:
            work_id = work["id"]
            if work_id not in seen_ids:
                # Remember the ID so repeats on later pages are skipped too.
                seen_ids.add(work_id)
                new_seed_works.append(work)
        pages_read += 1

    return new_seed_works

def bulk_fetch_metadata_for_ids(ids_list, batch_size=50):
    """Look up work records for a collection of IDs, one batch at a time.

    The IDs are split into consecutive chunks of ``batch_size``. Each chunk is
    joined with ``"|"`` and sent as a single ``openalex`` filter on ``Works``;
    the pages that come back are read until the pager ends or yields an empty
    page.

    Args:
        ids_list: Iterable of ID strings (it is copied into a list first).
        batch_size: Number of IDs per request; also used as the page size.

    Returns:
        dict: Maps each returned record's ``"id"`` to the record itself.
        Records without a truthy ``"id"`` are dropped.
    """
    pending_ids = list(ids_list)
    records_by_id = {}

    batch_offsets = range(0, len(pending_ids), batch_size)
    for offset in tqdm(batch_offsets, desc="Fetching metadata batches"):
        id_filter = "|".join(pending_ids[offset : offset + batch_size])
        batch_query = Works().filter(openalex=id_filter)

        # Gather every page of this batch before indexing its records.
        batch_records = []
        for page in batch_query.paginate(method="page", per_page=batch_size):
            if not page:
                break
            batch_records += page

        for item in batch_records:
            item_id = item.get("id")
            if not item_id:
                continue
            records_by_id[item_id] = item

    return records_by_id

# Load existing data
seed_works, reference_metadata = load_local_data()
print(f"Loaded {len(seed_works)} existing seed works and {len(reference_metadata)} referenced works.")

# Fetch new seed works
new_seed_works = fetch_seed_works(query="resilience", per_page=100, max_pages=100)
print(f"Fetched {len(new_seed_works)} new seed works.")

# Merge new and existing seed works.
# Keying by ID means a re-fetched work replaces the stored copy instead of
# being appended as a duplicate.
seed_works_dict = {work["id"]: work for work in seed_works}
for work in new_seed_works:
    seed_works_dict[work["id"]] = work
seed_works = list(seed_works_dict.values())

# Collect all referenced work IDs.
# `or []` also covers works where "referenced_works" is present but null.
all_ref_ids = {
    ref
    for work in seed_works
    for ref in (work.get("referenced_works") or [])
}
existing_ref_ids = set(reference_metadata.keys())
missing_ref_ids = all_ref_ids - existing_ref_ids

print(f"Found {len(all_ref_ids)} unique referenced works. {len(missing_ref_ids)} are missing.")

# Fetch missing references.
# Bind the name up front so it always exists: when nothing is missing, this
# cell neither raises NameError on a fresh kernel nor reuses a stale value
# left over from an earlier run.
new_reference_metadata = {}
if missing_ref_ids:
    new_reference_metadata = bulk_fetch_metadata_for_ids(ids_list=missing_ref_ids, batch_size=50)
    # Merged in place; no second merge is needed afterwards.
    reference_metadata.update(new_reference_metadata)
    print(f"Fetched metadata for {len(new_reference_metadata)} new referenced works.")

# Save updated data
save_local_data(seed_works, reference_metadata)
print("Updated dataset saved.")