In [None]:
import pandas as pd
import json
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

# --- CONFIGURATION ---
# CSV read by the extraction step; its 'raw_json' column is parsed as JSON.
INPUT_FILE = "openalex_works_full.csv"
# CSV written at the end of the run, one row per unique institution.
OUTPUT_FILE = "institutions_osm_coords.csv"

# Identifier handed to the Nominatim client as its user agent.
# OpenStreetMap's usage policy asks every client to identify itself with a
# unique value (project name or contact e-mail); anonymous or generic agents
# risk being blocked.
USER_AGENT = "university_research_script_v1_english"
# ---------------------

def get_osm_coords(geolocator, query):
    """
    Queries OpenStreetMap (Nominatim) to get coordinates.

    Args:
        geolocator: Object exposing ``geocode(query, timeout=...)``; the
            script passes a geopy ``Nominatim`` instance.
        query: Free-text search string, e.g. "Institution name, IT".

    Returns:
        A string "lat, lon" built from the returned location, or None when
        the query is missing/blank, nothing is found, or the lookup fails.
        Failures are printed, never raised.
    """
    # A missing or blank query cannot match anything: return early instead of
    # spending a rate-limited request on it.
    if query is None or not str(query).strip():
        return None

    try:
        # Cap each request at 10 seconds so one slow lookup cannot stall the run
        location = geolocator.geocode(query, timeout=10)
        if location:
            return f"{location.latitude}, {location.longitude}"
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"   ! Temporary OSM error for '{query}': {e}")
    except Exception as e:
        # Deliberate best-effort: one bad lookup must not abort a long batch,
        # so any other failure is reported and treated as "not found".
        print(f"   ! Generic error for '{query}': {e}")
    return None

# 1. Load Data
print(f"Reading file {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)

# Maps institution id -> record. Dicts keep insertion order, so institutions
# stay in order of first appearance in the input.
institutions_dict = {}

print("Extracting institutions from raw data...")

# 2. Extract Unique Data
# Only the 'raw_json' column is needed, so iterate over it directly instead of
# building a Series per row with iterrows(). Missing cells are dropped up
# front; a missing column simply yields no records.
raw_records = df['raw_json'].dropna() if 'raw_json' in df.columns else []

for raw in raw_records:
    try:
        w = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # Malformed JSON, or a cell pandas parsed as a non-text value: skip it
        continue
    if not isinstance(w, dict):
        continue
    # `or []` also covers keys that are present but null, which
    # .get(key, []) would return as None and crash the loop.
    for authorship in w.get('authorships') or []:
        if not isinstance(authorship, dict):
            continue
        for inst in authorship.get('institutions') or []:
            if not isinstance(inst, dict):
                continue
            inst_id = inst.get('id')
            # Store only if it has an ID and we haven't processed it yet
            if inst_id and inst_id not in institutions_dict:
                institutions_dict[inst_id] = {
                    "display_name": inst.get("display_name"),
                    "id": inst_id,
                    "ror": inst.get("ror"),
                    "country_code": inst.get("country_code"),
                    "type": inst.get("type"),
                    # Filled in later by the geocoding step
                    "coords": None,
                }

print(f"Unique institutions found: {len(institutions_dict)}")

# 3. Geocoding with OpenStreetMap
print("Starting coordinate search on OpenStreetMap...")
geolocator = Nominatim(user_agent=USER_AGENT)

inst_list = list(institutions_dict.values())
total = len(inst_list)

# MANDATORY PAUSE FOR OSM POLICY (minimum 1 second between requests).
# It is applied after EVERY request, including the fallback one, so two
# lookups are never sent back-to-back. This prevents your IP from being
# banned for spamming requests.
REQUEST_PAUSE_SECONDS = 1.2

for position, inst in enumerate(inst_list, start=1):
    name = inst['display_name']
    country = inst['country_code']

    # Without a name there is nothing to search for; formatting None into the
    # query would look up the literal text "None".
    if not name:
        print(f"[{position}/{total}] Skipping {inst['id']}: no display name.")
        continue

    # Construct the query: "Name, Country" for better precision
    search_query = f"{name}, {country}" if country else f"{name}"
    print(f"[{position}/{total}] Searching: {search_query}...", end=" ")
    coords = get_osm_coords(geolocator, search_query)
    time.sleep(REQUEST_PAUSE_SECONDS)

    if coords:
        inst['coords'] = coords
        print(f"-> OK: {coords}")
        continue

    if not country:
        # The first query was already just the name; repeating it would only
        # send an identical request.
        print("-> Not found.")
        continue

    # Fallback: If "Name, Country" fails, try just "Name"
    # This helps if the name already includes the city/country and the extra
    # comma confuses OSM
    coords_fallback = get_osm_coords(geolocator, name)
    time.sleep(REQUEST_PAUSE_SECONDS)
    if coords_fallback:
        inst['coords'] = coords_fallback
        print(f"-> OK (Fallback): {coords_fallback}")
    else:
        print("-> Not found.")

# 4. Save to CSV
# Output columns, in the order they are written
cols = ['display_name', 'id', 'coords', 'country_code', 'ror', 'type']

# Passing `columns` selects and orders the fields in one step and keeps the
# schema stable: the header row is written even when inst_list is empty, and
# a field absent from every record becomes an empty column instead of
# silently disappearing.
df_final = pd.DataFrame(inst_list, columns=cols)

df_final.to_csv(OUTPUT_FILE, index=False)
print(f"\nFinished! File saved to: {OUTPUT_FILE}")