### Topics

In [1]:
import pandas as pd
import json

# Load the works table from CSV (the path is resolved relative to the current
# working directory). The `raw_json` column is read further down to extract
# topics, and each row is counted as one publication in the printed summary.
df = pd.read_csv("openalex_works_full.csv")

# Function to extract topics from raw_json
def extract_topics(raw):
    """Return the topic display names found in one work's raw JSON record.

    The ``"topics"`` field is preferred; ``"concepts"`` (older OpenAlex
    schema) is used only when ``"topics"`` is missing, empty, or not a list.

    Parameters
    ----------
    raw : str, bytes, bytearray or dict
        JSON text for a single work, or an already-parsed dict. Any other
        value (for example the NaN pandas uses for an empty cell) is treated
        as "no data".

    Returns
    -------
    list
        The truthy ``"display_name"`` values of the selected field, in their
        original order. An empty list is returned when the input cannot be
        parsed, is not a JSON object, or contains no usable entries.

    Notes
    -----
    This function never raises for bad input. Entries that are not dicts or
    that lack a display name are skipped one by one, so a single malformed
    entry no longer discards the valid names around it.
    """
    if isinstance(raw, dict):
        data = raw
    else:
        try:
            data = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: raw is not str/bytes (e.g. NaN or None).
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return []

    # A valid JSON document may be a list, string, or number; only objects
    # can carry the fields we are looking for.
    if not isinstance(data, dict):
        return []

    for field in ("topics", "concepts"):
        entries = data.get(field)
        if not entries or not isinstance(entries, (list, tuple)):
            continue

        names = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            name = entry.get("display_name")
            if name:
                names.append(name)
        # The first non-empty field wins, even if none of its entries had a
        # display name (same precedence as before: no fallback in that case).
        return names

    return []

# Parse every row's raw JSON payload into its list of topic names
df["topics"] = df["raw_json"].apply(extract_topics)

# Chain the per-row lists into one flat list (repeated topics are kept)
all_topics = []
for row_topics in df["topics"]:
    all_topics.extend(row_topics)

# Distinct topic names across all rows
unique_topics = set(all_topics)

# Summary: one labelled count per line
summary_rows = (
    ("Number of publications:", len(df)),
    ("Total topic occurrences:", len(all_topics)),
    ("Unique topics:", len(unique_topics)),
)
for label, count in summary_rows:
    print(label, count)


Number of publications: 236
Total topic occurrences: 641
Unique topics: 346
