# Getting Data from Wikidata for each economist

The code below loads the economists list, query Wikidata for each QID, extracts gender, birth year, country of citizenship, occupations, field of work and saves results into a new CSV. The CSV will be used later to merge with pageview data in Step 3

In [1]:
import pandas as pd
import requests
import time

In [None]:
df = pd.read_csv("economists_list_with_summaries.csv")
df.head()

Unnamed: 0,name,article_url,qid,summary
0,Edith Abbott,https://en.wikipedia.org/wiki/Edith_Abbott,Q272731,"Edith Abbott (September 26, 1876 – July 28, 19..."
1,Daron Acemoglu,https://en.wikipedia.org/wiki/Daron_Acemoglu,Q718581,"Kamer Daron Acemoğlu (born September 3, 1967) ..."
2,Nicola Acocella,https://en.wikipedia.org/wiki/Nicola_Acocella,Q7001311,Nicola Acocella (born 3 July 1939) is an Itali...
3,Zoltan Acs,https://en.wikipedia.org/wiki/Zoltan_Acs,Q8073604,Zoltan J. Acs (born 1947) is an American econo...
4,Henry Carter Adams,https://en.wikipedia.org/wiki/Henry_Carter_Adams,Q518021,"Henry Carter Adams (December 31, 1851 – August..."


In [5]:
def fetch_wikidata(qid):
    """Fetch gender, birth year, citizenship, occupations, field of work from Wikidata."""
    if pd.isna(qid):
        return None

    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    headers = {"User-Agent": "SohaKhan/1.0 (sk131@wellesley.edu) CourseProject/CS234; respectful academic use"}

    try:
        r = requests.get(url, headers=headers, timeout=10)
        data = r.json()
    except Exception as e:
        print("Error:", qid, e)
        return None

    try:
        ent = data["entities"][qid]["claims"]
    except KeyError:
        return None
    
    def get_value(prop):
        """Extract value from wikidata claim."""
        if prop not in ent:
            return None
        values = ent[prop]
        results = []
        for v in values:
            datav = v["mainsnak"].get("datavalue", {})
            if "value" in datav:
                results.append(datav["value"])
        return results if results else None
    
    # gender (P21)
    gender = get_value("P21")
    # birthdate (P569)
    birthdate = get_value("P569")
    # citizenship (P27)
    citizenship = get_value("P27")
    # occupations (P106)
    occupations = get_value("P106")
    # field of work (P101)
    fields = get_value("P101")

    return {
        "gender_raw": gender,
        "birthdate_raw": birthdate,
        "citizenship_raw": citizenship,
        "occupation_raw": occupations,
        "field_raw": fields
    }


In [6]:
wikidata_results = []

for i, row in df.iterrows():
    qid = row["qid"]
    if pd.isna(qid):
        wikidata_results.append({})
        continue

    info = fetch_wikidata(qid)
    wikidata_results.append(info)
    
    if i % 50 == 0:
        print(f"Processed {i} economists...")
    
    time.sleep(0.1)


Processed 0 economists...
Processed 50 economists...
Processed 100 economists...
Processed 150 economists...
Processed 200 economists...
Processed 250 economists...
Processed 300 economists...
Processed 350 economists...
Processed 400 economists...
Processed 450 economists...
Processed 500 economists...
Processed 550 economists...
Processed 700 economists...
Processed 750 economists...
Processed 800 economists...
Processed 900 economists...
Processed 950 economists...
Processed 1000 economists...
Processed 1050 economists...
Processed 1100 economists...
Processed 1150 economists...
Processed 1200 economists...


In [7]:
wikidata_df = pd.DataFrame(wikidata_results)
full = pd.concat([df, wikidata_df], axis=1)
full.head()

Unnamed: 0,name,article_url,qid,summary,gender_raw,birthdate_raw,citizenship_raw,occupation_raw,field_raw
0,Edith Abbott,https://en.wikipedia.org/wiki/Edith_Abbott,Q272731,"Edith Abbott (September 26, 1876 – July 28, 19...","[{'entity-type': 'item', 'numeric-id': 6581072...","[{'time': '+1876-09-26T00:00:00Z', 'timezone':...","[{'entity-type': 'item', 'numeric-id': 30, 'id...","[{'entity-type': 'item', 'numeric-id': 188094,...","[{'entity-type': 'item', 'numeric-id': 8134, '..."
1,Daron Acemoglu,https://en.wikipedia.org/wiki/Daron_Acemoglu,Q718581,"Kamer Daron Acemoğlu (born September 3, 1967) ...","[{'entity-type': 'item', 'numeric-id': 6581097...","[{'time': '+1967-09-03T00:00:00Z', 'timezone':...","[{'entity-type': 'item', 'numeric-id': 43, 'id...","[{'entity-type': 'item', 'numeric-id': 188094,...","[{'entity-type': 'item', 'numeric-id': 8134, '..."
2,Nicola Acocella,https://en.wikipedia.org/wiki/Nicola_Acocella,Q7001311,Nicola Acocella (born 3 July 1939) is an Itali...,"[{'entity-type': 'item', 'numeric-id': 6581097...","[{'time': '+1939-07-03T00:00:00Z', 'timezone':...","[{'entity-type': 'item', 'numeric-id': 172579,...","[{'entity-type': 'item', 'numeric-id': 188094,...",
3,Zoltan Acs,https://en.wikipedia.org/wiki/Zoltan_Acs,Q8073604,Zoltan J. Acs (born 1947) is an American econo...,"[{'entity-type': 'item', 'numeric-id': 6581097...","[{'time': '+1947-03-09T00:00:00Z', 'timezone':...","[{'entity-type': 'item', 'numeric-id': 30, 'id...","[{'entity-type': 'item', 'numeric-id': 188094,...","[{'entity-type': 'item', 'numeric-id': 8134, '..."
4,Henry Carter Adams,https://en.wikipedia.org/wiki/Henry_Carter_Adams,Q518021,"Henry Carter Adams (December 31, 1851 – August...","[{'entity-type': 'item', 'numeric-id': 6581097...","[{'time': '+1851-12-31T00:00:00Z', 'timezone':...","[{'entity-type': 'item', 'numeric-id': 30, 'id...","[{'entity-type': 'item', 'numeric-id': 188094,...","[{'entity-type': 'item', 'numeric-id': 8134, '..."


In [8]:
# Birth year
def extract_year(birth_raw):
    if not birth_raw or not isinstance(birth_raw, list):
        return None
    value = birth_raw[0].get("time")   # format: "+1971-00-00T00:00:00Z"
    if value:
        return int(value[1:5])
    return None

full["birth_year"] = full["birthdate_raw"].apply(extract_year)

# Gender
def extract_gender(gender_raw):
    if not gender_raw:
        return None
    try:
        return gender_raw[0]["id"]   # Q6581097 = male, Q6581072 = female
    except:
        return None

full["gender_qid"] = full["gender_raw"].apply(extract_gender)


In [9]:
full.to_csv("economists_with_wikidata.csv", index=False)
