# The code below gets a list of all economists who have an article on wikipedia. It uses requests to access the page and BeautifulSoup to parse the page.

Import all libraries

In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import time
from urllib.parse import quote

The list_url url has all economists listen in the alphabetical order

In [2]:
main_url = "https://en.wikipedia.org"
list_url = "https://en.wikipedia.org/wiki/List_of_economists"

Headers had to be used to get a 200 response status, without a header response was 304.

In [3]:
headers = {"User-Agent": "SohaKhan/1.0 (sk131@wellesley.edu) CourseProject/CS234; respectful academic use"}
response = requests.get(list_url, headers=headers)
domTree = BS(response.text, "html.parser")

All names are listed under div-col

In [4]:
sections = domTree.find_all("div", class_="div-col")

In [5]:
economists = []

for sec in sections:
    for li in sec.find_all("li"):
        # Only take THE FIRST <a> — economist name
        first_link = li.find("a")
        if not first_link:
            continue

        href = first_link.get("href", "")
        
        # ensure it's a proper Wikipedia article
        if href.startswith("/wiki/") and not ":" in href:
            economists.append({
                "name": first_link.text.strip(),
                "article_url": "https://en.wikipedia.org" + href
            })

len(economists)

1239

Compare len of names to the length of unique names in the list to make sure no name is duplicated

In [6]:
names = [e["name"] for e in economists]
len(names), len(set(names))

(1239, 1239)

In [7]:
economists

[{'name': 'Edith Abbott',
  'article_url': 'https://en.wikipedia.org/wiki/Edith_Abbott'},
 {'name': 'Daron Acemoglu',
  'article_url': 'https://en.wikipedia.org/wiki/Daron_Acemoglu'},
 {'name': 'Nicola Acocella',
  'article_url': 'https://en.wikipedia.org/wiki/Nicola_Acocella'},
 {'name': 'Zoltan Acs',
  'article_url': 'https://en.wikipedia.org/wiki/Zoltan_Acs'},
 {'name': 'Henry Carter Adams',
  'article_url': 'https://en.wikipedia.org/wiki/Henry_Carter_Adams'},
 {'name': 'Walter Adams',
  'article_url': 'https://en.wikipedia.org/wiki/Walter_Adams_(economist)'},
 {'name': 'Philippe Aghion',
  'article_url': 'https://en.wikipedia.org/wiki/Philippe_Aghion'},
 {'name': 'Montek Singh Ahluwalia',
  'article_url': 'https://en.wikipedia.org/wiki/Montek_Singh_Ahluwalia'},
 {'name': 'Qazi Kholiquzzaman Ahmad',
  'article_url': 'https://en.wikipedia.org/wiki/Qazi_Kholiquzzaman_Ahmad'},
 {'name': 'George Akerlof',
  'article_url': 'https://en.wikipedia.org/wiki/George_Akerlof'},
 {'name': 'Armen

Get QID & Summary for each economist using Wikipedia’s API

Get the summary of each economist to later use that summary to categorize which area of economics they work in.

In [10]:
def fetch_wiki_api(title):
    """Returns QID and extract summary from Wikipedia API."""
    
    # # Wikipedia requires URL encoding for parentheses, spaces, accents, etc.
    # title = quote(title)
    
    S = requests.Session()
    api_url = "https://en.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "titles": title,
        "prop": "pageprops|extracts",
        "exintro": True,
        "explaintext": True,
        "format": "json",
    }

    r = S.get(url=api_url, params=params, headers=headers)

    # Check for errors before decoding
    if r.status_code != 200:
        print("HTTP error:", r.status_code, title)
        print(r.text[:200])
        return None, None

    try:
        data = r.json()
    except Exception:
        print("JSON decode error for:", title)
        print(r.text[:300])
        return None, None

    pages = data.get("query", {}).get("pages", {})
    for _, page in pages.items():
        qid = page.get("pageprops", {}).get("wikibase_item", None)
        summary = page.get("extract", "")
        return qid, summary

    return None, ""

The code below takes around 10 mins to run, results are saved in the csv.

In [11]:
# Loop through economists

for econ in economists:
    # extract last part of URL as page title
    title = econ["article_url"].split("/wiki/")[1]

    qid, summary = fetch_wiki_api(title)
    econ["qid"] = qid
    econ["summary"] = summary

    time.sleep(0.1)  # avoid hitting API rate limits

df = pd.DataFrame(economists)
df.head()

Unnamed: 0,name,article_url,qid,summary
0,Edith Abbott,https://en.wikipedia.org/wiki/Edith_Abbott,Q272731,"Edith Abbott (September 26, 1876 – July 28, 19..."
1,Daron Acemoglu,https://en.wikipedia.org/wiki/Daron_Acemoglu,Q718581,"Kamer Daron Acemoğlu (born September 3, 1967) ..."
2,Nicola Acocella,https://en.wikipedia.org/wiki/Nicola_Acocella,Q7001311,Nicola Acocella (born 3 July 1939) is an Itali...
3,Zoltan Acs,https://en.wikipedia.org/wiki/Zoltan_Acs,Q8073604,Zoltan J. Acs (born 1947) is an American econo...
4,Henry Carter Adams,https://en.wikipedia.org/wiki/Henry_Carter_Adams,Q518021,"Henry Carter Adams (December 31, 1851 – August..."


In [12]:
df.to_csv("economists_list_with_summaries.csv", index=False)