In [1]:
import requests
import re
import random

# Wikipedia (English) api.php endpoint that the request helpers in this notebook call
base_URL = "https://en.wikipedia.org/w/api.php"

In [12]:
# Base query parameters for fetching one page's wikitext as JSON;
# get_page_data supplies the page ID for each individual request.
person_params = {
    "action": "parse",
    "prop": "wikitext",
    "format": "json",
}

def parse_categories(category_params):
    """Collect every member of a Wikipedia category, following API continuation.

    Parameters
    ----------
    category_params : dict
        Query parameters for a ``list=categorymembers`` request. The dict is
        copied, so the caller's object is never modified and can be reused.

    Returns
    -------
    dict
        Mapping of page title -> page ID for every member returned by the API.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    RuntimeError
        If the API response carries an ``error`` payload.
    """
    # Work on a copy: continuation tokens must not leak into the caller's dict,
    # otherwise a second call with the same dict would resume mid-listing.
    params = dict(category_params)
    members = []
    while True:
        response = requests.get(base_URL, params=params)
        response.raise_for_status()
        payload = response.json()
        if 'error' in payload:
            raise RuntimeError(f"MediaWiki API error: {payload['error']}")

        members.extend(payload.get('query', {}).get('categorymembers', []))

        # The absence of a 'continue' block is the API's signal that the listing
        # is complete; this replaces the old catch-everything loop exit, which
        # also hid network failures and keyboard interrupts.
        continuation = payload.get('continue')
        if not continuation:
            break
        # Send back every continuation key the API handed us (not only cmcontinue)
        params.update(continuation)

    return {member['title']: member['pageid'] for member in members}

def get_page_data(pageid):
    """Fetch the wikitext of a single page and collapse its whitespace.

    Parameters
    ----------
    pageid : int
        Wikipedia page ID of the page to fetch.

    Returns
    -------
    str
        The page's wikitext with every run of whitespace replaced by one space.

    Raises
    ------
    requests.HTTPError
        If the request comes back with an HTTP error status.
    KeyError
        If the response has no ``parse -> wikitext -> *`` entry.
    """
    # Build a per-call parameter dict instead of writing the page ID into the
    # shared module-level person_params.
    params = {**person_params, 'pageid': pageid}
    response = requests.get(base_URL, params=params)
    response.raise_for_status()
    wikitext = response.json()['parse']['wikitext']['*']
    # Raw string: '\s' in a plain string literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', wikitext)

def get_marriage_info(data):
    """Extract marriage and divorce years from a page's wikitext.

    Only the text from the first occurrence of ``spouse`` onwards is searched
    for ``{{marriage ...}}`` templates (matched case-insensitively).

    Parameters
    ----------
    data : str
        Wikitext of a person's page.

    Returns
    -------
    tuple
        ``(True, start_years, divorce_years)`` when at least one marriage
        template is found, otherwise ``(False, None, None)``.
        ``start_years`` holds the first 4-digit year of each template that
        contains one; ``divorce_years`` holds the second 4-digit year of each
        template that contains at least two, so it may be empty.

    Notes
    -----
    The second year is taken as a divorce year without inspecting the
    template's end reason, so a marriage that ended for another reason is
    counted as a divorce as well.
    """
    spouse_pos = data.find('spouse')
    if spouse_pos == -1:
        # No spouse field at all: nothing to parse
        return False, None, None
    spouse_text = data[spouse_pos:]

    # Match case-insensitively on the text itself rather than on a lowercased
    # copy, so the match offsets are always valid for slicing spouse_text.
    marriages = []
    for match in re.finditer(r'\{\{marriage', spouse_text, flags=re.IGNORECASE):
        start = match.start()
        end = spouse_text.find('}}', start)
        # An unterminated template runs to the end of the text
        marriages.append(spouse_text[start:] if end == -1 else spouse_text[start:end])

    if not marriages:
        return False, None, None

    marriage_years = [re.findall(r'[1-3][0-9]{3}', marriage) for marriage in marriages]
    # Templates without any year contribute no start year instead of raising IndexError
    start_years = [years[0] for years in marriage_years if years]
    divorce_years = [years[1] for years in marriage_years if len(years) > 1]
    return True, start_years, divorce_years

def get_random_celebs(actors, n, seed=None):
    """Draw ``n`` distinct page IDs at random from the actors mapping.

    Sampling keeps the run time manageable, since fetching every page would
    take too long.

    Parameters
    ----------
    actors : dict
        Mapping whose values are page IDs.
    n : int
        Number of page IDs to draw.
    seed : optional
        When given, the draw is reproducible. A private generator is used so
        the module-level random state is left untouched.

    Returns
    -------
    list
        ``n`` page IDs sampled without replacement.

    Raises
    ------
    ValueError
        If ``n`` is larger than the number of actors (raised by ``sample``).
    """
    pageids = list(actors.values())
    if seed is None:
        return random.sample(pageids, n)
    # A dedicated Random instance yields the same sample as seeding the global
    # generator would, without the process-wide side effect.
    return random.Random(seed).sample(pageids, n)

def simple_divorce_rate(marriage_info):
    """Compute divorced people / people who were ever married.

    Parameters
    ----------
    marriage_info : iterable of tuple
        ``(ever_married, start_years, divorce_years)`` tuples as returned by
        ``get_marriage_info``.

    Returns
    -------
    float
        Share of ever-married people with at least one divorce year, or NaN
        when nobody in the input was ever married (the rate is undefined then).
    """
    married = [info for info in marriage_info if info[0]]
    if not married:
        # 0/0 is undefined; return NaN rather than raising ZeroDivisionError
        return float('nan')
    divorced = [info for info in married if info[2]]
    return len(divorced) / len(married)

In [13]:
# List the members of the "American male film actors" category,
# requesting 500 entries per API call.
category_params = {
    "action": "query",
    "list": "categorymembers",
    "cmtitle": "Category:American male film actors",
    "cmlimit": 500,
    "format": "json",
}
actors = parse_categories(category_params)
len(actors)

10679

In [14]:
# Spot check: run the fetch-and-parse pipeline end to end on one known page.
# Other names that can be tried here:
# ['Tom Cruise', 'Brad Pitt', 'Eddie Murphy', 'Tom Hanks', 'Dylan Minnette']
get_marriage_info(
    get_page_data(actors['Ashton Kutcher'])
)

(True, ['2005', '2015'], ['2013'])

In [21]:
# Sample size and seed live in named constants so the comments and the code
# cannot drift apart, and so a re-run draws the same sample.
N_SAMPLES = 500
SAMPLE_SEED = 42

# draw a reproducible random sample of actor page IDs
random_pageids = get_random_celebs(actors, N_SAMPLES, seed=SAMPLE_SEED)

# get marriage info for each sampled actor (one API request per page)
marriage_info = [get_marriage_info(get_page_data(pageid)) for pageid in random_pageids]

# do a simple calculation of the divorce rate
print(simple_divorce_rate(marriage_info))

0.5245901639344263


In [22]:
import pandas as pd
# One row per sampled actor; columns keep their default labels 0, 1, 2
# (ever married, marriage years, divorce years).
df = pd.DataFrame(data=marriage_info)

In [23]:
# Share of sampled actors for whom at least one marriage template was found
(df[0] == True).sum() / df[0].count()

0.122