# Celebrity Divorce Rate Calculation #

In [1]:
import requests
import re
import random
import wptools
import numpy as np
import pandas as pd
import pickle

# MediaWiki API endpoint of the English Wikipedia; every request in this notebook goes through it
base_URL = 'https://en.wikipedia.org/w/api.php'

In [2]:
# Base query parameters for fetching one page's wikitext as JSON ('parse' action);
# get_page_data supplies the 'pageid' of the page to fetch.
person_params = {'action': 'parse', 
                 'prop': 'wikitext',
                 'format':'json'}

# parse wikipedia category pages
def parse_categories(category_params):
    """Collect every member of a Wikipedia category.

    Repeatedly queries the API at ``base_URL``, following the ``cmcontinue``
    token of each response until the API stops returning one.

    Args:
        category_params: dict of query parameters for a ``categorymembers``
            listing. It is copied, so the caller's dict is left untouched.

    Returns:
        dict mapping page title -> page ID for all members fetched. If a
        request fails or a response is malformed, the members collected so
        far are returned (best effort).
    """
    params = dict(category_params)  # work on a copy; the continuation token is ours
    actors = {}
    while True:
        try:
            response = requests.get(base_URL, params=params).json()
            members = response['query']['categorymembers']
        except (requests.RequestException, ValueError, KeyError):
            # network failure, non-JSON body, or an API error payload
            break

        actors.update({member['title']: member['pageid'] for member in members})

        # no 'continue' block means this was the last page of results
        next_token = response.get('continue', {}).get('cmcontinue')
        if next_token is None:
            break
        params['cmcontinue'] = next_token

    return actors

# get wikitext for a single page given its page ID
def get_page_data(pageid):
    """Fetch the wikitext of one page and collapse its whitespace.

    Args:
        pageid: Wikipedia page ID of the page to fetch.

    Returns:
        The page's wikitext as a single string in which every run of
        whitespace (including newlines) is replaced by one space.

    Raises:
        KeyError: if the API response has no ``parse``/``wikitext`` payload.
    """
    # build a per-call copy instead of writing 'pageid' into the shared person_params
    params = {**person_params, 'pageid': pageid}
    response = requests.get(base_URL, params=params).json()
    wikitext = response['parse']['wikitext']['*']
    # raw string: '\s' in a plain literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', wikitext)

# get a tuple of True/False, list of years when they got married, list of years when they got divorced
def get_marriage_info(pageid, data):
    """Extract marriage years from the ``{{marriage ...}}`` templates of a page.

    Only the text from the first occurrence of ``spouse`` onwards is scanned.
    Each template is cut at the first ``}}`` after its start, so a template
    nested inside ``{{marriage}}`` ends the scanned text early.

    Args:
        pageid: page ID, used only in the error message.
        data: wikitext of the page.

    Returns:
        ``(True, start_years, divorce_years)`` when at least one marriage
        template is found and every template contains a year. Years are
        4-digit strings. ``start_years`` has one entry per marriage (the
        first year in the template); ``divorce_years`` has one entry per
        marriage that lists a second year, so it may be shorter or empty.
        ``(False, None, None)`` when there is no ``spouse`` text, no marriage
        template, or a template without any year (an error is printed in the
        last case).
    """
    spouse_pos = data.find('spouse')
    if spouse_pos == -1:
        # no spouse field at all
        return False, None, None
    data = data[spouse_pos:]

    # Match case-insensitively on the text itself: offsets taken from
    # data.lower() can be shifted because lowercasing may change the length
    # of a string (e.g. 'İ' becomes two characters).
    template_starts = [m.start() for m in
                       re.finditer(r'\{\{marriage', data, re.IGNORECASE | re.ASCII)]
    if not template_starts:
        return False, None, None

    marriages = []
    for start in template_starts:
        end = data.find('}}', start)
        # an unterminated template yields no text and is reported below
        marriages.append(data[start:end] if end != -1 else '')

    marriage_years = [re.findall(r'[1-3][0-9]{3}', marriage) for marriage in marriages]
    if not all(marriage_years):
        # at least one template carries no year at all
        print('error parsing {0}, marriages text: {1}'.format(pageid, marriages))
        return False, None, None

    start_years = [years[0] for years in marriage_years]
    divorce_years = [years[1] for years in marriage_years if len(years) > 1]
    return True, start_years, divorce_years

# get a person's info and returns marriage years for a person
def get_simple_info(group):
    """Build one DataFrame row per marriage for the person in ``group``.

    Args:
        group: frame with ``name`` and ``pageid`` columns; only the first
            row is used.

    Returns:
        DataFrame with columns (positional) name, page ID, marriage year,
        divorce year and the married flag. A person without marriage data
        gets a single row with NaN years.
    """
    name = list(group['name'])[0]
    pageid = int(list(group['pageid'])[0])

    # Getting marriage info
    wikitext = get_page_data(pageid)
    married, married_years, divorced_years = get_marriage_info(pageid, wikitext)

    if married:
        # pad the divorce years with NaN so both lists have the same length
        missing = len(married_years) - len(divorced_years)
        if missing > 0:
            divorced_years.extend([np.nan] * missing)
        rows = [(name, pageid, wed_year, divorce_year, married)
                for wed_year, divorce_year in zip(married_years, divorced_years)]
    else:
        rows = [(name, pageid, np.nan, np.nan, married)]

    return pd.DataFrame(rows)

# gets a person's info and returns birth year, death year along with marriage info
def get_info(group):
    """Build one DataFrame row per marriage, with birth and death year.

    Args:
        group: frame with ``name`` and ``pageid`` columns; only the first
            row is used.

    Returns:
        DataFrame with columns (positional) name, page ID, birth year, death
        year, marriage year, divorce year and the married flag. A person
        without marriage data gets a single row with NaN marriage/divorce
        years. ``None`` when the page's infobox cannot be obtained.
    """
    name, pageid = list(group['name'])[0], int(list(group['pageid'])[0])

    try:
        page_parse = wptools.page(pageid=pageid, silent=True).get_parse(show=False)
        infobox = page_parse.infobox
        infobox_keys = infobox.keys()
    except Exception:
        # best effort: skip people whose page or infobox cannot be parsed
        return None

    year_pattern = r'[1-3][0-9]{3}'

    # Checking if birthdate is present (first year mentioned in the field)
    try:
        birth_year = (re.findall(year_pattern, infobox['birth_date'])[0]
                      if 'birth_date' in infobox_keys else np.nan)
    except (IndexError, TypeError):
        birth_year = np.nan

    # Checking if deathdate is present (latest year mentioned in the field)
    try:
        death_year = (max(re.findall(year_pattern, infobox['death_date']))
                      if 'death_date' in infobox_keys else np.nan)
    except (ValueError, TypeError):
        death_year = np.nan

    # Getting marriage info
    married, married_years, divorced_years = get_marriage_info(pageid, get_page_data(pageid))

    if not married:
        person_info = [(name, pageid, birth_year, death_year, np.nan, np.nan, married)]
    else:
        # Pad with NaN up to the number of marriages. A single append is not
        # enough: several marriages may lack a divorce year, and indexing the
        # short list would raise IndexError.
        missing = len(married_years) - len(divorced_years)
        padded_divorces = list(divorced_years) + [np.nan] * missing
        person_info = [(name, pageid, birth_year, death_year, wed_year, divorce_year, married)
                       for wed_year, divorce_year in zip(married_years, padded_divorces)]

    return pd.DataFrame(person_info)

# calculate a simple divorce rate = number of people who got a divorce/number of people who were once married
def simple_divorce_rate(marriage_info):
    """Return the share of once-married people who have a divorce year.

    Args:
        marriage_info: iterable of ``(married, start_years, divorce_years)``
            tuples, one per person.

    Returns:
        ``divorced / married`` as a float, or NaN when nobody in
        ``marriage_info`` is married (the rate is undefined then, and a small
        sample should not abort the caller with ZeroDivisionError).
    """
    married = [person for person in marriage_info if person[0]]
    if not married:
        return float('nan')
    divorced = [person for person in married if len(person[2]) > 0]
    return len(divorced) / len(married)

# utility function: 
# draw a random subset of page IDs, because parsing every page takes too long
def get_random_celebs(actors, n, seed=None):
    """Return ``n`` distinct page IDs drawn at random from ``actors``.

    Args:
        actors: dict whose values are the page IDs to draw from.
        n: number of page IDs to draw.
        seed: when not None, the module-level random generator is seeded
            with it before drawing, which makes the draw reproducible.
    """
    page_ids = list(actors.values())
    if seed is not None:
        random.seed(seed)
    return random.sample(page_ids, n)

# utility function: 
# see if a person is married
def is_married(pageid):
    """Return the married flag that get_marriage_info reports for ``pageid``."""
    wikitext = get_page_data(pageid)
    return get_marriage_info(pageid, wikitext)[0]

# run num_iters round of random sampling and calculate the simple divorce rate of drawn random samples
def divorce_rate_simulation(celebs_df, num_iters, sample_size, random_state):
    """Estimate the divorce rate from repeated random samples of celebrities.

    Args:
        celebs_df: DataFrame with a ``pageid`` column.
        num_iters: number of samples to draw.
        sample_size: number of rows per sample (drawn without replacement).
        random_state: seed or random state forwarded to ``DataFrame.sample``.
            An int seeds one generator shared by all iterations, so the run
            is reproducible without every iteration drawing the same rows.

    Returns:
        list with one simple divorce rate per iteration.
    """
    if isinstance(random_state, (int, np.integer)):
        # passing the same int to every sample() call would redraw identical rows
        random_state = np.random.RandomState(random_state)

    # cache of marriage info by page ID, so each page is fetched at most once
    queried_celebs_dict, divorce_rates = {}, []
    for _ in range(num_iters):
        sample = celebs_df.sample(sample_size, replace=False, random_state=random_state)

        marriage_info = []
        for pageid in list(sample['pageid']):
            if pageid not in queried_celebs_dict:
                # store under the page's own key as soon as it is fetched
                queried_celebs_dict[pageid] = get_marriage_info(pageid, get_page_data(pageid))
            marriage_info.append(queried_celebs_dict[pageid])

        # calculate simple divorce rate for this simulation
        divorce_rates.append(simple_divorce_rate(marriage_info))
    return divorce_rates

In [3]:
# query parameters shared by both category listings
common_params = {'action': 'query',
                 'list': 'categorymembers',
                 'cmlimit': 500,
                 'format': 'json'}

# members of the American male film actor category
category_params = {'cmtitle': 'Category:American male film actors', **common_params}
actors = parse_categories(category_params)

# members of the American film actress category
category_params = {'cmtitle': 'Category:American film actresses', **common_params}
actresses = parse_categories(category_params)

# merged title -> page ID mapping; a title present in both categories is kept once
celebs = {**actors, **actresses}

print('number of actors: ', len(actors))
print('number of actresses: ', len(actresses))
print('number of total celebrities: ', len(celebs))

number of actors:  10679
number of actresses:  7917
number of total celebrities:  18592


### Divorce Rate of a Random Sample of Celebrities on Wikipedia ###

In [4]:
# one row per celebrity: Wikipedia article title and page ID
celebs_df = pd.DataFrame(list(celebs.items()), columns=['name', 'pageid'])
divorce_rates_filename = 'divorce_rates'

# TODO: Update these params 
# (larger run: num_iters, sample_size, random_state = 50, 1000, None)
num_iters = 5
sample_size = 10
random_state = None

divorce_rates = divorce_rate_simulation(
    celebs_df,
    num_iters,
    sample_size,
    random_state,
)

In [5]:
# write the list of simulated divorce rates to a local file (pickle format)
# so it can be read back later
with open(divorce_rates_filename, 'wb') as fp:
    pickle.dump(divorce_rates, fp)

In [6]:
# read the list of divorce rates back from the local pickle file
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote itself
with open (divorce_rates_filename, 'rb') as fp:
    divorce_rates = pickle.load(fp)
    
# bare expression: shows the list as the cell output
divorce_rates

[0.5, 1.0, 1.0, 1.0, 0.5]

### Divorce Rate of Celebrities in People Magazine ###

In [7]:
something deliberately wrong

SyntaxError: invalid syntax (<ipython-input-7-ca553ca67a25>, line 1)

In [None]:
# celeb list from People magazine, one name per line
# (assumes celebrity_list.txt is UTF-8 encoded)
with open("celebrity_list.txt", 'r', encoding='utf-8') as f:
    inp_celeb_list = f.read()

# splitlines() copes with any newline style; stripping and skipping blank lines
# keeps stray whitespace or a trailing newline from breaking the title lookup
celeb_list = [line.strip() for line in inp_celeb_list.splitlines() if line.strip()]

# keep only the names that are also a key of celebs (title -> page ID)
people_celebs = {celeb: celebs[celeb] for celeb in celeb_list if celeb in celebs}
print(len(people_celebs))

# creating the people celebrities dataframe
# columns are given up front so an empty match list still yields a valid frame
people_actors_df = pd.DataFrame(list(people_celebs.items()), columns=['name', 'pageid'])
people_actors_df.head()

In [None]:
actors_df = people_actors_df

# calculate marriage info and store it in a local file
csv_filename = 'people_actors.csv'

# get_info is called once per name; it returns one row per marriage (a single row
# with NaN years when no marriage is found) or None when the infobox cannot be read
actors_df = actors_df.groupby('name', group_keys=False).apply(get_info)
# get_info builds its frames without column names, so label them here
actors_df.columns = ['name', 'pageid', 'birth_year', 'death_year', 'married_year', 'divorced_year', 'married']

actors_df.to_csv(csv_filename, index=False)

In [None]:
# reload the saved marriage data from the CSV file and preview the first rows
df = pd.read_csv(csv_filename)
df.head()

### Code Scrapyard ###

In [None]:
# get 500 random actors
# random_pageids = get_random_celebs(actors, 500)

# get marriage info for each of the sampled actors
# marriage_info = [get_marriage_info(pageid, get_page_data(pageid)) for pageid in random_pageids]

# do a simple calculation of the divorce rate
# print(simple_divorce_rate(marriage_info))

In [None]:
# married = actors_df.apply(lambda x: is_married(x['pageid']), axis=1)
# actors_df['married'] = married
# actors_df.to_csv('american female actress.csv', index=False)

In [None]:
# # percentage of people married
# df = pd.DataFrame(marriage_info)
# df[df[0] == True].count()[0] / df.count()[0]