# THINGS TO CHANGE #
1. change category, search for '# change this accordingly'
2. change local file name, search for '# change to update local file name'

In [1]:
import requests
import re
import random
import wptools
import numpy as np
import pandas as pd
import csv

# MediaWiki API endpoint of the English Wikipedia; used by parse_categories
# and get_page_data for every request they send.
base_URL = 'https://en.wikipedia.org/w/api.php'

In [2]:
# Base parameters for an action=parse request that returns a page's wikitext
# as JSON; get_page_data supplies the 'pageid' of the page to fetch.
person_params = dict(action='parse', prop='wikitext', format='json')

# parse wikipedia category pages
def parse_categories(category_params):
    """Collect every member of a Wikipedia category.

    Sends the query to the MediaWiki API repeatedly, feeding the ``continue``
    values of each response back into the next request, until a response
    arrives without a ``continue`` entry.

    Args:
        category_params: dict of request parameters (action, list, cmtitle,
            cmlimit, format, ...). The dict is not modified.

    Returns:
        dict mapping page title -> page ID for every member retrieved. If a
        request fails or a response cannot be read, a message is printed and
        the members collected so far are returned.
    """
    # Work on a copy so the continuation token never leaks into the caller's
    # dict; reusing a dict that still carries 'cmcontinue' would resume from a
    # stale position instead of starting at the first batch.
    params = dict(category_params)
    actors = {}
    while True:
        try:
            response = requests.get(base_URL, params=params).json()
            members = response['query']['categorymembers']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Best effort: keep what was fetched so far, but say why we stopped.
            print('error fetching category members: {0!r}'.format(err))
            break

        for member in members:
            actors[member['title']] = member['pageid']

        continuation = response.get('continue')
        if not continuation:
            break  # no continuation data: this was the last batch
        # Send back every value of 'continue' (it includes 'cmcontinue').
        params.update(continuation)

    return actors

# get wikitext for a single page given its page ID
def get_page_data(pageid):
    """Fetch a page's wikitext with all whitespace runs collapsed.

    Args:
        pageid: Wikipedia page ID of the page to fetch.

    Returns:
        The page's wikitext as one string in which every run of whitespace
        (newlines included) has been replaced by a single space.

    Raises:
        KeyError: if the API response has no ``parse -> wikitext -> *`` entry.
    """
    # Build per-call parameters instead of writing the page ID into the
    # shared module-level person_params dict.
    params = dict(person_params, pageid=pageid)
    response = requests.get(base_URL, params=params).json()
    wikitext = response['parse']['wikitext']['*']
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', wikitext)

# get marriage start/end years from a page's wikitext
def get_marriage_info(pageid, data):
    """Extract marriage years from the ``{{marriage ...}}`` templates of a page.

    Only the text from the first occurrence of 'spouse' onwards is searched.
    Each template (matched case-insensitively) runs up to the next '}}'; the
    first 4-digit year inside it is taken as the year the marriage started and
    the second one, if present, as the year it ended.

    Args:
        pageid: page ID, used only in the error message.
        data: wikitext of the page.

    Returns:
        A tuple ``(married, start_years, divorce_years)``:
        * ``(True, [...], [...])`` when at least one template was found and
          every template contains a year. Years are strings. ``divorce_years``
          holds one entry per template that has a second year, so it may be
          empty or shorter than ``start_years``.
        * ``(False, None, None)`` when there is no 'spouse' text, no marriage
          template, or a template without any year (an error is printed in
          that last case).
    """
    spouse_at = data.find('spouse')
    if spouse_at == -1:
        return False, None, None
    section = data[spouse_at:]

    # Search the text itself rather than a lowercased copy: str.lower() can
    # change the length of a string, which would shift every match offset.
    marriages = []
    for match in re.finditer(r'\{\{marriage', section, re.IGNORECASE | re.ASCII):
        start = match.start()
        end = section.find('}}', start)
        # An unterminated template is treated as empty (and thus unparseable).
        marriages.append(section[start:end] if end != -1 else '')
    if not marriages:
        return False, None, None

    marriage_years = [re.findall(r'[1-3][0-9]{3}', text) for text in marriages]
    if not all(marriage_years):
        # At least one template has no recognisable year.
        print('error parsing {0}, marriages text: {1}'.format(pageid, marriages))
        return False, None, None

    start_years = [years[0] for years in marriage_years]
    divorce_years = [years[1] for years in marriage_years if len(years) > 1]
    return True, start_years, divorce_years

def _find_years(value):
    """Return every 4-digit year (1000-3999) found in value, or [] if value is not text."""
    if not isinstance(value, str):
        return []
    return re.findall(r'[1-3][0-9]{3}', value)


# get a person's info
def get_info(group):
    """Build the marriage records of one person.

    Args:
        group: DataFrame with 'name' and 'pageid' columns; the first entry of
            each column identifies the person.

    Returns:
        A DataFrame with unnamed (positional) columns: name, page ID, birth
        year, death year, marriage year, divorce year, married flag. There is
        one row per marriage, or a single row with NaN marriage/divorce years
        when no marriage was found. Years missing from the infobox are NaN.
        Returns None when the page's infobox could not be loaded.
    """
    name = list(group['name'])[0]
    pageid = int(list(group['pageid'])[0])
    try:
        page_parse = wptools.page(pageid=pageid, silent=True).get_parse(show=False)
        infobox = page_parse.infobox
        infobox_keys = infobox.keys()
    except Exception:
        # Covers failed lookups as well as pages without an infobox.
        return None

    # Birth year: first year mentioned in the birth_date field, if any.
    birth_years = _find_years(infobox['birth_date']) if 'birth_date' in infobox_keys else []
    birth_year = birth_years[0] if birth_years else np.nan

    # Death year: largest year mentioned in the death_date field, if any.
    death_years = _find_years(infobox['death_date']) if 'death_date' in infobox_keys else []
    death_year = max(death_years) if death_years else np.nan

    # Getting marriage info
    married, married_years, divorced_years = get_marriage_info(pageid, get_page_data(pageid))

    if not married:
        person_info = [(name, pageid, birth_year, death_year, np.nan, np.nan, married)]
    else:
        # get_marriage_info lists end years only for marriages that have one,
        # so pad with NaN until every marriage has an entry. End years are
        # paired with marriages in order, as before.
        missing = len(married_years) - len(divorced_years)
        end_years = list(divorced_years) + [np.nan] * max(missing, 0)
        person_info = [(name, pageid, birth_year, death_year, wed, ended, married)
                       for wed, ended in zip(married_years, end_years)]

    return pd.DataFrame(person_info)

# calculate a simple divorce rate = number of people who got a divorce/number of people who were once married
def simple_divorce_rate(marriage_info):
    """Return the share of ever-married people who have at least one divorce.

    Args:
        marriage_info: iterable of ``(married, start_years, divorce_years)``
            tuples, as returned by get_marriage_info.

    Returns:
        divorced / married as a float, or NaN when nobody in the sample was
        ever married (the rate is undefined in that case).
    """
    married = [m for m in marriage_info if m[0]]
    if not married:
        return float('nan')
    # A person counts as divorced when their list of divorce years is non-empty.
    divorced = [m for m in married if len(m[2]) > 0]
    return len(divorced) / len(married)

# utility function: draw n random page IDs, since parsing every page would take too long
def get_random_celebs(actors, n, seed=None):
    """Return n values of the actors dict, sampled without replacement.

    When seed is given, the module-level random generator is seeded with it
    first, which makes the sample reproducible.
    """
    pool = list(actors.values())
    if seed is None:
        return random.sample(pool, n)
    random.seed(seed)
    return random.sample(pool, n)

# utility function: tell whether a marriage was found on a person's page
def is_married(pageid):
    """Return the 'married' flag get_marriage_info reports for the page's wikitext."""
    wikitext = get_page_data(pageid)
    return get_marriage_info(pageid, wikitext)[0]

In [3]:
# collect the members of the American male film actors category (title -> page ID)
category_params = dict(
    action='query',
    list='categorymembers',
    cmtitle='Category:American male film actors',  # change this accordingly
    cmlimit=500,
    format='json',
)

actors = parse_categories(category_params)
len(actors)

10678

In [4]:
# collect the members of the American film actresses category (title -> page ID)
category_params = dict(
    action='query',
    list='categorymembers',
    cmtitle='Category:American film actresses',  # change this accordingly
    cmlimit=500,
    format='json',
)

actress = parse_categories(category_params)
# report the size of both collections
for members in (actors, actress):
    print(len(members))

10678
7916


In [5]:
# Load the names to keep, one per line of celebrity_list.txt.
# NOTE(review): the file is read as UTF-8 - confirm that is how it was saved.
with open("celebrity_list.txt", 'r', encoding='utf-8') as f:
    inp_celeb_list = f.read()

# splitlines() also copes with '\r\n' line endings and, unlike split("\n"),
# leaves no empty entry behind after a trailing newline.
celeb_list = inp_celeb_list.splitlines()

In [6]:
# Keep only the actors and actresses named in celeb_list (name -> page ID).
# Membership is tested against a set built once; scanning the list for each
# of the category members would be a linear search per name.
wanted_names = set(celeb_list)
celebs = {
    name: pageid
    for members in (actors, actress)  # actress entries override actors on a name clash
    for name, pageid in members.items()
    if name in wanted_names
}

len(celebs)

169

In [7]:
# creating the initial actors dataframe (one row per selected celebrity)
# Passing the column names to the constructor also works when celebs is
# empty; assigning .columns afterwards fails on a frame without columns.
actors_df = pd.DataFrame(list(celebs.items()), columns=['name', 'pageid'])
actors_df.head()

Unnamed: 0,name,pageid
0,Sarah Michelle Gellar,27611
1,Kristen Bell,1296564
2,Audrina Patridge,7600033
3,Jenny McCarthy,11494260
4,Corbin Bleu,3591242


In [9]:
# Path of the CSV file the scraped table is written to.
csv_filename = 'american_male_actors.csv' # change to update local file name
# Run get_info once per name. Each call returns that person's rows (one per
# marriage, or a single row when no marriage was found), or None when the
# page's infobox could not be loaded.
actors_df = actors_df.groupby('name', group_keys=False).apply(get_info)
# get_info builds its frames from plain tuples, so the columns are named here.
actors_df.columns = ['name', 'pageid', 'birth_year', 'death_year', 'married_year', 'divorced_year', 'married']
# index=False: do not write the row index into the file.
actors_df.to_csv(csv_filename, index=False)

In [13]:
# Reload the table that was just written to csv_filename and preview its first rows.
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,name,pageid,birth_year,death_year,married_year,divorced_year,married
0,Adam Levine,1084346,1979.0,,2014.0,,True
1,Ali Larter,691715,1976.0,,2009.0,,True
2,Alicia Keys,59720,1981.0,,2010.0,,True
3,Amanda Bynes,203421,1986.0,,,,False
4,Amanda Seyfried,1020882,1985.0,,,,False


### Test the overall divorce rate for a random sample ###

In [15]:
something deliberately wrong

SyntaxError: invalid syntax (<ipython-input-15-ca553ca67a25>, line 1)

In [16]:
# # get a random sample of actors (500 here)
# random_pageids = get_random_celebs(actors, 500)

# # get marriage info for each of the sampled actors
# marriage_info = [get_marriage_info(pageid, get_page_data(pageid)) for pageid in random_pageids]

# # do a simple calculation of the divorce rate
# print(simple_divorce_rate(marriage_info))

In [17]:
# married = actors_df.apply(lambda x: is_married(x['pageid']), axis=1)
# actors_df['married'] = married
# actors_df.to_csv('american female actress.csv', index=False)

In [18]:
# # percentage of people married
# df = pd.DataFrame(marriage_info)
# df[df[0] == True].count()[0] / df.count()[0]