This notebook helps you get started on your project. It contains functions for fetching data from Wikipedia and for parsing some of the source data. The data for politicians has already been fetched for you, and you can download the file "politicians.json" through A+.

For Wikipedia API see documentation here:
https://pypi.org/project/Wikipedia-API/

In [1]:
import wikipediaapi, time, json,requests,os
#from tqdm import tqdm # you can import this for progress bar instead if you are not using notebooks
from tqdm.notebook import tqdm

def ensure_person_data():
    """Check that the file 'person-data.tsv' exists in the current working directory.

    The file is not downloaded automatically. To obtain it, please go to
    https://search.gesis.org/research_data/SDN-10.7802-1515

    Raises:
        FileNotFoundError: If 'person-data.tsv' is not a regular file in the current
            directory. FileNotFoundError is a subclass of Exception, so callers that
            catch Exception keep working.
    """
    if not os.path.isfile("person-data.tsv"):
        raise FileNotFoundError("For downloading the file 'person-data.tsv', please go to https://search.gesis.org/research_data/SDN-10.7802-1515")
    
def ensure_gender_data():
    """Make sure 'wiki.genders.txt' exists in the current directory, downloading it if needed.

    The file is downloaded from http://www.cs.cmu.edu/~ark/bio/data/wiki.genders.txt

    The response is fetched and checked before the local file is created, so a failed
    download does not leave behind an empty file (or a saved error page) that later
    calls would mistake for valid data.

    Raises:
        requests.RequestException: If the request fails, times out, or the server
            answers with an HTTP error status.
    """
    if os.path.isfile("wiki.genders.txt"):
        return
    print("Downloading the gender data file...")
    response = requests.get(
        "http://www.cs.cmu.edu/~ark/bio/data/wiki.genders.txt",
        allow_redirects=True,
        timeout=60,  # do not hang forever on an unresponsive server
    )
    response.raise_for_status()
    # Only touch the disk once the complete payload is in memory.
    with open('wiki.genders.txt', 'wb') as outfile:
        outfile.write(response.content)
    
    
def _parse_birth_year(raw_birth_date):
    """Return the year encoded in a 'birthDate' field as an int, or None if it is unknown."""
    if raw_birth_date == 'NA':
        return None
    birth_date = raw_birth_date.strip("[]\t' ")
    try:
        # A leading "-" marks a negative year; drop it before splitting on the date separator.
        year = int(birth_date.strip("-").split("-")[0])
    except ValueError:
        # Empty or malformed date: treat as unknown instead of aborting the whole scan.
        return None
    return -year if birth_date.startswith("-") else year


def filter_persons_by(occupation=None,birth_less=None,birth_more=None,nationality=None):
    """
    Filters persons from the person-data.tsv file based on specified criteria.

    A criterion that is None is not applied. Persons whose birth year is unknown
    ('NA' or unparseable) are excluded whenever a birth bound is given.

    Args:
        occupation (str, optional): Text that must occur in the person's occupation field
            (substring match). Defaults to None.
        birth_less (int, optional): Exclusive upper bound of the birth year. Defaults to None.
        birth_more (int, optional): Exclusive lower bound of the birth year. Defaults to None.
        nationality (str, optional): Text that must occur in the person's nationality field
            (substring match). Defaults to None.

    Returns:
        dict: A dictionary of persons that match the specified criteria. The keys are the
        person names (the WikiURL with the "http://en.wikipedia.org/wiki/" prefix removed)
        and the values are dictionaries mapping the column titles of the file to the
        person's values.

    Raises:
        Exception: Propagated from ensure_person_data() if person-data.tsv is missing.
    """
    ensure_person_data()
    persons = {}
    # The context manager guarantees the file is closed, also when a row raises.
    with open("person-data.tsv", 'r', encoding='utf-8') as pfile:
        titles = pfile.readline().strip().split("\t")
        for line in pfile:
            person = dict(zip(titles, line.strip().split("\t")))
            birth_year = _parse_birth_year(person["birthDate"])

            occupation_ok = occupation is None or occupation in person["occupation"]
            nationality_ok = nationality is None or nationality in person["nationality"]
            birth_less_ok = birth_less is None or (birth_year is not None and birth_year < birth_less)
            birth_more_ok = birth_more is None or (birth_year is not None and birth_year > birth_more)

            if occupation_ok and nationality_ok and birth_less_ok and birth_more_ok:
                name = person["WikiURL"][len("http://en.wikipedia.org/wiki/"):]
                persons[name] = person
    return persons

def get_genderdata():
    """Build a name -> gender-initial lookup from the file "wiki.genders.txt".

    ensure_gender_data() is called first so that the file is present. The first line of
    the file is skipped; every following line must consist of exactly three
    tab-separated fields, of which the second is read as the gender and the third as
    the name. Spaces in the name are replaced by underscores and only the first
    character of the gender field is kept.

    Returns:
        A dictionary mapping each underscore-separated name to the first character of
        its gender field (an empty string if that field is empty).

    Raises:
        ValueError: If a line does not split into exactly three tab-separated fields.

    Example:
        >>> gender_data = get_genderdata()
        >>> gender_data['Albert_Einstein']
        'M'
    """
    ensure_gender_data()
    initial_by_name = {}
    with open("wiki.genders.txt", "r", encoding='utf-8') as handle:
        next(handle, None)  # discard the first line of the file
        for row in handle:
            _wiki_id, gender_field, title = row.strip().split("\t")
            initial_by_name[title.replace(" ", "_")] = gender_field[:1]
    return initial_by_name

def fill_in_genders(persons):
    """
    Adds a "gender" entry to every person in the given dictionary.

    The value is looked up by person name in the dictionary returned by
    get_genderdata(); names that are not found there get the string "NA".

    Args:
        persons (dict): A dictionary with person names as keys and attribute
            dictionaries as values.

    Returns:
        None. The attribute dictionaries are modified in place.
    """
    gender_by_name = get_genderdata()
    for name, attributes in persons.items():
        attributes["gender"] = gender_by_name.get(name, "NA")
        
def fetch_links(people,batch_size=None,lang='en'):
    """Uses the Wikipedia API to fetch Wikipedia links between the given people.

    The links are filled into the people dictionary in place, under the key "links".
    People that already have a "links" entry are skipped, so the function can be called
    repeatedly to continue where an earlier batch stopped.

    Note that only links between the people are saved, and if you want to inspect other links
    you should write your own fetching function.

    Args:
        people (dict): A dictionary containing names of people as keys and attributes as values.
        batch_size (int, optional): The maximum number of people to fetch links for in a single
            call. Defaults to None, which means there is no maximum. The limit is only checked
            right after a fetch, so a value that can never be reached (such as 0) also means
            no maximum.
        lang (str, optional): The language in which to fetch Wikipedia links. Defaults to 'en'.

    Returns:
        bool: True if the call stopped because batch_size people were fetched (more people
        may remain), False if the loop went through every person.
    """
    # NOTE(review): recent Wikipedia-API releases may require a user agent argument here;
    # confirm against the installed version.
    wiki = wikipediaapi.Wikipedia(lang)
    fetched = 0
    print('Fetching link data from Wikipedia')
    # The context manager closes the progress bar on every exit path, including the
    # early return when the batch limit is hit.
    with tqdm(total=len(people)) as pbar:
        for name, attributes in people.items():
            pbar.update(1)
            if "links" in attributes:
                continue  # already fetched in an earlier batch
            page = wiki.page(name)
            # Link titles contain spaces while the keys of `people` use underscores.
            titles = (title.replace(" ", "_") for title in page.links.keys())
            attributes["links"] = [title for title in titles if title in people]
            fetched += 1
            time.sleep(0.1)  # short pause between consecutive API requests
            if fetched == batch_size:
                return True
    return False

def fetch_langs(people,batch_size=None,lang='en'):
    """Uses the Wikipedia API to fetch list of Wikipedia language editions where each person in the people
    dictionary appears.

    The language editions are filled into the people dictionary in place, under the key
    "langs". People that already have a "langs" entry are skipped, so the function can be
    called repeatedly to continue where an earlier batch stopped.

    Args:
        people (dict): A dictionary containing names of people as keys and attributes as values.
        batch_size (int, optional): The maximum number of people to fetch language editions for
            in a single call. Defaults to None, which means there is no maximum. The limit is
            only checked right after a fetch, so a value that can never be reached (such as 0)
            also means no maximum.
        lang (str, optional): The language edition of Wikipedia to query. Defaults to 'en'.

    Returns:
        bool: True if the call stopped because batch_size people were fetched (more people
        may remain), False if the loop went through every person.
    """
    # NOTE(review): recent Wikipedia-API releases may require a user agent argument here;
    # confirm against the installed version.
    wiki = wikipediaapi.Wikipedia(lang)
    fetched = 0
    print('Fetching language editions data from Wikipedia')
    # The context manager closes the progress bar on every exit path, including the
    # early return when the batch limit is hit.
    with tqdm(total=len(people)) as pbar:
        for name, attributes in people.items():
            pbar.update(1)
            if "langs" in attributes:
                continue  # already fetched in an earlier batch
            page = wiki.page(name)
            attributes["langs"] = list(page.langlinks.keys())
            fetched += 1
            time.sleep(0.1)  # short pause between consecutive API requests
            if fetched == batch_size:
                return True
    return False

def fetch_summaries(people,batch_size=None,lang='en'):
    """Uses the Wikipedia API to fetch summary texts for each person in the people dictionary.

    The summary texts are filled into the people dictionary in place, under the key
    "summary". People that already have a "summary" entry are skipped, so the function can
    be called repeatedly to continue where an earlier batch stopped.

    Args:
        people (dict): A dictionary containing names of people as keys and attributes as values.
        batch_size (int, optional): The maximum number of people to fetch summaries for in a
            single call. Defaults to None, which means there is no maximum. The limit is only
            checked right after a fetch, so a value that can never be reached (such as 0) also
            means no maximum.
        lang (str, optional): The language edition of Wikipedia to query. Defaults to 'en'.

    Returns:
        bool: True if the call stopped because batch_size people were fetched (more people
        may remain), False if the loop went through every person.
    """
    # NOTE(review): recent Wikipedia-API releases may require a user agent argument here;
    # confirm against the installed version.
    wiki = wikipediaapi.Wikipedia(lang)
    fetched = 0
    print('Fetching summary text data from Wikipedia')
    # The context manager closes the progress bar on every exit path, including the
    # early return when the batch limit is hit.
    with tqdm(total=len(people)) as pbar:
        for name, attributes in people.items():
            pbar.update(1)
            if "summary" in attributes:
                continue  # already fetched in an earlier batch
            page = wiki.page(name)
            attributes["summary"] = page.summary
            fetched += 1
            time.sleep(0.1)  # short pause between consecutive API requests
            if fetched == batch_size:
                return True
    return False

def save_people_json(people,filename):
    """Write the people dictionary to `filename` as JSON without corrupting an existing file.

    The data is first serialized to a temporary file next to the target and only then
    moved over it. If serialization fails or is interrupted, a previously saved file
    stays intact instead of being left truncated.

    Args:
        people (dict): JSON-serializable dictionary of people.
        filename (str or path-like): Path of the JSON file to write.

    Raises:
        TypeError: If `people` contains values that json.dump cannot serialize.
    """
    tmp_name = os.fspath(filename) + ".tmp"
    try:
        with open(tmp_name, "w", encoding="utf-8") as pfile:
            json.dump(people, pfile)
        os.replace(tmp_name, filename)
    finally:
        # After a successful replace the temporary file is gone; this only cleans up
        # the partial file left behind by a failed dump.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        
def load_people_json(filename):
    """Load a people dictionary from a JSON file.

    The file is decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default text encoding.

    Args:
        filename (str or path-like): Path of the JSON file to read.

    Returns:
        The object decoded from the file by json.load.
    """
    with open(filename, "r", encoding="utf-8") as pfile:
        return json.load(pfile)

In the next cell, you will find the code for loading the politician data into a dictionary from the JSON file that you can download through A+. The commented-out code was used to parse and fetch the data. You can inspect how the data was created by reading that code and the functions in the previous cell. Later on in the project, you can use the same code, with slight modifications, to construct different sets of individuals and fetch their data from Wikipedia.

In [2]:
# Name of the prefetched data file; it must be present in the working directory.
filename="politicians.json"
# The commented-out lines below show how the file was originally created:
# persons with "politician" in their occupation field were selected, gender
# information was added, and the result was saved as JSON.
#if not os.path.isfile(filename):
#    politicians=filter_persons_by(occupation="politician")
#    fill_in_genders(politicians)
#    save_people_json(politicians,filename)

# Load the politicians dictionary (person name -> attribute dictionary) from disk.
politicians=load_people_json(filename)

## The code below fills in summaries, language editions and links from Wikipedia.
## The fetching takes place in batches of 1000 queries after which the data is saved to disk.
## Each fetch function returns True while a batch limit was hit, so every loop keeps
## going (saving after each batch) until the function has gone through everyone.
#while fetch_summaries(politicians,batch_size=1000): save_people_json(politicians,filename)
#while fetch_langs(politicians,batch_size=1000): save_people_json(politicians,filename)
#while fetch_links(politicians,batch_size=1000): save_people_json(politicians,filename)
#save_people_json(politicians,filename)


Below are some lines of code you might find useful.

In [3]:
import math
import networkx as nx

# Example: go through the politicians and their summaries and count how often each word occurs.
# allwords maps a lower-cased word to the number of times it was seen.
allwords={}
for name,data in politicians.items():
    # Raises KeyError for a person without a "summary" entry.
    summary=data["summary"]
    # Splits on single space characters only; words separated by other whitespace
    # (for example a line break) stay joined in one token.
    for word in summary.split(" "): # Splits the summary into words
        word=word.strip().strip(".,").lower() # Removes white spaces, dots, commas and makes the word lower case
        # Useful counting pattern: get() returns the current count of the word if it is
        # already in the dictionary and 0 otherwise, so the counting starts from 0.
        allwords[word]=allwords.get(word,0)+1

# Use the Graph object for constructing your undirected network. See exercise round 5 for examples how to work with networks
# You can also consult the documentation of Networkx library online.
network = nx.Graph()

# Indexing with ["the"] raises KeyError if the word never occurred; use allwords.get("the", 0) to be safe.
print("Number of times the word 'the' appears in all summaries:",allwords["the"])
print("Natural logarithm of 2.7 is: ",math.log(2.7)) #taking a logarithm might come in handy

Number of times the word 'the' appears in all summaries: 36188
Natural logarithm of 2.7 is:  0.9932517730102834


Later on in the project you might want to get more data. The following code gets data for all Finnish people who were born after 1900 and before 1940 (the bounds themselves are excluded).

In [4]:
# Cache file for this data set; the selection below only runs when it does not exist yet.
filename="finns-1900-1940.json"
if not os.path.isfile(filename):
    # nationality is a substring match and both birth bounds are exclusive,
    # so this selects birth years 1901-1939.
    people=filter_persons_by(nationality="fin",birth_more=1900,birth_less=1940)
    fill_in_genders(people)
    save_people_json(people,filename)

# Always continue from the file on disk, so an interrupted run resumes where it stopped.
people=load_people_json(filename)

# Each fetch function returns True while its batch limit was hit; the data is saved
# after every batch and the loop ends once the function has gone through everyone.
while fetch_links(people,batch_size=1000): save_people_json(people,filename)
while fetch_summaries(people,batch_size=1000): save_people_json(people,filename)
while fetch_langs(people,batch_size=1000): save_people_json(people,filename)

# Final save: the last (partial) batch of each loop has not been written yet.
save_people_json(people,filename)


Fetching link data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

Fetching summary text data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

Fetching language editions data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

In [3]:
# NOTE(review): this cell repeats the preceding "finns-1900-1940.json" cell verbatim and
# carries an out-of-order execution count; it looks like a leftover copy — confirm
# whether it can be removed.
filename="finns-1900-1940.json"
if not os.path.isfile(filename):
    # nationality is a substring match and both birth bounds are exclusive,
    # so this selects birth years 1901-1939.
    people=filter_persons_by(nationality="fin",birth_more=1900,birth_less=1940)
    fill_in_genders(people)
    save_people_json(people,filename)

# Always continue from the file on disk, so an interrupted run resumes where it stopped.
people=load_people_json(filename)

# Each fetch function returns True while its batch limit was hit; the data is saved
# after every batch and the loop ends once the function has gone through everyone.
while fetch_links(people,batch_size=1000): save_people_json(people,filename)
while fetch_summaries(people,batch_size=1000): save_people_json(people,filename)
while fetch_langs(people,batch_size=1000): save_people_json(people,filename)

# Final save: the last (partial) batch of each loop has not been written yet.
save_people_json(people,filename)

Fetching link data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

Fetching summary text data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

Fetching language editions data from Wikipedia


  0%|          | 0/58 [00:00<?, ?it/s]

In [5]:
# Cache file for the artists data set; the selection below only runs when it does not exist yet.
filename="artists-1900-current.json"
if not os.path.isfile(filename):
   # occupation is a substring match and both birth bounds are exclusive,
   # so this selects birth years 1901-2022.
   artists=filter_persons_by(occupation="artist", birth_more=1900,birth_less=2023)
   fill_in_genders(artists)
   save_people_json(artists,filename)

# The data is reloaded from disk into `people`, which replaces any value that name
# held from an earlier cell; `artists` is only defined when the file had to be built.
people=load_people_json(filename)

# Each fetch function returns True while its batch limit was hit; the data is saved
# after every batch and the loop ends once the function has gone through everyone.
while fetch_links(people,batch_size=1000): save_people_json(people,filename)
while fetch_summaries(people,batch_size=1000): save_people_json(people,filename)
while fetch_langs(people,batch_size=1000): save_people_json(people,filename)

# Final save: the last (partial) batch of each loop has not been written yet.
save_people_json(people,filename)

Fetching link data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching link data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching link data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching summary text data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching summary text data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching summary text data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching language editions data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching language editions data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]

Fetching language editions data from Wikipedia


  0%|          | 0/2108 [00:00<?, ?it/s]