In [1]:
import os
import requests
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Directory where every scraped/derived file of this notebook is written.
base_dir = "tloz-scrape"
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(base_dir, exist_ok=True)

In [2]:
# Root of the wiki; relative links found in the pages are appended to it.
base_url = "http://zelda.wikia.com"
# URL of the index page listing the recurring characters.
# (The name `characters` is rebound to a list by the scraping cell below.)
characters = base_url + "/wiki/The_Legend_of_Zelda_recurring_characters"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(characters, timeout=30)
response.raise_for_status()
recurring_characters = BeautifulSoup(response.text, "lxml")

In [3]:
# Visit every character linked from the index page, read the infobox (<aside>)
# of its page and store one dict per character, keyed by slugified label.
mw_content_text = recurring_characters.find('div', {"id":"mw-content-text"})

# Labels whose value is stored as the raw HTML of the value <div> (str(value)).
raw_html_labels = {
    "Title(s)",
    "Appearances",
    "Kindred",
    "Attack method",
    "Effective weapon(s)",
    "Spoils",
    "Alternate form(s)",
    "Alternate form of",
}
# Labels whose value is a list of links, optionally grouped under <u>game</u> headers.
per_game_link_labels = {
    "Homeland",
    "Race",
    "Hometown",
    "Location(s)",
    "Affiliation(s)",
}

attributes = set()
characters = []

for character_li in mw_content_text.findAll('li'):
    a = character_li.find('a')
    # List items without a link (or with an anchor lacking href) cannot be
    # followed; skip them instead of crashing the whole scrape.
    if a is None or not a.get('href'):
        continue
    character_page = BeautifulSoup(
        requests.get(base_url + a['href'], timeout=30).text, "lxml")
    aside = character_page.find('aside')
    if aside is None:
        # No infobox on the page: nothing to extract.
        continue
    # Only direct children of the <aside> are considered.
    pi_datas = aside.findAll('div', {'class':'pi-item'}, recursive=False)
    character = {
        'name': a.text.strip()
    }
    for pi_data in pi_datas:
        heading = pi_data.find('h3')
        value = pi_data.find('div', {"class":"pi-data-value"})
        # Items without both a label and a value cannot be stored.
        if heading is None or value is None:
            continue
        label = heading.text.strip()
        key = slugify(label, separator="_")
        attributes.add(label)
        if label == "Appears in":
            # One entry per italicised element.
            character[key] = [ap.text.strip() for ap in value.findAll('i')]
        elif label in raw_html_labels:
            # Kept as HTML so the <br/> structure can be parsed later.
            character[key] = str(value)
        elif label in per_game_link_labels:
            entries = []
            game_name = ''
            for element in value.descendants:
                if element.name == "u":
                    # An underlined element sets the game for the links after it.
                    game_name = element.text.strip()
                elif element.name == "a":
                    if game_name == '':  # It only has one appearance
                        game_name = 'ORIGINAL'
                    entries.append({'game': game_name,
                                    'value': element.text.strip()})
            character[key] = entries
        else:
            character[key] = value.text.strip()
    characters.append(character)

# Persist the scrape as JSON so later cells can reload it.
with open(join(base_dir,'data.txt'), 'w') as outfile:
    json.dump(characters, outfile, indent=4)

In [4]:
# Reload the scraped characters from disk.
with open(join(base_dir,'data.txt')) as json_data:
    characters = json.load(json_data)

# Collect every key used by any character, leaving out the 'name' key.
attributes = set()
for c in characters:
    attributes.update(c.keys())
# discard() does not fail when the dataset is empty (no 'name' key collected).
attributes.discard('name')
# Sorted so the listing is the same on every run (set order is not stable).
attributes = sorted(attributes)

print("Different attributes:", attributes)

Different attributes: ['title_s', 'alternate_form_of', 'location_s', 'gender', 'affiliation_s', 'hometown', 'alternate_form_s', 'attack_method', 'appears_in', 'homeland', 'age', 'effective_weapon_s', 'appearances', 'spoils', 'first_appearance', 'kindred', 'race']


In [5]:
# Matches "<game title> (<4-digit year>)" inside the first-appearance text.
# Raw string: \w, \s and \( are regex escapes, not Python string escapes.
fa_regex = re.compile(r"([\w\s'&]+)\(([0-9]{4})\)")
different_games = set()
name, first_game, first_year, gender = [], [], [], []

for c in characters:
    # Characters without a 'first_appearance' entry are skipped, the same way
    # as those whose text does not match the pattern.
    match = fa_regex.search(c.get('first_appearance', ''))
    if not match:
        continue
    game_title = match.group(1).strip()
    different_games.add(game_title)

    first_game.append(game_title)
    first_year.append(int(match.group(2)))
    name.append(c['name'])
    # Missing gender is recorded as an empty string.
    gender.append(c.get('gender', ''))

# One row per character with a recognised first appearance.
characters_initial_df = pd.DataFrame({'name': name,
                                      'gender': gender,
                                      'first_game': first_game,
                                      'first_year': first_year
                             })

In [6]:
# Save the initial characters table as CSV, then preview its first rows.
characters_initial_csv = join(base_dir,'characters_initial.csv')
characters_initial_df.to_csv(characters_initial_csv)
characters_initial_df.head()

Unnamed: 0,first_game,first_year,gender,name
0,Ocarina of Time,1998,Female,Anju
1,The Wind Waker,2003,Male,Ankle
2,Ocarina of Time,1998,Female,Aveil
3,Ocarina of Time,1998,Male,Bean Seller
4,The Wind Waker,2003,Male,Beedle


In [7]:
# Matches a fragment that starts with "(<game title>)".
# Raw string: \( and \s are regex escapes, not Python string escapes.
game_regex = re.compile(r"^\(([0-9a-zA-Z/'\s&]+)\)")

# Attributes whose fragments are grouped under "(game)" marker fragments.
attributes = ['kindred', 'title_s']
# Attributes that are a plain list of fragments.
simple_attributes = ['alternate_form_s', 'appearances', 'alternate_form_of']
# NOTE(review): 'attack_method', 'effective_weapon_s' and 'spoils' are also
# stored as raw HTML by the scraping cell but are not converted here - confirm
# whether that is intended.

value_div_open = '<div class="pi-data-value pi-font">'
value_div_close = "</div>"


def _infobox_fragments(raw_html):
    """Return the stripped text of each <br/>-separated fragment of raw_html.

    The wrapping value <div> is removed when it has the expected form;
    otherwise the string is split as-is and the tags are dropped when the
    text of each fragment is extracted.
    """
    inner = raw_html
    if inner.startswith(value_div_open) and inner.endswith(value_div_close):
        inner = inner[len(value_div_open):-len(value_div_close)]
    return [BeautifulSoup(fragment, "html.parser").text.strip()
            for fragment in inner.split("<br/>")]


def _group_by_game(fragments):
    """Turn fragments into {'game', 'value'} dicts.

    A fragment matching game_regex sets the game for the fragments that
    follow it; fragments before any marker are assigned to 'ORIGINAL'.
    """
    grouped = []
    game = 'ORIGINAL'
    for text in fragments:
        match = game_regex.search(text)
        if match:
            game = match.group(1).strip()
        else:
            grouped.append({'game': game, 'value': text})
    return grouped


for character in characters:
    # Only raw HTML strings are converted: a value that is already a list
    # means this cell has run before, so re-running it is harmless.
    for attr in simple_attributes:
        if isinstance(character.get(attr), str):
            character[attr] = _infobox_fragments(character[attr])
    for attr in attributes:
        if isinstance(character.get(attr), str):
            character[attr] = _group_by_game(_infobox_fragments(character[attr]))

# Save the post-processed characters.
with open(join(base_dir,'recurring_characters.json'), 'w') as outfile:
    json.dump(characters, outfile, indent=4)

In [8]:
# Show the name and stored 'attack_method' of each character that has one.
with_attack_method = (entry for entry in characters if 'attack_method' in entry)
for entry in with_attack_method:
    print(entry['name'], entry['attack_method'])

Dark Link <div class="pi-data-value pi-font">Mimicking <a href="/wiki/Link" title="Link">Link</a>'s attacks<br/>(<u><i>Tri Force Heroes</i></u>)<br/><a href="/wiki/Jinx" title="Jinx">Jinx</a></div>
