In [26]:
import requests
import json
import re
import pandas as pd

# Part A: Download the Zelda wiki pages of characters

In [27]:
# Assemble the wiki API request URL from its individual query-string pieces.
baseurl = "https://zelda.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content&rvslots=*"
# The three list pages are requested in one call, separated by "|".
title = "titles=" + "|".join([
    "Characters in Breath of the Wild",
    "Enemies in Breath of the Wild",
    "Bosses in Breath of the Wild",
])
dataformat = "format=json"

# Parameter order: action, content, titles, format.
query = baseurl + "&".join([action, content, title, dataformat])
print(query)

https://zelda.fandom.com/api.php?action=query&prop=revisions&rvprop=content&rvslots=*&titles=Characters in Breath of the Wild|Enemies in Breath of the Wild|Bosses in Breath of the Wild&format=json


In [12]:
# Download the list pages and cache the decoded JSON on disk.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(query, timeout=30)
# Fail loudly on an HTTP error instead of parsing/caching an error response.
response.raise_for_status()
wikitext = response.json()
with open('zelda-characters.json', 'w') as f:
    json.dump(wikitext, f)

In [13]:
# Read the cached API response back from disk as text, then decode it.
with open('zelda-characters.json') as f:
    data = f.read()

wikitext = json.loads(data)

In [30]:
# Collect one {"Type", "Name"} record per character link found on each list page.
# Rows are gathered in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
records = []

for page in wikitext["query"]["pages"].values():
    # First word of the page title: "Characters", "Enemies" or "Bosses".
    character_type = page["title"].split(" ")[0]
    raw_text = page["revisions"][0]["slots"]["main"]["*"]

    if character_type == "Enemies":  # Truncate to remove trap portion of enemies
        raw_text = raw_text.split("Traps")[0]

    # Negative lookbehind/lookahead reject templates wrapped in parentheses,
    # which ensures we don't match the location links.
    matches = re.findall(r'(?<!\(){{\w*\|BotW\|([^|]*)\|link}}(?!\))', raw_text)

    for match in matches:
        print(match)
        records.append({"Type": character_type, "Name": match})

# Explicit columns keep the frame well-formed even when nothing matched.
df = pd.DataFrame(records, columns=["Type", "Name"])




Stone Talus
Stone Talus (Junior)
Stone Talus (Senior)
Stone Talus (Luminous)
Stone Talus (Rare)
Igneo Talus
Frost Talus
Hinox
Hinox
Hinox
Hinox
Blue Hinox
Black Hinox
Stalnox
Molduga
Igneo Talus Titan
Molduking
Windblight Ganon
Fireblight Ganon
Thunderblight Ganon
Waterblight Ganon
Master Kohga
Monk Maz Koshia
Calamity Ganon
Dark Beast Ganon
Baddek
Bamboo
Banji
Baumar
Bayge
Beedle
Bolson
Brigo
Cambo
Chabi
Chork
Chumin
Dabi
Daruk
Dauntless
Dinraal
Ena
Endai
Epona
Farosh
Fyson
Goflam
Greyson
Heehl
Hestu
Hudson
Hylia
Kabetta
Kanny
Kapson
Karson
Kass
Kenyo
Kilton
Leekah
Link
Meeshy
Meghyn
Mei
Mils
Mina
Mipha
Misko
Nat
Naydra
Nazbi
Pelison
Pikango
Regan
Revali
Rhondson
Rik
Ronn
Savelle
Sherfin
Sho
Sorelia
Spoone
Toren
Totsuna
Tye
Urbosa
Yammo
Princess Zelda
Zyle
Laroba
Naddon
Pitar
Dah Hesho
Dmitri
Gleema
Jana
Kah Mael
Kaifa
Ke'nai Shakah
Lonni
Nell
Rex
Stamm
Tenne
Tutsuwa Nima
Ze Kasho
Granté
Hagie
Hunnie
Moggs
Monari
Ruli
Ritaag Zumo
Tu Ka'loh
Aya
Cherry
Hoz
Jerrin
Katosa Aug
Khini
Nobo
R

In [16]:
# Map the page-title prefixes onto the labels used from here on.
df = df.replace({"Characters": "Ally", "Bosses": "Boss", "Enemies": "Enemy"})

In [17]:
# Number of rows per character type.
df["Type"].value_counts()

Ally     573
Enemy     72
Boss      25
Name: Type, dtype: int64

In [18]:
# Remove the trivial duplicates, i.e. rows identical in every column.
df = df.drop_duplicates()

In [19]:
# Boolean mask flagging rows whose Name already appeared in an earlier row.
# NOTE: `dupes` is rebound to per-name counts in the following cell.
dupes = df.duplicated(subset=["Name"])

In [20]:
# Count how many rows carry each Name, then show the names occurring more than once.
dupes = df.pivot_table(index="Name", aggfunc="size")
dupes.loc[dupes.gt(1)]

Name
Calamity Ganon    2
Master Kohga      2
dtype: int64

In [21]:
# Pull out every row whose Name occurs more than once so the conflicts can be inspected.
repeated_names = dupes.index[dupes > 1]
df2 = df.loc[df["Name"].isin(repeated_names)]
df2

Unnamed: 0,Type,Name
21,Boss,Master Kohga
23,Boss,Calamity Ganon
140,Ally,Calamity Ganon
365,Ally,Master Kohga


The non-trivial duplicates are cases where the same character is listed both as an Ally and as a Boss. We consider the Boss classification to carry more meaning and will therefore drop the duplicates listed as allies.

In [22]:
# Index labels of the Ally rows for the two characters that are also listed as Boss.
is_ally = df["Type"] == 'Ally'

i = df.index[is_ally & (df["Name"] == "Calamity Ganon")]

i2 = df.index[is_ally & (df["Name"] == "Master Kohga")]
i2

Int64Index([365], dtype='int64')

In [23]:
# Remove the Ally row for Calamity Ganon (index labels held in `i`).
df = df.drop(index=i)


In [24]:
# Remove the Ally row for Master Kohga (index labels held in `i2`).
df = df.drop(index=i2)

In [25]:
# Persist the cleaned character table; the row index is written as the first column.
df.to_csv('zelda-characters.csv', index=True)

Save all character pages in txt files:

# Part B: Building the network