In [1]:
import json
import os
import re

import pandas as pd
import requests

# Part A: Download the Zelda wiki pages of characters

In [2]:
# Pieces of the api.php request for the three Breath of the Wild overview
# pages (characters, enemies, bosses); the page titles are separated by "|".
baseurl = "https://zelda.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content&rvslots=*"
title = "titles=Characters in Breath of the Wild|Enemies in Breath of the Wild|Bosses in Breath of the Wild"
dataformat = "format=json"

# Endpoint followed by the "&"-joined parameters, in the same order as before.
query = baseurl + "&".join([action, content, title, dataformat])
print(query)

https://zelda.fandom.com/api.php?action=query&prop=revisions&rvprop=content&rvslots=*&titles=Characters in Breath of the Wild|Enemies in Breath of the Wild|Bosses in Breath of the Wild&format=json


In [3]:
# Fetch the overview pages in a single request and cache the decoded JSON on
# disk so the following cells can work from the file instead of the network.
# The timeout keeps a stalled connection from hanging the kernel (requests
# waits indefinitely by default), and raise_for_status() stops an HTTP error
# response from being cached as if it were valid data.
response = requests.get(query, timeout=30)
response.raise_for_status()
wikitext = response.json()
with open('zelda-characters.json', 'w') as f:
    json.dump(wikitext, f)

In [4]:
# Reload the cached API response from disk: `data` holds the raw file text,
# `wikitext` the decoded JSON structure.
with open('zelda-characters.json', 'r') as f:
    data = f.read()
wikitext = json.loads(data)

In [28]:
# Build one {"Type", "Name"} record for every character link found on the
# downloaded overview pages, then create the DataFrame in a single step.
# (Growing a frame row by row with DataFrame.append is quadratic, and the
# method no longer exists in pandas 2.x.)
records = []

for page in wikitext["query"]["pages"].values():
    # The first word of the page title ("Characters", "Enemies", "Bosses")
    # is used as the character type.
    character_type = page["title"].split(" ")[0]
    raw_text = page["revisions"][0]["slots"]["main"]["*"]

    if character_type == "Enemies":  # Truncate to remove trap portion of enemies
        raw_text = raw_text.split("Traps")[0]

    # Negative lookbehind/lookahead skip links wrapped in parentheses, which
    # ensures we don't match the location links.
    matches = re.findall(r'(?<!\(){{\w*\|BotW\|([^|]*)\|link}}(?!\))', raw_text)
    records.extend({"Type": character_type, "Name": match} for match in matches)

# Explicit columns keep the frame well-formed even when nothing matched.
df = pd.DataFrame(records, columns=["Type", "Name"])




In [6]:
# Map the page-title words to the role labels used from here on.
df = df.replace({"Characters": "Ally", "Bosses": "Boss", "Enemies": "Enemy"})

In [7]:
# Number of rows per character type.
df["Type"].value_counts()

Ally     573
Enemy     72
Boss      25
Name: Type, dtype: int64

In [8]:
# Drop trivial duplicates, i.e. rows that repeat both Type and Name.
df = df.drop_duplicates()

In [9]:
# Boolean mask marking rows whose Name already appeared in an earlier row.
# Note: the next cell rebinds `dupes` to per-name counts.
dupes = df.duplicated(subset=["Name"])

In [10]:
# Count the rows per character name and show the names that occur more than once.
dupes = df.pivot_table(index=["Name"], aggfunc="size")
dupes.loc[dupes.gt(1)]

Name
Calamity Ganon    2
Master Kohga      2
dtype: int64

In [11]:
# All rows belonging to a name that is listed more than once.
repeated_names = dupes.loc[dupes > 1].index
df2 = df.loc[df["Name"].isin(repeated_names)]
df2

Unnamed: 0,Type,Name
21,Boss,Master Kohga
23,Boss,Calamity Ganon
140,Ally,Calamity Ganon
365,Ally,Master Kohga


The non-trivial duplicates are cases where the same character is listed both as an Ally and as a Boss. We consider the Boss classification to carry more meaning and will therefore drop the duplicates listed as allies.

In [12]:
# Index labels of the "Ally" rows for the two characters that are also bosses.
is_ally = df["Type"] == "Ally"

i = df.loc[is_ally & (df["Name"] == "Calamity Ganon")].index

i2 = df.loc[is_ally & (df["Name"] == "Master Kohga")].index
i2

Int64Index([365], dtype='int64')

In [13]:
# Remove the Ally row(s) of Calamity Ganon found above.
df = df.drop(index=i)


In [14]:
# Remove the Ally row(s) of Master Kohga found above.
df = df.drop(index=i2)

In [25]:
# Persist the cleaned character list; the index is not written to the file.
df.to_csv(path_or_buf='zelda-characters.csv', index=False)

Save all character pages as JSON files:

In [33]:
# Same request pieces as for the overview pages, here for a single character
# page, to inspect what the query URL looks like.
baseurl = "https://zelda.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content&rvslots=*"
title = "titles=Stone_Talus_(Junior)"
dataformat = "format=json"

query = f"{baseurl}{action}&{content}&{title}&{dataformat}"
print(query)

https://zelda.fandom.com/api.php?action=query&prop=revisions&rvprop=content&rvslots=*&titles=Stone_Talus_(Junior)&format=json


In [26]:
# Load the cleaned character list written earlier in the notebook.
characters = pd.read_csv(filepath_or_buffer="zelda-characters.csv")


Unnamed: 0,Type,Name
0,Boss,Stone Talus
1,Boss,Stone Talus (Junior)
2,Boss,Stone Talus (Senior)
3,Boss,Stone Talus (Luminous)
4,Boss,Stone Talus (Rare)
...,...,...
652,Enemy,Treasure Octorok
653,Enemy,Water Octorok
654,Enemy,White-Maned Lynel
655,Enemy,Yiga Blademaster


In [52]:
# Download the wiki page of every character and store the API response as
# characters/<Page_Name>.json. When a character's page is a redirect, the
# mapping "character name -> redirect target" is recorded in `aliases` and the
# target page is downloaded and stored instead.

CHARACTER_DIR = "characters"

# A redirect page starts with "#REDIRECT [[Target]]". The capture stops at
# "]", "|" or "#", so later links on the page, display text and section
# anchors do not end up in the target title.
REDIRECT_PATTERN = re.compile(r"\s*#REDIRECT\s*\[\[([^\]|#]+)", re.IGNORECASE)

# Every part of the request except the page title; the title is passed via
# `params` so that requests percent-encodes characters such as "&" or "#".
page_query = "{}{}&{}&{}".format(baseurl, action, content, dataformat)


def fetch_page(page_title):
    """Return the decoded JSON API response for a single wiki page title.

    Raises requests.HTTPError on an HTTP error status and
    requests.Timeout if the server does not answer within 30 seconds.
    """
    response = requests.get(page_query, params={"titles": page_title}, timeout=30)
    response.raise_for_status()
    return response.json()


def find_redirect_target(api_response):
    """Return the redirect target named in the page content, or None.

    Only the revision text itself is inspected (not the string form of the
    whole response), so repr escaping cannot leak into the target title.
    """
    for page in api_response.get("query", {}).get("pages", {}).values():
        # Pages without a "revisions" entry simply yield no match.
        for revision in page.get("revisions", []):
            text = revision.get("slots", {}).get("main", {}).get("*", "")
            found = REDIRECT_PATTERN.match(text)
            if found:
                return found.group(1).strip()
    return None


# Make sure the output directory exists before the first file is written.
os.makedirs(CHARACTER_DIR, exist_ok=True)

aliases = {}

for name in characters.Name:
    formatted_name = name.replace(" ", "_")
    wikitext = fetch_page(formatted_name)

    target = find_redirect_target(wikitext)
    if target:
        # Update dictionary containing aliases (redirects)
        aliases[name] = target

        # Fetch correct page for redirected character
        formatted_name = target.replace(" ", "_")
        wikitext = fetch_page(formatted_name)

    with open(f'{CHARACTER_DIR}/{formatted_name}.json', 'w') as f:
        json.dump(wikitext, f)

Cherry
Ancient Oven


# Part B: Building the network