In [1]:
# Import relevant libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import os

In [2]:
# Fandom wiki article for Hogwarts Legacy; the cells below parse this page.
root_url = 'https://harrypotter.fandom.com/wiki/Hogwarts_Legacy'

# Upper bound (seconds) on how long the request may wait for the server.
REQUEST_TIMEOUT_SECONDS = 30

# requests has no default timeout: without one, an unresponsive server would
# block this cell (and therefore "Run All") indefinitely.
page = requests.get(root_url, timeout=REQUEST_TIMEOUT_SECONDS)

In [3]:
# Report a failed download: any status other than 200 (OK) prints a message
# naming the URL. Nothing is raised, so the following cells still run.
if page.status_code != requests.codes.ok:
    print(f'There is an error with {root_url}')

In [4]:
# Parse the raw response bytes into a navigable tree using Python's
# built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [5]:
# Top-level group -> heading texts to look for on the page.
# The group name is only used as a key in `result`; headings are matched
# purely by comparing their text with each entry in the list.
relevant_h2_tags = {
    'Characters': [
        'Hogwarts students', 'Unknown House', 'Hufflepuff', 'Gryffindor',
        'Ravenclaw', 'Slytherin', 'Hogwarts staff', 'Historical wizards',
        'Keepers', 'Others', 'Hogsmeade villagers', 'Rookwood Gang',
    ],
    'Locations': ['Locations'],
    'Spells': ['Essential', 'Selectable'],
    'Potions': ['Potions'],
    'Creatures': ['Creatures']
}


def _list_items_after(heading):
    """Return the text of every <li> inside the first <ul> that follows `heading`.

    Returns None when no <ul> follows the heading at all.
    """
    ul = heading.find_next('ul')
    if not ul:
        return None
    return [li.text for li in ul.find_all('li')]


def _collect_section(heading):
    """Collect the list under `heading` and under each <h4> that follows it.

    Returns a dict mapping heading text -> list of <li> texts. The first entry
    is for `heading` itself; further entries are added for every <h4> reached
    before the next <h3> (or the end of the document). If no <ul> follows
    `heading`, an empty dict is returned and the <h4> headings are not visited.
    """
    section = {}

    items = _list_items_after(heading)
    if items is None:
        return section
    section[heading.text] = items

    # Walk forward through h3/h4 headings; stop at the first h3.
    following = heading.find_next(['h3', 'h4'])
    while following is not None and following.name == 'h4':
        sub_items = _list_items_after(following)
        if sub_items is not None:
            section[following.text] = sub_items
        following = following.find_next(['h3', 'h4'])

    return section


# The set of headings does not depend on the group or category, so collect
# it once instead of re-scanning the document on every outer iteration.
headings = soup.find_all(['h2', 'h3', 'h4'])

result = {}

for h2, categories in relevant_h2_tags.items():
    result[h2] = {}

    for category in categories:
        sub_headings = {}

        for heading in headings:
            # Only h2/h3 headings whose text equals the category exactly count
            # as a match; h4 headings are picked up by _collect_section.
            if heading.name in ('h2', 'h3') and heading.text == category:
                sub_headings.update(_collect_section(heading))
                # Recorded only when a heading matched, so categories that are
                # absent from the page get no key under result[h2].
                result[h2][category] = sub_headings

In [6]:
# Display the top-level keys stored in `result`
result.keys()

dict_keys(['Characters', 'Locations', 'Spells', 'Potions', 'Creatures'])

Create a DataFrame containing all the data from `result`.

In [7]:
# Flatten the nested `result` into a single {name: sub_category} mapping.
# Because the mapping is keyed by name, a name that occurs under several
# sub-categories appears once and keeps the last sub-category assigned to it.
# NOTE(review): names are stored exactly as scraped, so citation markers such
# as "[20]" remain part of the name - strip them here if that is not wanted.
flattened_dict = {}
for h2 in relevant_h2_tags:
    # Look each group up directly; groups missing from `result` are skipped.
    for sub_dict in result.get(h2, {}).values():
        for sub_category, names in sub_dict.items():
            for name in names:
                flattened_dict[name] = sub_category

# Build the DataFrame: one row per name, with its sub-category alongside.
# The names start out as the index and are moved into a regular 'names' column.
df = (
    pd.DataFrame.from_dict(flattened_dict, orient='index', columns=['category'])
    .rename_axis('names')
    .reset_index()
)

# Display the DataFrame
df.head()

Unnamed: 0,names,category
0,Unidentified student,Unknown House
1,Isaac Cooper,Unknown House
2,Arthur Siggs[20],Unknown House
3,Adelaide Oakes,Hufflepuff
4,Arthur Plummly,Hufflepuff


Save the DataFrame to CSV format

In [8]:
# Directory (relative to the notebook's working directory) for the output file.
data_dir = 'data'

# Create the directory if needed; exist_ok makes this a no-op when it is
# already there, with no separate existence check.
os.makedirs(data_dir, exist_ok=True)

# Build the path from data_dir so the directory that is created and the
# directory that is written to cannot drift apart.
# NOTE(review): index_label=False still writes the row index as the first
# field of each row but emits no header field for it; if the index is not
# wanted in the file at all, index=False is the option to use - confirm intent.
df.to_csv(os.path.join(data_dir, 'Hogwarts_legacy.csv'), index_label=False)