In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By 
import pandas as pd
import os
import re

# Scraping the data

In [61]:
# Configure Chrome to run headless (no visible window) with a fixed viewport size
options = Options()
for chrome_flag in ("--headless", "--window-size=1920,1200"):
    options.add_argument(chrome_flag)

# Resolve the chromedriver binary through webdriver_manager, then start the browser
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


[WDM] - Downloading: 100%|██████████| 6.79M/6.79M [00:01<00:00, 4.93MB/s]


In [17]:
def physical_attributes(nested_list, browser=None):
    '''
    Fill in the physical attributes of every character in ``nested_list``.

    Each character page is opened once. Every labelled row (a ``div`` that
    contains an ``h3``) of the third ``section`` inside the page's ``aside``
    is inspected; when the lower-cased label is one of the physical
    attributes, the text of that same row's value ``div`` is stored on the
    character under that label.

    Parameters
    ----------
    nested_list : list of dict
        One dictionary per character. The ``'url'`` entry is the page to
        visit; characters without a usable URL are left untouched.
    browser : optional
        Object exposing ``get`` and ``find_elements`` like a Selenium
        WebDriver. Defaults to the notebook-level ``driver``.

    Returns
    -------
    list of dict
        The same ``nested_list`` object, updated in place.
    '''
    physical_atts = ['species', 'gender', 'hair colour', 'eye colour','skin colour']
    if browser is None:
        browser = driver

    # Only rows that carry a label; label and value are then read from the
    # same row, so a page that lacks one attribute cannot shift the others.
    row_xpath = '//*[@id="mw-content-text"]/div/aside/section[3]/div[h3]'

    for value in nested_list:
        url = value.get('url')
        if not url:
            # Nothing to visit (e.g. the name was scraped from a non-link element)
            continue
        try:
            browser.get(url)
            for row in browser.find_elements(By.XPATH, row_xpath):
                label = row.find_element(By.XPATH, './h3').text.strip().lower()
                if label not in physical_atts:
                    continue
                cells = row.find_elements(By.XPATH, './div')
                if cells:
                    # When a row holds several value divs the last one wins
                    value[label] = cells[-1].text
        except Exception as exc:
            # Best effort: one bad page must not abort the whole scrape, but
            # the failure is reported instead of being silently discarded.
            print(f'Skipping {url}: {exc}')
    return nested_list

In [18]:
# Collect the Hogwarts staff links from the game's wiki page
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
staff = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[6]/li/a[1]')

# One record per staff member; the physical attributes start out empty and
# are filled in by physical_attributes()
hogwarts_staff = [
    {
        'name': staff_link.text,
        'url': staff_link.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None,
    }
    for staff_link in staff
]
staff_df = pd.DataFrame(physical_attributes(hogwarts_staff))



In [19]:
# Scrape the Hogwarts students, house by house
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
house_names = ['Hufflepuff', 'Gryffindor', 'Ravenclaw', 'Slytherin']
houses_students = []
# The house at position k (0-based) in house_names is read from ul[k + 2]
for list_position, name in enumerate(house_names, start=2):
    house_list = f'//*[@id="mw-content-text"]/div/ul[{list_position}]'
    # Two lookups per house: every linked entry, then the <span> inside the
    # 10th entry (a span presumably has no href, so its 'url' ends up None).
    # NOTE(review): the original special-cased 'Gryffindor' with an if/else
    # whose two branches were identical, so the check had no effect and was
    # dropped here. If the span lookup was meant for Gryffindor only,
    # restrict it explicitly.
    xpaths = [f'{house_list}/li/a', f'{house_list}/li[10]/span']
    for xpath in xpaths:
        for student in driver.find_elements(By.XPATH, xpath):
            houses_students.append({
                'house': name,
                'student': student.text,
                'url': student.get_attribute('href'),
                'species': None,
                'gender': None,
                'hair colour': None,
                'eye colour': None,
                'skin colour': None
            })
house_df = pd.DataFrame(physical_attributes(houses_students))


In [20]:
# Collect the historical wizards; each title is read from its own list,
# starting at ul[7] for the first title
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
wizard_titles = ['Keepers','Others']
historical_wizards = []
for list_position, wizard_title in enumerate(wizard_titles, start=7):
    wizards = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[{list_position}]/li/a')
    historical_wizards.extend(
        {
            'title': wizard_title,
            'name': wizard.text,
            'url': wizard.get_attribute('href'),
            'species': None,
            'gender': None,
            'hair colour': None,
            'eye colour': None,
            'skin colour': None,
        }
        for wizard in wizards
    )
# Visit every wizard's page to fill in the physical attributes
wizards_df = pd.DataFrame(physical_attributes(historical_wizards))


In [21]:
# Collect the Hogsmeade villagers from the game's wiki page
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
villagers = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[9]/li/a[1]')

# Name and link come from the list entry; every physical attribute starts as None
hogsmead_villagers = [
    {
        'name': villager.text,
        'url': villager.get_attribute('href'),
        **dict.fromkeys(['species', 'gender', 'hair colour', 'eye colour', 'skin colour']),
    }
    for villager in villagers
]
villagers_df = pd.DataFrame(physical_attributes(hogsmead_villagers))


In [22]:
# Collect the enemy (gang member) links from the game's wiki page
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
gang_members = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[10]/li/a[1]')

# Build one record per gang member, attributes to be filled in afterwards
rookwood_gang = []
for member_link in gang_members:
    record = {'name': member_link.text, 'url': member_link.get_attribute('href')}
    record.update({
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None,
    })
    rookwood_gang.append(record)

enemies_df = pd.DataFrame(physical_attributes(rookwood_gang))


In [62]:
# Scrape the locations: each top-level entry of ul[12] is paired with every
# link found in its nested list
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
locations = []
for index in range(1,8):
    main_locations = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[12]/li[{index}]/a')
    town_locations = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[12]/li[{index}]/ul/li/a')
    # An entry with no nested links yields no rows at all
    locations.extend(
        {'main_location': parent.text, 'town_location': child.text}
        for parent in main_locations
        for child in town_locations
    )
locations_df = pd.DataFrame(locations)


In [63]:
# Show the locations DataFrame built in the previous cell
locations_df

Unnamed: 0,main_location,town_location
0,Diagon Alley,Gringotts
1,Diagon Alley,Olivanders
2,Hogsmeade Valley,East Hogsmeade Valley
3,Hogsmeade Valley,Falbarton Castle
4,Hogsmeade Valley,Hogsmeade
5,Hogwarts Castle,Great Hall
6,Hogwarts Castle,Hogwarts Library
7,Hogwarts Castle,Potions Classroom
8,Hogwarts Castle,Charms Classroom
9,Hogwarts Castle,Defence Against the Dark Arts Classroom


In [24]:
# Scrape the spells listed on the game's wiki page (ul[13] to ul[15]), then
# visit each spell's own page to read its details
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')

spell_atts = ['incantation', 'type', 'hand movement','light', 'effect']
spells = []
for index in range(13,16):
    spell_list = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[{index}]/li/a')
    for spell in spell_list:
        spells.append({
            'spell_name': spell.text,
            'url': spell.get_attribute('href'),
            'incantation': None,
            'type': None,
            'light': None,
            'effect': None,
            # Declared up front so the column exists even when no spell
            # page lists a hand movement
            'hand movement': None
        })

# Rows of the aside that carry a label (<h3>). Label and value are read from
# the same row, so a page lacking one attribute cannot shift the others.
row_xpath = '//*[@id="mw-content-text"]/div/aside/section/div[h3]'
for value in spells:
    if not value['url']:
        # No page to visit for this entry
        continue
    try:
        # Each spell page is loaded once and all attributes are read from it
        driver.get(value['url'])
        for row in driver.find_elements(By.XPATH, row_xpath):
            label = row.find_element(By.XPATH, './h3').text.strip().lower()
            if label not in spell_atts:
                continue
            cells = row.find_elements(By.XPATH, './div')
            if cells:
                # When a row holds several value divs the last one wins
                value[label] = cells[-1].text
    except Exception as exc:
        # Best effort: report the failing page and keep the rest of the scrape
        print(f"Skipping {value['url']}: {exc}")
spells_df = pd.DataFrame(spells)

In [25]:
# Scrape the creatures: each category link in ul[17] is paired with every
# creature link found in that category's nested list
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
creatures = []
for index in range(1,5):
    creature_cats = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[17]/li[{index}]/a')
    creature_list = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[17]/li[{index}]/ul/li/a')
    # Cross every category link with every creature link of the same entry
    creatures += [
        {'creature_type': category.text, 'creatures': animal.text}
        for category in creature_cats
        for animal in creature_list
    ]
creatures_df = pd.DataFrame(creatures)

In [26]:
# Scrape the remaining ("other") characters, taken from the links in ul[11]
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
others_list = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[11]/li/a')

# One record per character; physical attributes are filled in afterwards
other_characters = [
    {
        'name': character_link.text,
        'url': character_link.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None,
    }
    for character_link in others_list
]
others_df = pd.DataFrame(physical_attributes(other_characters))

In [64]:
# Write every scraped DataFrame to <data_dir>/<name>.csv
data_dir = 'data'

# exist_ok=True makes the cell safe to re-run and avoids the separate
# "does it exist?" check before creating the directory
os.makedirs(data_dir, exist_ok=True)

data = {
    'staff_df': staff_df,
    'house_df': house_df,
    'enemies_df': enemies_df,
    'wizards_df': wizards_df,
    'locations_df': locations_df,
    'villagers_df': villagers_df,
    'spells_df': spells_df,
    'creatures_df': creatures_df,
    'others_df': others_df
}

for name, df in data.items():
    filename = os.path.join(data_dir, f'{name}.csv')
    # index_label=False still writes the row index as the first column but
    # omits its field from the header line.
    # NOTE(review): if the row index is not wanted in the files at all,
    # index=False is the option that drops it.
    df.to_csv(filename, index_label=False)