In [15]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By 
import pandas as pd
import os

# Scraping the data

In [16]:
# Start chrome in background without any visual output or windows
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1200")

# Initialize the driver
driver = webdriver.Chrome(options=options,service=Service(ChromeDriverManager().install()))


[WDM] - Downloading: 100%|██████████| 6.78M/6.78M [00:01<00:00, 3.72MB/s]


In [17]:
def physical_attributes(nested_list):
    '''
    This function takes in a list of dictionaries, loops through it and
    appends the physical attributes scraped from the characters to it
    '''
    physical_atts = ['species', 'gender', 'hair colour', 'eye colour','skin colour']
    for atts in physical_atts:
        for value in nested_list:
            try: 
                driver.get(value['url'])
                titles = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/aside/section[3]/div/h3')
                attributes = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/aside/section[3]/div[{physical_atts.index(atts)+1}]/div')
                for title in titles:
                    if title.text.lower() == atts:
                        for attribute in attributes:
                            value[atts] = attribute.text
            except:
                pass
    return nested_list

In [18]:
# Scrape for the hogwarts staff
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
staff = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[6]/li/a[1]')
hogwarts_staff = []
for value in staff:
    hogwarts_staff.append({
        'name': value.text,
        'url':value.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None
        })
staff_df = pd.DataFrame(physical_attributes(hogwarts_staff))



In [19]:
# Scrape for the hogwarts students by house
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
house_names = ['Hufflepuff', 'Gryffindor', 'Ravenclaw', 'Slytherin']
houses_students = []
for name in house_names:
    if name == 'Gryffindor':
        xpaths = [f'//*[@id="mw-content-text"]/div/ul[{house_names.index(name)+2}]/li/a',f'//*[@id="mw-content-text"]/div/ul[{house_names.index(name)+2}]/li[10]/span']
    else:
        xpaths = [f'//*[@id="mw-content-text"]/div/ul[{house_names.index(name)+2}]/li/a',f'//*[@id="mw-content-text"]/div/ul[{house_names.index(name)+2}]/li[10]/span']
    for xpath in xpaths:
        houses = driver.find_elements(By.XPATH, xpath)
        for house in houses:
            houses_students.append({
                'house': name,
                'student': house.text,
                'url': house.get_attribute('href'),
                'species': None,
                'gender': None,
                'hair colour': None,
                'eye colour': None,
                'skin colour': None
            })
house_df = pd.DataFrame(physical_attributes(houses_students))


In [20]:
# Scraping for the historical wizards
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
wizard_titles = ['Keepers','Others']
historical_wizards = []
for wizard_title in wizard_titles:
    wizards = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[{wizard_titles.index(wizard_title)+7}]/li/a')
    for wizard in wizards:
        historical_wizards.append({
            'title': wizard_title,
            'name': wizard.text,
            'url': wizard.get_attribute('href'),
            'species': None,
            'gender': None,
            'hair colour': None,
            'eye colour': None,
            'skin colour': None
        })
wizards_df = pd.DataFrame(physical_attributes(historical_wizards))


In [21]:
# Scraping for the villagers in Hogsmead
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
villagers = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[9]/li/a[1]')
hogsmead_villagers = []
for villager in villagers:
    hogsmead_villagers.append({
        'name': villager.text,
        'url':villager.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None
        })
villagers_df = pd.DataFrame(physical_attributes(hogsmead_villagers))


In [22]:
# Scraping for enemies 
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
gang_members = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[10]/li/a[1]')
rookwood_gang = []
for gang_member in gang_members:
    rookwood_gang.append({
        'name': gang_member.text,
        'url':gang_member.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None
        })
enemies_df = pd.DataFrame(physical_attributes(rookwood_gang))


In [23]:
# Scraping for the different locations 
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
locations = []
for index in range(1,8):
    main_locations = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[12]/li[{index}]/a')
    town_locations = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[12]/li[{index}]/ul/li')
    for main_location in main_locations:
        for town_location in town_locations:
            locations.append({
                'main_location':main_location.text,
                'town_location':town_location.text
            })
locations_df = pd.DataFrame(locations)

In [24]:
# Scraping for the Spells   
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')

spells = []
for index in range(13,16):
    spell_list = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[{index}]/li/a')
    for spell in spell_list:
        spells.append({
            'spell_name': spell.text,
            'url': spell.get_attribute('href'),
            'incantation': None,
            'type': None,
            'light': None,
            'effect': None
        })
spell_atts = ['incantation', 'type', 'hand movement','light', 'effect']
for atts in spell_atts:
    for value in spells:
        driver.get(value['url'])
        titles = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/aside/section/div/h3')
        attributes = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/aside/section/div[{spell_atts.index(atts)+1}]/div')
        for title in titles:
            if title.text.lower() == atts:
                for attribute in attributes:
                    value[atts] = attribute.text
spells_df = pd.DataFrame(spells)

In [25]:
# Scraping for the Creatures
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
creatures = []
for index in range(1,5):
    creature_cats = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[17]/li[{index}]/a')
    creature_list = driver.find_elements(By.XPATH, f'//*[@id="mw-content-text"]/div/ul[17]/li[{index}]/ul/li/a')
    for creature_cat in creature_cats:
        for creature in creature_list:
            creatures.append({
                'creature_type':creature_cat.text,
                'creatures':creature.text
            })
creatures_df = pd.DataFrame(creatures)

In [26]:
# Scrape for the hogwarts staff
driver.get('https://harrypotter.fandom.com/wiki/Hogwarts_Legacy')
others_list = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div/ul[11]/li/a')
other_characters = []
for people in others_list:
    other_characters.append({
        'name': people.text,
        'url':people.get_attribute('href'),
        'species': None,
        'gender': None,
        'hair colour': None,
        'eye colour': None,
        'skin colour': None
        })
others_df = pd.DataFrame(physical_attributes(other_characters))

In [29]:
data_dir = 'data'

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

data = {
    'staff': staff_df,
    'house': house_df,
    'enemies': enemies_df,
    'wizards': wizards_df,
    'locations': locations_df,
    'villagers': villagers_df,
    'spells': spells_df,
    'creatures': creatures_df,
    'others': others_df
}

for name, df in data.items():
    filename = os.path.join(data_dir, f'{name}.csv')
    df.to_csv(filename, index_label=False)

# Cleaning the data

The following will be done to clean the data we have scraped
- `url` on all the csvs will be dropped
- Remove all the numbers and square brackets in the values in the columns
- Remove the extra /n words in the names 
- Change the naming of some columns to have a uniform naming
- Combine several columns to make a character dataframe
- Check for null entries
- Check for duplicated rows
- Check for the data types
- Check for spelling errors 

In [32]:
enemies_df

Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,Ackley Barnes,https://harrypotter.fandom.com/wiki/Ackley_Barnes,Human[1],Male[1],Black[1],Brown[1],Light[1]
1,Catrin Haggarty,https://harrypotter.fandom.com/wiki/Catrin_Hag...,Human[1],Female[1],Black (greying)[1],,
2,Theophilus Harlow,https://harrypotter.fandom.com/wiki/Theophilus...,Human[1],Male[1],,,
3,Iona Morgan,https://harrypotter.fandom.com/wiki/Iona_Morgan,Human,Female,,,
4,Victor Rookwood,https://harrypotter.fandom.com/wiki/Victor_Roo...,Human[2],Male[2],Dark[2],,
5,Silvanus Selwyn,https://harrypotter.fandom.com/wiki/Silvanus_S...,Human,Male,,,
6,Tempeste Thorne,https://harrypotter.fandom.com/wiki/Tempeste_T...,Human,Female,Brown,,
7,Ailsa Travers,https://harrypotter.fandom.com/wiki/Ailsa_Travers,Human[1],Female[1],Black[1],Brown[1],White[1]
8,Gwendolyn Zhou,https://harrypotter.fandom.com/wiki/Gwendolyn_...,Human[2],Female[2],Black[2],,


In [1]:
class DataClean:
    def __init__(self,df):
        self.df = df
        self.info = df.info()
        self.duplicates = df.duplicated().sum()
        self.missing = df.isna().sum()
        self.dtypes = df.dtypes
        self.shape = df.shape 
    def drop_col(self, col='url'):
        return self.df.drop(col, axis=1, inplace=True)

    



In [5]:
b = staff_df

Unnamed: 0.1,Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,0,Phineas Nigellus Black,https://harrypotter.fandom.com/wiki/Phineas_Ni...,Human,Male,Black[1],Blue,Pale[1]
1,1,Matilda Weasley,https://harrypotter.fandom.com/wiki/Matilda_We...,,,,,
2,2,Eleazar Fig,https://harrypotter.fandom.com/wiki/Eleazar_Fig,Human[4],Male[4],Grey[4],,
3,3,Aesop Sharp,https://harrypotter.fandom.com/wiki/Aesop_Sharp,Human[4],Male[4],Brown[4],,
4,4,Dinah Hecat,https://harrypotter.fandom.com/wiki/Dinah_Hecat,Human[4],Female[4],Grey[4],Brown[5],Light[4]


In [7]:
a = DataClean(b)
a.dtypes

Unnamed: 0      int64
name           object
url            object
species        object
gender         object
hair colour    object
eye colour     object
skin colour    object
dtype: object

In [33]:
house_df.head()

Unnamed: 0,house,student,url,species,gender,hair colour,eye colour,skin colour
0,Hufflepuff,Adelaide Oakes,https://harrypotter.fandom.com/wiki/Adelaide_O...,Human,Female,Light brown[3],,
1,Hufflepuff,Arthur Plummly,https://harrypotter.fandom.com/wiki/Arthur_Plu...,,Black[4],Brown[4],Dark[4],
2,Hufflepuff,Charlotte Morrison,https://harrypotter.fandom.com/wiki/Charlotte_...,Human,Female,Black,Brown,Dark
3,Hufflepuff,Evangeline Bardsley,https://harrypotter.fandom.com/wiki/Evangeline...,Human,Female,Brown,,
4,Hufflepuff,Lenora Everleigh,https://harrypotter.fandom.com/wiki/Lenora_Eve...,Human[3],Female[3],Dark[3],Dark[3],Light[3]


In [12]:
staff_df

Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,Phineas Nigellus Black,https://harrypotter.fandom.com/wiki/Phineas_Ni...,Human,Male,Black[1],Blue,Pale[1]
1,Matilda Weasley,https://harrypotter.fandom.com/wiki/Matilda_We...,,,,,
2,Eleazar Fig,https://harrypotter.fandom.com/wiki/Eleazar_Fig,Human[4],Male[4],Grey[4],,
3,Aesop Sharp,https://harrypotter.fandom.com/wiki/Aesop_Sharp,Human[4],Male[4],Brown[4],,
4,Dinah Hecat,https://harrypotter.fandom.com/wiki/Dinah_Hecat,Human[4],Female[4],Grey[4],Brown[5],Light[4]
5,Mirabel Garlick,https://harrypotter.fandom.com/wiki/Mirabel_Ga...,Human[4],Female[4],Red[4],,
6,Abraham Ronen,https://harrypotter.fandom.com/wiki/Abraham_Ronen,Human[3],Male[3],"6'1""[4]",Grey[3],Brown
7,Cuthbert Binns,https://harrypotter.fandom.com/wiki/Cuthbert_B...,Human (formerly)[2]\nGhost[2],Male[2],White (balding),Black,Pale
8,Bai Howin,https://harrypotter.fandom.com/wiki/Bai_Howin,Human[3],Female[3],Black[3],Brown[3],Light[3]
9,Chiyo Kogawa,https://harrypotter.fandom.com/wiki/Chiyo_Kogawa,Human[3],Female[3],Black[3],,


In [13]:
wizards_df.head()

Unnamed: 0,title,name,url,species,gender,hair colour,eye colour,skin colour
0,Keepers,San Bakar,https://harrypotter.fandom.com/wiki/San_Bakar,Human,Male,,Light[1],
1,Keepers,Niamh Fitzgerald,https://harrypotter.fandom.com/wiki/Niamh_Fitz...,Human,Female,Light brown[4],Blue[4],
2,Keepers,Isidora Morganach,https://harrypotter.fandom.com/wiki/Isidora_Mo...,Human[4],Female[4],Brown[4],,
3,Keepers,Percival Rackham,https://harrypotter.fandom.com/wiki/Percival_R...,Human,Male,White[2],Brown[2],
4,Keepers,Charles Rookwood,https://harrypotter.fandom.com/wiki/Charles_Ro...,Human,Male,White,,


In [21]:
enemies_df.head()

Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,Ackley Barnes,https://harrypotter.fandom.com/wiki/Ackley_Barnes,Human[1],Male[1],Black[1],Brown[1],Light[1]
1,Catrin Haggarty,https://harrypotter.fandom.com/wiki/Catrin_Hag...,Human[1],Female[1],Black (greying)[1],,
2,Theophilus Harlow,https://harrypotter.fandom.com/wiki/Theophilus...,Human[1],Male[1],,,
3,Iona Morgan,https://harrypotter.fandom.com/wiki/Iona_Morgan,Human,Female,,,
4,Victor Rookwood,https://harrypotter.fandom.com/wiki/Victor_Roo...,Human[2],Male[2],Dark[2],,


In [23]:
villagers_df

Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,Thomas Brown,https://harrypotter.fandom.com/wiki/Thomas_Brown,Human,Male,Grey,Brown,Brown
1,Beatrice Green,https://harrypotter.fandom.com/wiki/Beatrice_G...,Human[1],Female[1],Grey[2],Grey[2],Light[2]
2,Augustus Hill,https://harrypotter.fandom.com/wiki/Augustus_Hill,Human,Male,,,
3,Cassandra Mason,https://harrypotter.fandom.com/wiki/Cassandra_...,Human[2],Female[2],Dark[2],,
4,Gerbold Ollivander,https://harrypotter.fandom.com/wiki/Gerbold_Ol...,Human[2],Male[2],,,
5,Ellie Peck,https://harrypotter.fandom.com/wiki/Ellie_Peck,Human[1],Female[1],Black[1],Brown[1],Dark[1]
6,Parry Pippin,https://harrypotter.fandom.com/wiki/Parry_Pippin,Human,Male,Brown,,
7,Calliope Snelling,https://harrypotter.fandom.com/wiki/Calliope_S...,Human,Female,Brown,Brown,Light
8,Timothy Teasdale,https://harrypotter.fandom.com/wiki/Timothy_Te...,Human,Male,,,
9,Thaddeus Travers,https://harrypotter.fandom.com/wiki/Thaddeus_T...,Human,Male,,,


In [28]:
others_df

Unnamed: 0,name,url,species,gender,hair colour,eye colour,skin colour
0,Leopold Babcocke,https://harrypotter.fandom.com/wiki/Leopold_Ba...,,,,,
1,Bardolph Beaumont,https://harrypotter.fandom.com/wiki/Bardolph_B...,Human (formerly)\nInferius,Male,Dark[2],,
2,Claire Beaumont,https://harrypotter.fandom.com/wiki/Claire_Bea...,Human,Female,,,
3,Bloody Baron,https://harrypotter.fandom.com/wiki/Bloody_Baron,Human[4] (formerly)\nGhost[4],Male[4],,Light[5],
4,Effie Bones,https://harrypotter.fandom.com/wiki/Effie_Bones,Human[1],Female[1],Black[1],Brown[1],Dark[1]
5,Lethia Burbley,https://harrypotter.fandom.com/wiki/Lethia_Bur...,,,,,
6,Agnes Coffey,https://harrypotter.fandom.com/wiki/Agnes_Coffey,Human[2],Female[2],Light brown[2],Brown[2],Light[2]
7,Deek,https://harrypotter.fandom.com/wiki/Deek,,,,,
8,Dorran,https://harrypotter.fandom.com/wiki/Dorran,Centaur[1],Male[1],Black[1],,
9,Crispin Dunne,https://harrypotter.fandom.com/wiki/Crispin_Dunne,Human[1],Male[1],,,
