# Cleaning the data
The following steps are applied to clean the scraped data:
- Check for any missing data
- Deal with the missing data
- Check for duplicate rows
- Remove all the numbers in the columns
- Split the Hogwarts staff entries into name, sub-category and subject
- Replace the sub-category values of Hogwarts staff with Professors
- Edit rows that contain `\n` in them


In [1]:
import io
import os
import re
import sqlite3

import pandas as pd
from IPython.display import display

In [2]:
# Read the scraped data from the CSV stored in the data directory
data_dir = 'data'
# Build the path from data_dir so the directory is defined in exactly one place
data = pd.read_csv(os.path.join(data_dir, 'Hogwarts_legacy.csv'))
data.head()

Unnamed: 0,Names,Categories,Sub_Categories
0,Unidentified student,Hogwarts students,Unknown House
1,Isaac Cooper,Hogwarts students,Unknown House
2,Arthur Siggs[20],Hogwarts students,Unknown House
3,Adelaide Oakes,Hogwarts students,Hufflepuff
4,Arthur Plummly,Hogwarts students,Hufflepuff


In [3]:
class DataClean:
    """Cleaning helpers for a DataFrame of scraped character data.

    The DataFrame passed to the constructor is stored by reference and every
    cleaning step modifies it in place, so the caller's own variable always
    reflects the steps that have been run.

    The ``staff`` step reads the ``Names``, ``Categories`` and
    ``Sub_Categories`` columns and adds a ``Subject`` column.
    """

    # Digits and square brackets, written as a raw string so the backslashes
    # reach the regex engine without invalid-escape warnings.
    _EXTRAS_PATTERN = re.compile(r'[\d\[\]]')

    def __init__(self, df):
        """Store ``df`` (without copying it) and reset the cached info text."""
        self.df = df
        self.info_str = ''

    # Cleaning the Hogwarts staff
    def staff(self):
        """Split comma-separated Hogwarts staff names into separate columns.

        For rows whose ``Categories`` equals ``'Hogwarts staff'`` the
        ``Names`` value is split on commas: the first part replaces
        ``Names``, the second replaces ``Sub_Categories`` and the third is
        stored in a new ``Subject`` column. ``Subject`` is NaN for all other
        rows. The parts are not stripped of surrounding whitespace.
        """
        is_staff = self.df['Categories'] == 'Hogwarts staff'
        # reindex guarantees that columns 0, 1 and 2 exist even when no staff
        # name contains two commas (missing parts become NaN).
        parts = (
            self.df.loc[is_staff, 'Names']
            .str.split(',', expand=True)
            .reindex(columns=[0, 1, 2])
        )
        if is_staff.any():
            # A single .loc call writes into self.df itself; chained
            # indexing (df[col].loc[mask] = ...) may write to a temporary.
            self.df.loc[is_staff, 'Names'] = parts[0]
            self.df.loc[is_staff, 'Sub_Categories'] = parts[1]
        self.df['Subject'] = parts[2]

    def info(self):
        """Print the ``DataFrame.info()`` summary and keep it in ``info_str``."""
        # DataFrame.info() returns None and writes to a buffer, so capture
        # the text explicitly instead of calling str() on the return value.
        buffer = io.StringIO()
        self.df.info(buf=buffer)
        self.info_str = buffer.getvalue()
        print(self.info_str, end='')

    # Check for duplicated data
    def duplicated_data(self):
        """Return the number of rows that duplicate an earlier row."""
        return int(self.df.duplicated().sum())

    def _missing_summary(self):
        """Build the table of columns that have a non-zero missing percentage."""
        missing_percent = round(self.df.isna().sum() / len(self.df) * 100, 2)
        missing_percents = pd.DataFrame(
            {'column_name': self.df.columns, 'percent_missing': missing_percent}
        )
        return missing_percents[missing_percents['percent_missing'] != 0]

    # Check for missing data
    def missing_data(self):
        """Display the percentage of missing values per affected column."""
        display(self._missing_summary())

    # Replace missing data with not applicable
    def replace_missing(self):
        """Replace every missing value with the string ``'Not Applicable'``."""
        # Kept in place on purpose: the caller's reference to the frame must
        # see the filled values.
        self.df.fillna('Not Applicable', inplace=True)

    # Remove numbers from the columns and words that come after \n
    def rmv_extras(self):
        """Blank out digits/brackets and drop text after the first newline.

        Each digit and square bracket in a string value is replaced by a
        single space (so trailing spaces may remain), then everything from
        the first ``\\n`` onwards is discarded. Non-string values are left
        untouched.
        """
        def _clean(value):
            if not isinstance(value, str):
                return value
            return self._EXTRAS_PATTERN.sub(' ', value).split('\n', 1)[0]

        for col_name in self.df.columns:
            self.df[col_name] = self.df[col_name].map(_clean)

In [4]:
# Wrap the loaded DataFrame in a DataClean instance (the frame is stored by reference, not copied)
data_clean = DataClean(data)

In [5]:
# Print the general DataFrame information (columns, non-null counts and dtypes)
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Names           239 non-null    object
 1   Categories      239 non-null    object
 2   Sub_Categories  239 non-null    object
dtypes: object(3)
memory usage: 5.7+ KB


In [6]:
# Split the comma-separated Hogwarts staff names into Names, Sub_Categories and a new Subject column
data_clean.staff()

In [7]:
# Show the percentage of missing values for each column that has any
data_clean.missing_data()

Unnamed: 0,column_name,percent_missing
Subject,Subject,94.98


In [8]:
# Deal with the missing data by replacing it with 'Not Applicable'
data_clean.replace_missing()

In [9]:
# Confirm that no missing data remains (an empty table is expected)
data_clean.missing_data()

Unnamed: 0,column_name,percent_missing


In [10]:
# Check if there is any duplicated data
data_clean.duplicated_data()

In [11]:
# Replace digits and square brackets with spaces and drop any text after a newline
data_clean.rmv_extras()

In [12]:
# Check that the cleaning steps have been applied (Subject column present, no nulls left)
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Names           239 non-null    object
 1   Categories      239 non-null    object
 2   Sub_Categories  239 non-null    object
 3   Subject         239 non-null    object
dtypes: object(4)
memory usage: 7.6+ KB


Save the cleaned data to CSV

In [13]:
data_dir = 'data'

# exist_ok=True creates the directory only when needed, without a separate
# (race-prone) os.path.exists() check.
os.makedirs(data_dir, exist_ok=True)
# All columns are written by default, so no explicit column list is required.
data.to_csv(os.path.join(data_dir, 'Hogwarts_legacy_cleaned.csv'), index=False, encoding='UTF-8')