# Muggle to Wizard Data Cleaning

In [5]:
# Wipe any previously downloaded data so the notebook starts from a clean folder;
# the directory is recreated and repopulated by the download cell further down.
!rm -rf ../data

In [6]:
import os
import sys

# Make the project's `code` directory (a sibling of this notebook's working
# directory) importable, registering it on sys.path only once.
module_path = os.path.abspath('..') + '/code'
if module_path not in sys.path:
    sys.path.append(module_path)

In [7]:
import utils
from variables import *

!mkdir ../data
utils.download_files(BOOKS)
utils.download_files(EXTRAS)
utils.extract_html_table(SPELLS, na_values=['—'])

Downloaded 'hp1_sorcerers_stone' to 'data/' folder.
Downloaded 'hp2_chamber_of_secrets' to 'data/' folder.
Downloaded 'hp3_prisioner_of_azkaban' to 'data/' folder.
Downloaded 'hp4_globet_of_fire' to 'data/' folder.
Downloaded 'hp5_order_of_the_phoenix' to 'data/' folder.
Downloaded 'hp6_half_blood_prince' to 'data/' folder.
Downloaded 'hp7_deathly_hallows' to 'data/' folder.
Downloaded 'hp_places_list' to 'data/' folder.
Downloaded 'hp_characters_list' to 'data/' folder.
Downloaded 'hp_classes_list' to 'data/' folder.
Downloaded 'hp_spells_list' to 'data/' folder.


In [10]:
import glob
import json
import pandas as pd

# Containers filled below: book name -> full text, list name -> list of entries.
# (`spells` is not pre-initialised: it is always assigned from the JSON file.)
books, extras = {}, {}

# Each book becomes one long string keyed by its file name without extension;
# newlines are flattened to spaces and undecodable bytes are silently dropped.
for f in sorted(glob.glob(os.path.join('../data', '*.txt'))):
    with open(f, 'r', encoding='utf-8', errors='ignore') as file:
        books[os.path.splitext(os.path.basename(f))[0]] = file.read().replace('\n', ' ')

# Each CSV has no header row; only its first column is kept, as a plain list.
for f in sorted(glob.glob(os.path.join('../data', '*.csv'))):
    extras[os.path.splitext(os.path.basename(f))[0]] = pd.read_csv(f, header=None)[0].tolist()

# Sort so the chosen file is deterministic, and fail with a clear message
# (rather than a bare IndexError) when the spells file has not been produced yet.
spell_files = sorted(glob.glob(os.path.join('../data', '*.json')))
if not spell_files:
    raise FileNotFoundError("No '*.json' spells file found in '../data'")
spells = pd.read_json(spell_files[0], lines=True)

# Drop spells whose effect text mentions 'game'. `na=False` treats a missing
# effect as "no match": without it the mask contains NaN and `~` cannot invert it.
spells = spells[~spells['Resulting Effect'].str.contains('game', na=False)]

In [11]:
# Peek at the first ten rows to confirm the spells table loaded as expected.
spells.head(10)

Unnamed: 0,Incantation,Resulting Effect,Type
0,Aberto,Opens objects,Charm
1,Accio,Summons an object,Charm
2,Age Line,Hides things from younger people,Enchanment
3,Aguamenti,Shoots water from wand.,Charm
4,Alarte Ascendare,Shoots things high in the air,Spell
5,Alohomora,Opens locked objects,Charm
6,Anapneo,Clears the target’s airway.,Spell
7,Anteoculatia,Turns head hair into antlers,Hex
8,Anti-Cheating,Prevents Cheating on Exams,Spell
9,Aparecium,Reveals invisible ink,Spell


In [12]:
# List which auxiliary lists were loaded (keys are the CSV file names without extension).
extras.keys()

dict_keys(['hp_characters_list', 'hp_classes_list', 'hp_places_list'])

In [13]:
# Preview the first 500 characters of book 1 to check the text was read and flattened.
books['hp1_sorcerers_stone'][:500]

"Harry Potter and the Sorcerer's Stone   CHAPTER ONE   THE BOY WHO LIVED   Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.   Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large musta"

In [32]:
# Text-processing dependencies, loaded in a single pass with no try/except:
# retrying the identical statements behind a bare `except:` cannot fix a failure,
# it only hides the first error, so any problem is allowed to surface directly.
import string
import nltk
from nltk import word_tokenize, ngrams

# Fetch the stop-word corpus that is accessed through `sw`.
nltk.download('stopwords')
from nltk.corpus import stopwords as sw

[nltk_data] Downloading package stopwords to /home/seraphybr-
[nltk_data]     fun/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [37]:
# Translation table for str.translate that deletes every character listed in
# string.punctuation (each code point is mapped to None).
punc = {ord(symbol): None for symbol in string.punctuation}

def normalize_text(text):
    """Tokenize `text` and drop English stop words.

    Characters covered by the module-level `punc` table are removed before the
    text is split with `word_tokenize`. Stop words are matched case-insensitively,
    but the surviving tokens keep their original casing.

    Args:
        text: The raw string to clean.

    Returns:
        list: The remaining tokens, in their original order.
    """
    tokens = word_tokenize(text.translate(punc))
    # A set gives constant-time membership tests; scanning the stop-word list
    # once per token is needlessly slow on book-length input.
    stopwords = set(sw.words('english'))
    return [w for w in tokens if w.lower() not in stopwords]
    
def get_bigrams(text):
    """Build the 2-grams (adjacent token pairs) of `text`.

    The text is stripped of the characters in `punc`, tokenized with
    `word_tokenize`, and handed to `ngrams` with n=2. Stop words are kept.

    Returns:
        Whatever `ngrams` returns for the token list.
        NOTE(review): this is presumably a lazy, single-use iterator -- confirm
        before iterating over the result more than once.
    """
    pair_size = 2
    words = word_tokenize(text.translate(punc))
    return ngrams(words, pair_size)
    
def get_trigrams(text):
    """Build the 3-grams (runs of three adjacent tokens) of `text`.

    The text is stripped of the characters in `punc`, tokenized with
    `word_tokenize`, and handed to `ngrams` with n=3. Stop words are kept.

    Returns:
        Whatever `ngrams` returns for the token list.
        NOTE(review): this is presumably a lazy, single-use iterator -- confirm
        before iterating over the result more than once.
    """
    triple_size = 3
    words = word_tokenize(text.translate(punc))
    return ngrams(words, triple_size)

def get_wordcount(text):
    """Return the number of tokens in `text` once the `punc` characters are removed.

    Every token produced by `word_tokenize` is counted, repeats and stop words included.
    """
    return len(word_tokenize(text.translate(punc)))
    
def get_unique_wordcount(text):
    """Return how many distinct tokens `text` contains once the `punc` characters are removed.

    Tokens are compared exactly as produced by `word_tokenize`; no case folding
    is applied, so differently-cased spellings count as different tokens.
    """
    distinct_tokens = set(word_tokenize(text.translate(punc)))
    return len(distinct_tokens)
    
def get_punctuation_count(text):
    """Count the characters of `text` that appear in `string.punctuation`.

    Only the characters listed in `string.punctuation` are counted; anything
    else (letters, digits, whitespace, other symbols) is ignored.

    Args:
        text: The string to inspect.

    Returns:
        int: Number of punctuation characters found.
    """
    return sum(1 for ch in text if ch in string.punctuation)

In [38]:
# Download the Punkt tokenizer data. NOTE(review): `word_tokenize`, used by the
# helper functions, is understood to need it at call time -- so this has to run
# before the books are processed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/seraphybr-
[nltk_data]     fun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
# Per-book derived data. Keys are the book name (stop-word-free token list) and
# the book name plus a suffix for each additional metric.
books_clean = {}

for book, text in books.items():
    books_clean[book] = normalize_text(text)
    # The n-gram helpers hand back single-use iterators; store them as lists so
    # the stored values are not left empty after the first cell that reads them.
    books_clean['{}_bigrams'.format(book)] = list(get_bigrams(text))
    books_clean['{}_trigrams'.format(book)] = list(get_trigrams(text))
    books_clean['{}_wordcount'.format(book)] = get_wordcount(text)
    books_clean['{}_unique_wordscount'.format(book)] = get_unique_wordcount(text)
    books_clean['{}_punctuation_count'.format(book)] = get_punctuation_count(text)
    

In [40]:
def count_harry(l):
    """Print how many items of `l` equal 'harry', ignoring case.

    Args:
        l: An iterable of strings, e.g. a list of tokens.

    The total is written to standard output; nothing is returned.
    """
    matches = sum(1 for token in l if token.lower() == 'harry')
    print(matches)
# Sanity check on the cleaned tokens of book 7: how often does "Harry" appear?
count_harry(books_clean['hp7_deathly_hallows'])    

2887
