# Harry Potter and the Deathly Hallows
## 1. Import

In [1]:
import nltk
import re
from nltk import CFG
import string


In [2]:
'''
Step 2. Import corpus and segment into sentences
'''

text = ""
f = open('deathly_hallows.txt')
lines = f.readlines()


for line in lines[1:2]:
    text += line[4:].replace('\"', " ").replace('\\', '').strip()


all_sentences = [l.strip().replace("Harry Potter", "HarryPotter") for l in re.split('\.|\?', text) if l]
print(all_sentences)

['The two men appeared out of nowhere, a few yards apart in the narrow, moonlit lane', "For a second they stood quite still, wands directed at each other's chests; then, recognizing each other, they stowed their wands beneath their cloaks and started walking briskly in the same direction", 'News', 'asked the taller of the two', 'The best,  replied Severus Snape', 'The lane was bordered on the left by wild, low-growing brambles, on the right by a high, neatly manicured hedge', "The men's long cloaks flapped around their ankles as they marched", 'Thought I might be late,  said Yaxley, his blunt features sliding in and out of sight as the branches of overhanging trees broke the moonlight', 'It was a little trickier than I expected', 'But I hope he will be satisfied', 'You sound confident that your reception will be good', 'Snape nodded, but did not elaborate', 'They turned right, into a wide driveway that led off the lane', "The high hedge curved into them, running off into the distance b

## 2. Statistics

In [3]:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

'''
Step 2. Tokenize text
'''
tokens = nltk.word_tokenize(text)
tokens = [token.lower() for token in tokens]

list_of_tokens = nltk.pos_tag(tokens)
# print(list_of_tokens)

print("Aantal woorden:" , len(tokens))

'''
Tokenize sentences
'''
sentences = nltk.sent_tokenize(text)
print("Aantal zinnen: ", len(sentences))
print(sentences)

# normalized_text = "".join([w.lower() for w in text])
# normalized_tokens = nltk.word_tokenize(normalized_text)

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()


# normalized_tokens = [porter.stem(t) for t in tokens]
# normalized_tokens = [lancaster.stem(t) for t in tokens]

unique_normalized_tokens = set(tokens)

wnl = nltk.WordNetLemmatizer()
vocabulary = [wnl.lemmatize(t) for t in unique_normalized_tokens]
print("Aantal vocabulary: ", len(vocabulary))


# vocab_2 = sorted(set(words))


hapaxes = []
most_frequent = ""
fdist = nltk.FreqDist(tokens)

del fdist[',']
del fdist['.']


sorted_frequencies = sorted(fdist, key=fdist.get, reverse=True)
for word in sorted_frequencies:
    if word not in string.punctuation:
        if fdist[word] == 1:
            hapaxes.append((word, fdist[word]))
#         print(word, fdist[word], end='\n')

print("Most frequent word: ", fdist.most_common(1)[0])
# for hapax in hapaxes:
#     print(hapax)
    
    
# print(fdist.most_common(10))


Aantal woorden: 3661
Aantal zinnen:  215
['The two men appeared out of nowhere, a few yards apart in the narrow, moonlit lane.', "For a second they stood quite still, wands directed at each other's chests; then, recognizing each other, they stowed their wands beneath their cloaks and started walking briskly in the same direction.", 'News?', 'asked the taller of the two.', 'The best,  replied Severus Snape.The lane was bordered on the left by wild, low-growing brambles, on the right by a high, neatly manicured hedge.', "The men's long cloaks flapped around their ankles as they marched.", 'Thought I might be late,  said Yaxley, his blunt features sliding in and out of sight as the branches of overhanging trees broke the moonlight.', 'It was a little trickier than I expected.', 'But I hope he will be satisfied.', 'You sound confident that your reception will be good?', 'Snape nodded, but did not elaborate.', 'They turned right, into a wide driveway that led off the lane.', "The high hedge

Aantal vocabulary:  1032
Most frequent word:  ('the', 195)


## 3. Tokenizing

In [4]:
from collections import defaultdict

tag_dict = defaultdict(list)

# Tag
tokens = nltk.word_tokenize(text)
tokens = [token.lower() for token in tokens]
tagged_sent = nltk.pos_tag(tokens)

for word, tag in tagged_sent:
    if tag not in tag_dict:
        tag_dict[tag].append(word)
    elif word not in tag_dict.get(tag):
        tag_dict[tag].append(word)

lexicon = ""
# Printing to screen
for tag, words in tag_dict.items():
    if ',' in tag or ':' in tag or '.' in tag or '$' in tag:
        continue
    lexicon += tag + " -> "
#     print(tag, "->", end='')
    first_word = True
    for word in words:
        if first_word:
            lexicon += "\"" + word + "\""
#             print("\"" + word + "\"", end='')
            first_word = False
        else:
            lexicon += "| \"" + word + "\""
#             print("| \"" + word + "\"", end='')
#     print('\n')
    lexicon += "\n"

print(lexicon)


RBR -> "more"| "louder"
NN -> "narrow"| "lane"| "direction"| "news"| "taller"| "severus"| "snape.the"| "right"| "hedge"| "cloaks"| "i"| "yaxley"| "blunt"| "sight"| "moonlight"| "hope"| "reception"| "snape"| "driveway"| "distance"| "pair"| "way"| "step"| "silence"| "kind"| "salute"| "dark"| "metal"| "yew"| "sound"| "rustle"| "wand"| "companion"| "head"| "source"| "noise"| "nothing"| "peacock"| "top"| "lucius"| "cloak"| "manor"| "house"| "darkness"| "end"| "drive"| "diamond"| "garden"| "fountain"| "gravel"| "front"| "door"| "inward"| "approach"| "nobody"| "hallway"| "lit"| "carpet"| "stone"| "floor"| "wall"| "wooden"| "room"| "space"| "heartbeat"| "bronze"| "handle.the"| "table"| "furniture"| "illumination"| "fire"| "beneath"| "mantelpiece"| "mirror"| "moment"| "threshold"| "lack"| "light"| "feature"| "scene"| "figure"| "rope"| "bare"| "surface"| "none"| "man"| "minute"| "voice"| "speaker"| "fireplace"| "silhouette"| "nearer"| "face"| "shone"| "gloom"| "hairless"| "snakelike"| "glow"| "v

## 4. Grammar

In [5]:
# print(all_sentences[0:10])
# print()
from nltk.grammar import FeatureGrammar
from nltk.parse import RecursiveDescentParser, FeatureEarleyChartParser
from nltk.parse.generate import generate


tagged_sent = nltk.pos_tag(tokens)
# print(tagged_sent)

extend_grammar = """
    S -> NP
    NP -> DT JJ NN VBD
"""

cfg_1 = CFG.fromstring(extend_grammar + lexicon)
cfg_1_parser = RecursiveDescentParser(cfg_1)

feature_grammar = """
    S -> NP VP
    NP -> DT NN
    VP -> VB NP
"""

cfg_2 = FeatureGrammar.fromstring(feature_grammar + lexicon)
cfg_2_parser = FeatureEarleyChartParser(cfg_2)

In [74]:
# print(cfg_2)

# groucho_grammar = nltk.CFG.fromstring("""
#     S -> NP VP
#     PP -> P NP
#     NP -> Det N | Det N PP | 'I'
#     VP -> V NP | VP PP
#     Det -> 'an' | 'my'
#     N -> 'elephant' | 'pajamas'
#     V -> 'shot'
#     P -> 'in'
# """)

# sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(cfg_2)
for tree in parser.parse(['the', 'two', 'men', 'appeared', 'out', 'of', 'nowhere', 'a', 'few', 'yards', 'apart', 'in', 'the', 'narrow', 'moonlit', 'lane']):
    print(tree)


In [60]:

feature_grammar = """
    S -> NP VP
    NP -> DT NN
    VP -> VB NP
"""

cfg_2 = FeatureGrammar.fromstring(feature_grammar + lexicon)

# The following inputs should produce the corresponding results
for s in generate(cfg_2, n=10): 
    print(' '.join(s))

the narrow be the narrow
the narrow be the lane
the narrow be the direction
the narrow be the news
the narrow be the taller
the narrow be the severus
the narrow be the snape.the
the narrow be the right
the narrow be the hedge
the narrow be the cloaks


In [47]:
tokenized_sentences = []
for sentence in sentences:
    tokenzed_sen = []
    for word in sentence.split(" "):
        for tag in tagged_sent:
            if tag[0] in word.lower().replace(",", ""):
                tokenzed_sen.append(tag[1])
                break
    tokenized_sentences.append(tokenzed_sen)


for idx, sentence in enumerate(tokenized_sentences):
    print(sentence, "\n", all_sentences[idx])

['DT', 'CD', 'NNS', 'VBD', 'IN', 'IN', 'RB', 'DT', 'JJ', 'DT', 'DT', 'IN', 'DT', 'DT', 'JJ', 'DT'] 
 The two men appeared out of nowhere, a few yards apart in the narrow, moonlit lane
['IN', 'DT', 'JJ', 'DT', 'VBD', 'RB', 'RB', 'DT', 'VBD', 'DT', 'DT', 'DT', 'NNS', 'DT', 'IN', 'DT', 'DT', 'DT', 'VBD', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT', 'RB', 'IN', 'DT', 'DT', '.'] 
 For a second they stood quite still, wands directed at each other's chests; then, recognizing each other, they stowed their wands beneath their cloaks and started walking briskly in the same direction
['NN'] 
 News
['DT', 'DT', 'DT', 'IN', 'DT', 'CD'] 
 asked the taller of the two
['DT', 'JJS', 'VBD', 'NN', 'DT', 'DT', 'DT', 'VBN', 'IN', 'DT', 'VBN', 'IN', 'JJ', 'IN', 'DT', 'IN', 'DT', 'NN', 'IN', 'DT', 'JJ', 'DT', 'DT', '.'] 
 The best,  replied Severus Snape
['DT', 'NNS', 'IN', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT', 'DT'] 
 The lane was bordered on the left by wild, low-growing brambles, on the right by a 

## Cool stuff

NameError: name 'c' is not defined