In [1]:
## Basic description (in progress)

import nltk
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# this directory exists. Consider making it configurable.
corpus_root = '/home/willem/Documents/TTV'

# Treat every file under corpus_root as a plain-text document ('.*' matches all file ids).
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()  # not the cell's last expression, so the listing is not displayed
Narnia = wordlists.words('Narnia')

# Count how often each token occurs in the 'Narnia' file.
fdist1 = FreqDist(Narnia)
print("most occcuring non-trivial words:")
print()
# "Non-trivial" here means: longer than 3 characters and seen more than 5 times.
# Walking the distribution from most to least frequent yields every qualifying
# word exactly once in a reproducible order; building a set from all corpus
# tokens produced the same words but in an arbitrary order that could change
# between runs, and performed one lookup per token instead of one per word.
print([word for word, count in fdist1.most_common() if len(word) > 3 and count > 5])



most occcuring non-trivial words:

['them', 'into', 'house', 'said', 'There', 'Edmund', 'with', 'Lucy', 'they', 'that', 'were', 'came', 'room']


In [7]:
## Function to check if sentence can be parsed with parser of choice

from nltk import CFG
from nltk.grammar import FeatureGrammar
from nltk.parse import RecursiveDescentParser, FeatureEarleyChartParser, ShiftReduceParser
from nltk.parse.generate import generate

# Function that works for multiple types of parsers (You are free to use something else if you want.)
def check_sentence(parser, sentence):
    """Print every parse tree that `parser` finds for `sentence`.

    Args:
        parser: Any object with a ``parse(tokens)`` method returning an
            iterable of trees.
        sentence: Either a string, which is split on whitespace, or an
            already tokenized sequence that is passed to the parser as is.

    Returns:
        True if the parser produced at least one tree, otherwise False.
        A ``ValueError`` raised by the parser (NLTK parsers raise it when
        the grammar does not cover one of the input words) is reported and
        treated as "no parse" instead of aborting the cell.
    """
    print("--------------------------------------------------")
    print("Checking if provided sentence matches the grammar:")
    print(sentence)
    if isinstance(sentence, str):
        sentence = sentence.split()
    tree_found = False
    try:
        # Both the parse() call and the iteration are guarded because,
        # depending on the parser, the error may surface at either point.
        for tree in parser.parse(sentence):
            tree_found = True
            print(tree)
    except ValueError as error:
        # Show the parser's own explanation (e.g. which words are unknown).
        print(error)
    if not tree_found:
        print(sentence, "Does not match the provided grammar.")
    print("--------------------------------------------------")
    return tree_found

In [66]:
## The actual context-free grammar 

# Hand-written grammar for the sentences checked in the parser cell below.
# It is built as a FeatureGrammar although no feature structures are used.
# Lines starting with '#' inside the grammar text are skipped by NLTK's
# grammar reader. A trailing backslash joins a rule with the next line, so
# every continued line must start with '|' -- otherwise the two neighbouring
# words silently merge into a single two-token alternative.
cfg = FeatureGrammar.fromstring("""
    
    # sentences
    S -> NP VP
    S -> VP
    S -> ADVP VP
    S -> S WHNP
    S -> S Comma CC S
    S -> S CC S
     
    SBAR -> WHNP S 
     
    # constituents
    NP -> Det N
    NP -> Det N N
    NP -> N N
    NP -> NNP
    NP -> Pos N
    NP -> ADJP N
    NP -> Det ADJP N
    NP -> ADJP NP
    NP -> NP Comma NP
    NP -> Pro
    NP -> NP SBAR
    NP -> NN SBAR
    NP -> NP PP
    NP -> NP CC NP
    NP -> JJS
    
    VP -> V NP
    VP -> V ADVP PP
    VP -> V ADJP PP
    VP -> V Adv VP
    VP -> V NP PP
    VP -> V PP
    VP -> V PP ADVP
    VP -> V VP
    
    PP -> TO NP
    PP -> IN NP
    PP -> IN IN NP
    PP -> Prep NP
    PP -> PP CC PP
    PP -> PP PP
    
    ADVP -> Adv

    ADJP -> Adj N
    ADJP -> Adj Pro
    ADJP -> IterAdj
    ADJP -> Adv Adj
    
    IterAdj -> IterAdj IterAdj
    IterAdj -> Adj
    IterAdj -> IterAdj Comma IterAdj
    
    
    WHNP -> Pos N VP
    WHNP -> Det
    WHNP -> WDT
    
    
    # lexicon
    Det ->  'the' | 'an' | 'this' | 'that' | 'no' | 'a'
    V -> 'were' | 'sent' | 'is' | 'happened' | 'lived' | 'had' | 'come' \
         | 'called' | 'was' | 'do' | 'grew'
    Adj -> 'four' | 'old' | 'ten' | 'nearest' | 'two' | 'large' \
            | 'three' | 'mrs.' | 'shaggy' | 'white'
    Adv -> 'away' | 'there' | 'very' | 'not' |  'much'
    IN -> 'from' | 'during' | 'because' | 'of' | 'in' | 'with' | 'into' | 'over' \
          | 'on'
    TO -> 'to'
    N -> 'children' | 'names' | 'war' | 'air-raids' | 'house' | 'professor' \
          | 'story' | 'heart' | 'country' | 'post' | 'office' | 'railway' | 'station' \
          | 'miles' | 'wife' | 'housekeeper' | 'servants' | 'man' | 'face' | 'hair' \
          | 'head'
    NNP -> 'london'
    NN -> 'something'
    Pos -> 'whose' | 'their' | 'his'
    Pro -> 'peter' | 'susan' | 'edmund' | 'lucy' | 'they' | 'them' | 'he' \
           | 'ivy' | 'margaret' | 'betty' | 'macready'
    CC -> 'and' | 'but'
    Comma -> ','
    Prep -> 'about'
    WDT -> 'which'
    JJS -> 'most'
    

""")

In [67]:
## generating some sentences in non-random fashion

# Show the first four sentences the generator produces from the grammar,
# one per line, with the tokens separated by single spaces.
for sentence in generate(cfg, n=4):
    print(*sentence, sep=' ')

the children were the children
the children were the names
the children were the war
the children were the air-raids


In [84]:
## check if sentences can be parsed

# Earley chart parser built from the grammar defined above.
cfg_parser = FeatureEarleyChartParser(cfg)

# Earlier checks, kept disabled; uncomment a line to run that sentence.
#check_sentence(cfg_parser, 'There were four children whose names were Peter , edmund , lucy and susan'.lower())
#check_sentence(cfg_parser, 'this story is about something that happened to them'.lower())
#check_sentence(cfg_parser, 'they were sent away from London during the war because of the air-raids'.lower())
#check_sentence(cfg_parser, 'They were sent to the house of an old Professor'.lower())
#check_sentence(cfg_parser, 'The Professor lived in the heart of the country'.lower())
#check_sentence(cfg_parser, 'he lived ten miles from the nearest railway station and he lived two miles from the nearest post office'.lower())
#check_sentence(cfg_parser, 'Their names were Ivy , Margaret and Betty , but they do not come into the story much'.lower())
#check_sentence(cfg_parser, 'the housekeeper was a very old man'.lower())

# The lexicon is all lower case, so the sentence is lower-cased before parsing.
# The call is the cell's last expression, so its boolean result is displayed.
professor_sentence = 'The professor was a very old man with shaggy white hair'
check_sentence(cfg_parser, professor_sentence.lower())

--------------------------------------------------
Checking if provided sentence matches the grammar:
the professor was a very old man with shaggy white hair
(S[]
  (NP[] (Det[] the) (N[] professor))
  (VP[]
    (V[] was)
    (NP[]
      (NP[] (Det[] a) (ADJP[] (Adv[] very) (Adj[] old)) (N[] man))
      (PP[]
        (IN[] with)
        (NP[]
          (ADJP[]
            (IterAdj[]
              (IterAdj[] (Adj[] shaggy))
              (IterAdj[] (Adj[] white))))
          (N[] hair))))))
(S[]
  (NP[] (Det[] the) (N[] professor))
  (VP[]
    (V[] was)
    (NP[]
      (NP[] (Det[] a) (ADJP[] (Adv[] very) (Adj[] old)) (N[] man))
      (PP[]
        (IN[] with)
        (NP[]
          (ADJP[] (IterAdj[] (Adj[] shaggy)))
          (NP[] (ADJP[] (IterAdj[] (Adj[] white))) (N[] hair)))))))
(S[]
  (NP[] (Det[] the) (N[] professor))
  (VP[]
    (V[] was)
    (NP[] (Det[] a) (ADJP[] (Adv[] very) (Adj[] old)) (N[] man))
    (PP[]
      (IN[] with)
      (NP[]
        (ADJP[]
          (IterAdj[

True