In [1]:
import nltk

### Grammatical Features

In [2]:
kim = {'CAT': 'NP', 'ORTH': 'Kim', 'REF': 'k'}
chase = {'CAT': 'V', 'ORTH': 'chased', 'REL': 'chase'}

In [3]:
## Cat : grammatical category, Orth : Orthography(spelling), 

In [4]:
## pairing of features and values are known as feature structure.


In [5]:
chase['AGT'] = 'sbj' # Agent
chase['PAT'] = 'obj' # patient

In [6]:
sent = "Kim chased Lee"

In [7]:
tokens = sent.split()

In [8]:
Lee = {'CAT': 'NP', 'ORTH': 'Lee', 'REF': 'l'}

In [9]:
def lex2fs(words):
    for fs in [kim, Lee, chase]:
        if fs['ORTH'] == words:
            return fs

In [10]:
subj, verb, obj = lex2fs(tokens[0]), lex2fs(tokens[1]), lex2fs(tokens[2])
verb['AGT'] = subj['REF']
verb['PAT'] = obj['REF']
for k in ['ORTH', 'REL', 'AGT', 'PAT']:
     print("%-5s => %s" % (k, verb[k]))

ORTH  => chased
REL   => chase
AGT   => k
PAT   => l


In [11]:
surprise = {'CAT': 'V', 'ORTH': 'surprised', 'REL': 'surprise',
             'SRC': 'sbj', 'EXP': 'obj'}

In [12]:
surprise

{'CAT': 'V',
 'ORTH': 'surprised',
 'REL': 'surprise',
 'SRC': 'sbj',
 'EXP': 'obj'}

### Syntactic Agreement

Agreement: the morphological properties of the verb co-vary with syntactic properties of the subject noun phrase.

### Using Attributes and Constraints

In [13]:
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')

% start S
# ###################
# Grammar Productions
# ###################
# S expansion productions
S -> NP[NUM=?n] VP[NUM=?n]
# NP expansion productions
NP[NUM=?n] -> N[NUM=?n] 
NP[NUM=?n] -> PropN[NUM=?n] 
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl] 
# VP expansion productions
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
# ###################
# Lexical Productions
# ###################
Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'
PropN[NUM=sg]-> 'Kim' | 'Jody'
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' 
IV[TENSE=pres,  NUM=sg] -> 'disappears' | 'walks'
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
IV[TENSE=pres,  NUM=pl] -> 'disappear' | 'walk'
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
IV[TENSE=past] -> 'disappeared' | 'walked'
TV[TENSE=past] -> 'saw' | 'liked'


In [14]:
## example feature based grammar

In [15]:
## for creating a custom copy use nltk.data.load()

In [16]:
## chart parser with a feature-based grammar.

In [17]:
tokens = 'Kim sees Jody'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
for tree in cp.parse(tokens):
    print(tree)

|.Kim .sees.Jody.|
Leaf Init Rule:
|[----]    .    .| [0:1] 'Kim'
|.    [----]    .| [1:2] 'sees'
|.    .    [----]| [2:3] 'Jody'
Feature Bottom Up Predict Combine Rule:
|[----]    .    .| [0:1] PropN[NUM='sg'] -> 'Kim' *
Feature Bottom Up Predict Combine Rule:
|[----]    .    .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
Feature Bottom Up Predict Combine Rule:
|[---->    .    .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
Feature Bottom Up Predict Combine Rule:
|.    [----]    .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'sees' *
Feature Bottom Up Predict Combine Rule:
|.    [---->    .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
Feature Bottom Up Predict Combine Rule:
|.    .    [----]| [2:3] PropN[NUM='sg'] -> 'Jody' *
Feature Bottom Up Predict Combine Rule:
|.    .    [----]| [2:3] NP[NUM='sg'] -> PropN[NUM='sg'] *
Feature Bottom Up Predict Combine Rule:
|.    .    [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
Feature Single Edge Fundament

In [18]:
for tree in cp.parse(tokens): print(tree) # printing the tree in one line

|.Kim .sees.Jody.|
Leaf Init Rule:
|[----]    .    .| [0:1] 'Kim'
|.    [----]    .| [1:2] 'sees'
|.    .    [----]| [2:3] 'Jody'
Feature Bottom Up Predict Combine Rule:
|[----]    .    .| [0:1] PropN[NUM='sg'] -> 'Kim' *
Feature Bottom Up Predict Combine Rule:
|[----]    .    .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
Feature Bottom Up Predict Combine Rule:
|[---->    .    .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
Feature Bottom Up Predict Combine Rule:
|.    [----]    .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'sees' *
Feature Bottom Up Predict Combine Rule:
|.    [---->    .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
Feature Bottom Up Predict Combine Rule:
|.    .    [----]| [2:3] PropN[NUM='sg'] -> 'Jody' *
Feature Bottom Up Predict Combine Rule:
|.    .    [----]| [2:3] NP[NUM='sg'] -> PropN[NUM='sg'] *
Feature Bottom Up Predict Combine Rule:
|.    .    [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
Feature Single Edge Fundament

## Terminology

In [19]:
# simple values are called as atomic and they cannot be decomposed into subparts, boolean values is a special case

In [20]:
## Attritube Value Matrix (AVM) for complex values(person, name, location)

### Processing Feature Structures

In [21]:
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')

In [22]:
fs1

[NUM='sg', TENSE='past']

In [23]:
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
print(fs1['GND'])

fem


In [24]:
ex = nltk.FeatStruct(Tense='present', NUM = 'pl')

In [25]:
ex

[NUM='pl', Tense='present']

In [26]:
ex1 = nltk.FeatStruct(PER=4, NUM = 'pl', GND = 'male')

In [27]:
ex1

[GND='male', NUM='pl', PER=4]

In [28]:
fs1['CASE'] = 'acc' # assign value to the feature, indexing the item

In [29]:
fs1

[CASE='acc', GND='fem', NUM='pl', PER=3]

In [30]:
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)

[       [ CASE = 'acc' ] ]
[ AGR = [ GND  = 'fem' ] ]
[       [ NUM  = 'pl'  ] ]
[       [ PER  = 3     ] ]
[                        ]
[ POS = 'N'              ]


In [31]:
print(fs2['AGR'])

[ CASE = 'acc' ]
[ GND  = 'fem' ]
[ NUM  = 'pl'  ]
[ PER  = 3     ]


In [32]:
print(fs2['AGR']['PER'])

3


In [33]:
print(nltk.FeatStruct("[POS='N', AGR=[PER=3, NUM='pl', GND='fem']]"))

[       [ GND = 'fem' ] ]
[ AGR = [ NUM = 'pl'  ] ]
[       [ PER = 3     ] ]
[                       ]
[ POS = 'N'             ]


In [34]:
print(nltk.FeatStruct(NAME='Lee Taemin', TELNO = '56 2739 223', AGE=32, GND='Male', LOC='South Korea'))

[ AGE   = 32            ]
[ GND   = 'Male'        ]
[ LOC   = 'South Korea' ]
[ NAME  = 'Lee Taemin'  ]
[ TELNO = '56 2739 223' ]


In [35]:
## Feature Structures as graphs, directed acyclic graphs(DAGs)

In [36]:
## A Feature path is a sequence of arcs that can be followed from the root node.

In [37]:
# Structure Sharing and Reentrancy: When two paths have the same value they are called as equivalent.

In [38]:
print(nltk.FeatStruct("""[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
...                          SPOUSE=[NAME='Kim', ADDRESS->(1)]]"""))

[ ADDRESS = (1) [ NUMBER = 74           ] ]
[               [ STREET = 'rue Pascal' ] ]
[                                         ]
[ NAME    = 'Lee'                         ]
[                                         ]
[ SPOUSE  = [ ADDRESS -> (1)  ]           ]
[           [ NAME    = 'Kim' ]           ]


In [39]:
## The bracketed integer is sometimes called as tag or a coindex.

In [40]:
# the Choice of integer is not significant. there can be any number of tags within a single feature structure.  

In [41]:
print(nltk.FeatStruct("[A='a', B=(1)[C='c'], D->(1), E->(1)]"))

[ A = 'a'             ]
[                     ]
[ B = (1) [ C = 'c' ] ]
[                     ]
[ D -> (1)            ]
[ E -> (1)            ]


## Subsumption and Unification

In [42]:
## feature structure provides partial information

In [43]:
## Merging information from two feature structures is called unification and is supported by the unify() method.


In [44]:
fs1 = nltk.FeatStruct(NUMBER=9, STREET='Suju')
fs2 = nltk.FeatStruct(CITY='Incheon')
print(fs1.unify(fs2))

[ CITY   = 'Incheon' ]
[ NUMBER = 9         ]
[ STREET = 'Suju'    ]


In [45]:
print(fs2.unify(fs1))

[ CITY   = 'Incheon' ]
[ NUMBER = 9         ]
[ STREET = 'Suju'    ]


In [46]:
fs0 = nltk.FeatStruct(A='a')
fs1 = nltk.FeatStruct(A='b')
fs2 = fs0.unify(fs1)
print(fs2)  # if the path is shared but does not has the value yhen none is returned  

None


In [47]:
fs0 = nltk.FeatStruct("""[NAME=Lee,
                          ADDRESS=[NUMBER=74,
                                   STREET='rue Pascal'],
                          SPOUSE= [NAME=Kim,
                                  ADDRESS=[NUMBER=74,
                                           STREET='rue Pascal']]]""")
print(fs0)

[ ADDRESS = [ NUMBER = 74           ]               ]
[           [ STREET = 'rue Pascal' ]               ]
[                                                   ]
[ NAME    = 'Lee'                                   ]
[                                                   ]
[           [ ADDRESS = [ NUMBER = 74           ] ] ]
[ SPOUSE  = [           [ STREET = 'rue Pascal' ] ] ]
[           [                                     ] ]
[           [ NAME    = 'Kim'                     ] ]


In [48]:
fs1 = nltk.FeatStruct("[SPOUSE = [ADDRESS = [CITY = Paris]]]")
print(fs1.unify(fs0))

[ ADDRESS = [ NUMBER = 74           ]               ]
[           [ STREET = 'rue Pascal' ]               ]
[                                                   ]
[ NAME    = 'Lee'                                   ]
[                                                   ]
[           [           [ CITY   = 'Paris'      ] ] ]
[           [ ADDRESS = [ NUMBER = 74           ] ] ]
[ SPOUSE  = [           [ STREET = 'rue Pascal' ] ] ]
[           [                                     ] ]
[           [ NAME    = 'Kim'                     ] ]


In [49]:
fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
                         SPOUSE=[NAME=Kim, ADDRESS->(1)]]""")
print(fs1.unify(fs2))

[               [ CITY   = 'Paris'      ] ]
[ ADDRESS = (1) [ NUMBER = 74           ] ]
[               [ STREET = 'rue Pascal' ] ]
[                                         ]
[ NAME    = 'Lee'                         ]
[                                         ]
[ SPOUSE  = [ ADDRESS -> (1)  ]           ]
[           [ NAME    = 'Kim' ]           ]


In [50]:
## updating

In [51]:
fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]")
fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]")
print(fs2)

[ ADDRESS1 = ?x ]
[ ADDRESS2 = ?x ]


In [52]:
print(fs2.unify(fs1))  # both the path value got the same

[ ADDRESS1 = (1) [ NUMBER = 74           ] ]
[                [ STREET = 'rue Pascal' ] ]
[                                          ]
[ ADDRESS2 -> (1)                          ]


# Extending a Feature Based Grammar

## SubCategorization

In [53]:
# IV = Intransitive verb and Tv = Transitive verb

## Heads Revisited

In [54]:
nltk.data.show_cfg('grammars/book_grammars/feat1.fcfg')

% start S
# ###################
# Grammar Productions
# ###################
S[-INV] -> NP VP
S[-INV]/?x -> NP VP/?x
S[-INV] -> NP S/NP
S[-INV] -> Adv[+NEG] S[+INV]
S[+INV] -> V[+AUX] NP VP
S[+INV]/?x -> V[+AUX] NP VP/?x
SBar -> Comp S[-INV]
SBar/?x -> Comp S[-INV]/?x
VP -> V[SUBCAT=intrans, -AUX]
VP -> V[SUBCAT=trans, -AUX] NP
VP/?x -> V[SUBCAT=trans, -AUX] NP/?x
VP -> V[SUBCAT=clause, -AUX] SBar
VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x
VP -> V[+AUX] VP
VP/?x -> V[+AUX] VP/?x
# ###################
# Lexical Productions
# ###################
V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing'
V[SUBCAT=trans, -AUX] -> 'see' | 'like'
V[SUBCAT=clause, -AUX] -> 'say' | 'claim'
V[+AUX] -> 'do' | 'can'
NP[-WH] -> 'you' | 'cats'
NP[+WH] -> 'who'
Adv[+NEG] -> 'rarely' | 'never'
NP/NP ->
Comp -> 'that'


In [55]:
tokens = 'who do you claim that you like'.split()
fro nltk import load_parser
cp = load_parser('grammars/book_grammars/feat1.fcfg')
for tree in cp.parse(tokens):
    print(tree)

SyntaxError: invalid syntax (<ipython-input-55-0a98d430af60>, line 2)

In [None]:
tokens = 'you claim that you like cats'.split()
for tree in cp.parser(tokens):
    print(tree)

In [None]:
tokens = 'rarely do you sing'.split()
for tree in cp.parse(tokens):
    print(tree)

## Case And Gender in German

In [None]:
nltk.data.show_cfg('grammars/book_grammars/german.fcfg')

In [None]:
tokens = 'ich folge den Katzen'.split()
cp = load_parser('grammars/book_grammars/german.fcfg')
for tree in cp.parse(tokens):
    print(tree)

In [None]:
tokens = 'ich folge den Katze'.split()
cp = load_parser('grammars/book_grammars/german.fcfg', trace=2)
for tree in cp.parse(tokens):
    print(tree)

## Summary