In [1]:
# Import required NLTK libraries (example from www.nltk.org/howto/grammar.html)
from nltk import CFG, PCFG
import random, re

In [2]:
# Create a new context-free grammar from a textual rule description.
# Each line has the form `LHS -> RHS`; `|` separates alternative right-hand
# sides, and quoted symbols are the words (terminals). Every alternative
# becomes its own production, which is why the 8 lines below yield 14
# productions. Note that NP and VP are recursive (NP -> NP PP, VP -> VP PP).
grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | NP PP
VP -> V NP | VP PP
Det -> 'a' | 'the'
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
""")
grammar

<Grammar with 14 productions>

In [3]:
# We can now view every rule of this grammar, the "productions": one entry per
# alternative, listed in the same order the rules were written in the grammar string
grammar.productions()

[S -> NP VP,
 PP -> P NP,
 NP -> Det N,
 NP -> NP PP,
 VP -> V NP,
 VP -> VP PP,
 Det -> 'a',
 Det -> 'the',
 N -> 'dog',
 N -> 'cat',
 V -> 'chased',
 V -> 'sat',
 P -> 'on',
 P -> 'in']

In [4]:
# We can also define our grammar probabilistically.
# Each alternative carries its probability in square brackets, and the
# probabilities of all alternatives sharing a left-hand side sum to 1.0
# (e.g. NP: 0.5 + 0.25 + 0.1 + 0.15).
prob_grammar = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
""")
prob_grammar

<Grammar with 17 productions>

In [5]:
# And again, we can view the productions, this time each one annotated with
# its probability in square brackets
prob_grammar.productions()

[S -> NP VP [1.0],
 NP -> Det N [0.5],
 NP -> NP PP [0.25],
 NP -> 'John' [0.1],
 NP -> 'I' [0.15],
 Det -> 'the' [0.8],
 Det -> 'my' [0.2],
 N -> 'man' [0.5],
 N -> 'telescope' [0.5],
 VP -> VP PP [0.1],
 VP -> V NP [0.7],
 VP -> V [0.2],
 V -> 'ate' [0.35],
 V -> 'saw' [0.65],
 PP -> P NP [1.0],
 P -> 'with' [0.61],
 P -> 'under' [0.39]]

In [6]:
# This type of model is useful for creating hidden markov models
# For example, to anticipate what the next word is, given a noun phrase (NP),
# we walk the NP productions that produce a word, in grammar order, keeping a
# running total of their probabilities. The first word at which the running
# total reaches our test probability is selected.
prob = 0.2
curr_prob = 0
POS = "NP"

print("Our test probability is " + str(prob))
print("Our test POS is " + POS)

# Production objects expose public accessors, so there is no need to reach
# into private (name-mangled) attributes:
#    lhs(), rhs(), prob(), is_lexical()

print("\nPotential Matches:")
for production in prob_grammar.productions():
    # Keep only productions for our POS whose right-hand side contains a
    # word (terminal); is_lexical() replaces searching str(rhs) for a quote.
    if str(production.lhs()) == POS and production.is_lexical():
        curr_prob = curr_prob + production.prob()
        if curr_prob >= prob:
            print("Our next word is " + str(production.rhs()))
            break
else:
    # The word-producing alternatives of a POS need not sum to 1.0 (for NP
    # they total 0.25), so a larger test probability matches nothing.
    # Report that explicitly instead of silently printing no result.
    print("No word reached the test probability (total was " + str(curr_prob) + ")")
   

Our test probability is 0.2
Our test POS is NP

Potential Matches:
Our next word is (u'I',)
