In [5]:
import nltk
from nltk import CFG
from nltk.corpus import stopwords

In [6]:
# Globals
# English stopword list loaded from the NLTK stopwords corpus.
# acceptable_word() rejects any word whose lower-cased form appears in it.
stop_words = stopwords.words('english')

In [7]:
def leaves(tree):
    """Yield the leaves of every subtree of ``tree`` that is labelled 'NP'.

    ``tree`` must provide ``subtrees(filter=...)``; each selected subtree must
    provide ``label()`` and ``leaves()``. One item is yielded per NP subtree,
    in the order ``subtrees`` produces them.
    """
    def is_noun_phrase(node):
        return node.label() == 'NP'

    yield from (chunk.leaves() for chunk in tree.subtrees(filter=is_noun_phrase))

In [8]:
def normalise(word):
    """Return ``word`` lower-cased, then stemmed, then lemmatized.

    Relies on the module-level ``stemmer`` and ``lemmatizer`` objects, which
    must exist by the time this function is called.
    """
    return lemmatizer.lemmatize(stemmer.stem(word.lower()))

In [9]:
def acceptable_word(word):
    """Return True when ``word`` is 2-40 characters long and not a stopword.

    The stopword test is case-insensitive: the lower-cased word is looked up
    in the module-level ``stop_words`` collection.
    """
    # Length gate first, so over- or under-sized words never reach the lookup.
    if not 2 <= len(word) <= 40:
        return False
    return word.lower() not in stop_words

In [10]:
def get_terms(tree):
    """Yield one list of normalised words per NP chunk found in ``tree``.

    Each chunk comes from ``leaves(tree)`` as a sequence of (word, tag) pairs.
    Words rejected by ``acceptable_word`` are dropped and the rest are passed
    through ``normalise``. A chunk with no acceptable words yields an empty
    list.
    """
    for chunk in leaves(tree):
        kept = []
        for token, _tag in chunk:
            if acceptable_word(token):
                kept.append(normalise(token))
        yield kept

In [11]:
# Sample inputs for the chunking pipeline, kept under separate keys so that
# choosing one does not silently discard the others through reassignment.
sample_texts = {
    "buddha": """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself.""",
    "review": """Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!""",
    "dog": """The little yellow dog barked at the cat""",
}

# The sample analysed by the rest of the notebook; change the key to switch.
text = sample_texts["review"]

In [12]:
# Used when tokenizing words.
# Every group is non-capturing: NLTK 3's RegexpTokenizer extracts tokens with
# re.findall, which returns the groups' contents rather than the whole match
# as soon as the pattern contains a capturing group.
sentence_re = r'''(?x)          # set flag to allow verbose regexps
      [A-Z](?:\.[A-Z])+\.?      # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [\]\[.,;"'?():_`-]        # these are separate tokens (hyphen kept last so it is a literal, not a range)
'''

In [13]:
# Chunk grammar taken from the Su Nam Kim paper.
# Within one stage RegexpParser applies its rules in order, and a chunk rule
# only matches tags that are not already inside a chunk. The NBAR-IN-NBAR rule
# therefore has to precede the bare NBAR rule; listed after it, every NBAR is
# already chunked and the longer pattern can never match.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        {<NBAR>}
"""
# Alternative grammars from earlier experiments, parked inside a bare string
# literal so none of them is ever assigned to `grammar`. Because the string is
# the last expression of the cell, the notebook echoes it as the cell output.
'''
grammar = r"""
    NP:
        {<DT>?<JJ>*<NN>}
"""

grammar = r"""
    NP:
        {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
"""

grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""

grammar = r"""
    NP:
        {<NN.?><VB.?>?<JJ.?>}
"""

grammar = r"""
    NP: {<DT>? <JJ>* <NN>*} # NP
    P: {<IN>}           # Preposition
    V: {<V.*>}          # Verb
    PP: {<P> <NP>}      # PP -> P NP
    VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
"""

grammar = """
    NP:   {<PRP>?<JJ.*>*<NN.*>+}
    CP:   {<JJR|JJS>}
    VERB: {<VB.*>}
    THAN: {<IN>}
    COMP: {<DT>?<NP><RB>?<VERB><DT>?<CP><THAN><DT>?<NP>}
"""

grammar = r"""
    NP: 
        {<NN><VBD><JJ>(<CC><JJ>)?}
"""
'''

'\ngrammar = r"""\n    NP:\n        {<DT>?<JJ>*<NN>}\n"""\n\ngrammar = r"""\n    NP:\n        {<DT|PP\\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun\n        {<NNP>+}                # chunk sequences of proper nouns\n"""\n\ngrammar = r"""\n  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN\n  PP: {<IN><NP>}               # Chunk prepositions followed by NP\n  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments\n  CLAUSE: {<NP><VP>}           # Chunk NP, VP\n"""\n\ngrammar = r"""\n    NP:\n        {<NN.?><VB.?>?<JJ.?>}\n"""\n\ngrammar = r"""\n    NP: {<DT>? <JJ>* <NN>*} # NP\n    P: {<IN>}           # Preposition\n    V: {<V.*>}          # Verb\n    PP: {<P> <NP>}      # PP -> P NP\n    VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*\n"""\n\ngrammar = """\n    NP:   {<PRP>?<JJ.*>*<NN.*>+}\n    CP:   {<JJR|JJS>}\n    VERB: {<VB.*>}\n    THAN: {<IN>}\n    COMP: {<DT>?<NP><RB>?<VERB><DT>?<CP><THAN><DT>?<NP>}\n"""\n\ngrammar = r"""\n    NP: \n        {<NN>

In [14]:
# Shared NLP components, created once at module level.
# Regex tokenizer built from sentence_re; no other visible cell references it
# (the tokens below come from nltk.word_tokenize instead).
tokenizer = nltk.tokenize.RegexpTokenizer(sentence_re)
# Chunk parser built from the `grammar` string.
chunker = nltk.RegexpParser(grammar)
# lemmatizer and stemmer are read as globals by normalise().
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

In [15]:
# Split the sample text into tokens with nltk.word_tokenize (the regex-based
# `tokenizer` object is not used here), then attach a part-of-speech tag to
# each token for the chunker.
tokens = nltk.word_tokenize(text)
pos_tokens = nltk.tag.pos_tag(tokens)
#pos_tokens

In [16]:
# Chunk the POS-tagged tokens according to `grammar`.
tree = chunker.parse(pos_tokens)
# Show the resulting tree with its draw() method.
# NOTE(review): in NLTK this opens a GUI window, so it presumably needs a
# display and may hold up the cell until the window is closed. Confirm before
# running headless.
tree.draw()

In [121]:
# Print the extracted terms: one normalised word per line, with a blank line
# after each noun-phrase chunk (a chunk with no acceptable words prints only
# the blank line).
terms = get_terms(tree)  # generator; it is consumed by the loop below
for term in terms:
    for word in term:
        print(word)
    print()