# Chapter 7: Context-Aware Text Analysis

## Grammar-Based Feature Extraction

### Context-Free Grammars

In [1]:
import re
# import nltk
from nltk import CFG
from sklearn.base import BaseEstimator, TransformerMixin

from reader import PickledCorpusReader

# Toy context-free grammar: each line is one production, `|` separates
# alternatives, and quoted symbols are terminals (words).
GRAMMAR = """
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen' | 'George'
    V -> 'looks' | 'burns'
    P -> 'in' | 'for'
    DT -> 'the'
    N -> 'castle' | 'ocean'
    """

# Call the imported `CFG` class directly: only `CFG` is imported from nltk in
# this notebook, so the module-qualified `nltk.CFG` is an undefined name.
cfg = CFG.fromstring(GRAMMAR)

# Show the grammar summary, its start symbol, and the expanded production list.
print(cfg)
print(cfg.start())
print(cfg.productions())

Grammar with 13 productions (start state = S)
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen'
    NNP -> 'George'
    V -> 'looks'
    V -> 'burns'
    P -> 'in'
    P -> 'for'
    DT -> 'the'
    N -> 'castle'
    N -> 'ocean'
S
[S -> NNP VP, VP -> V PP, PP -> P NP, NP -> DT N, NNP -> 'Gwen', NNP -> 'George', V -> 'looks', V -> 'burns', P -> 'in', P -> 'for', DT -> 'the', N -> 'castle', N -> 'ocean']


### Syntactic Parsers

In [2]:
from nltk.chunk.regexp import RegexpParser

In [8]:
# Rebinds GRAMMAR (previously the CFG string) to a chunk grammar with a single
# rule labelled KT: an optional "<JJ>* <NN.*>+ <IN>" prefix followed by
# "<JJ>* <NN.*>+". Assuming Penn Treebank tags, that is adjectives + nouns,
# optionally preceded by an adjective/noun group and a preposition.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
chunker = RegexpParser(GRAMMAR)

In [9]:
# Echo the parser; its repr reports how many chunking stages the grammar produced.
chunker

<chunk.RegexpParser with 1 stages>

In [4]:
# Raw sample text: a plain string, neither tokenized nor part-of-speech tagged.
phrase = """
    Dusty Baker proposed a simple solution to the Washington National's 
    early-season bullpen troubles Monday afternoon and it had nothing to 
    do with his maligned group of relievers.
    """

Define our `KeyphraseExtractor` with a grammar and chunker to identify just the *noun phrases* in part-of-speech tagged text.

In [19]:
from unicodedata import category as unicat
from itertools import groupby
from nltk.chunk import tree2conlltags

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS = frozenset(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])

class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents and
    extracts keyphrases from them with a regular-expression chunker.

    Each document is iterated as paragraphs -> sentences, and every
    sentence is handled as a sequence of (token, tag) pairs (presumably
    the structure PickledCorpusReader.docs() yields -- verify against
    the reader).

    Parameters
    ----------
    grammar : str, default GRAMMAR
        Chunk grammar passed to RegexpParser.
    """
    def __init__(self, grammar=GRAMMAR):
        # Keep the caller's grammar; assigning the module-level GRAMMAR here
        # would silently discard any custom grammar that was passed in.
        self.grammar = grammar
        # The parser is built on first use (see `chunker`) so that the
        # constructor only records parameters, as scikit-learn's clone() and
        # set_params() expect.
        self._chunker = None
        self._chunker_grammar = None

    @property
    def chunker(self):
        """
        RegexpParser compiled from the current `grammar`; rebuilt whenever
        `grammar` has changed since the parser was last built.
        """
        if self._chunker is None or self._chunker_grammar != self.grammar:
            self._chunker = RegexpParser(self.grammar)
            self._chunker_grammar = self.grammar
        return self._chunker

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.

        Parameters
        ----------
        sent : iterable of (token, tag) pairs

        Returns
        -------
        list of (lowercased token, tag) pairs, without the tokens whose
        characters all belong to a Unicode punctuation category (P*).
        """
        def is_punct(word):
            return all(unicat(char).startswith('P') for char in word)

        return [
            (pair[0].lower(), pair[1])
            for pair in sent
            if not is_punct(pair[0])
        ]

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.

        Sentences that are empty after normalization are skipped.
        """
        chunker = self.chunker
        for paragraph in document:
            for sent in paragraph:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(chunker.parse(sent))
                # Group consecutive tokens by "inside a chunk or not". The key
                # ignores the B-/I- prefix, so adjacent chunks with no 'O'
                # token between them come out as a single phrase.
                for in_chunk, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ):
                    if in_chunk:
                        # Words are already lowercased by normalize().
                        yield " ".join(word for word, pos, chunk in group)

    def fit(self, documents, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, documents):
        """
        Lazily yields, for each document, the list of keyphrases found
        in it (this is a generator, not a list).
        """
        for document in documents:
            yield list(self.extract_keyphrases(document))

In [24]:
# The constructor's only argument is the chunk grammar, so the sample sentence
# must not be passed here; use the default grammar.
k = KeyphraseExtractor()

In [43]:
if __name__ == '__main__':
    from reader import PickledCorpusReader

    # Relative corpus path (presumably resolved against the working
    # directory -- verify against PickledCorpusReader).
    corpus = PickledCorpusReader('../ATAwP/corpus')
    docs = corpus.docs()

    phrase_extractor = KeyphraseExtractor()
    # transform() is a generator, so list() is what actually runs the
    # extraction over every document before the first result is printed.
    keyphrases = list(phrase_extractor.fit_transform(docs))
    print(keyphrases[0])

# Entity extraction is left disabled in this cell; EntityExtractor is only
# defined in a later cell.
#     entity_extractor = EntityExtractor()
#     entities = list(entity_extractor.fit_transform(docs))
#     print(entities[0])

['lonely city', 'heart piercing wisdom', 'loneliness', 'laing', 'everyone', 'feast later', 'point', 'own hermetic existence in new york', 'danger', 'thankfully', 'lonely city', 'cry for connection', 'overcrowded overstimulated world', 'blueprint of urban loneliness', 'emotion', 'calls', 'city', 'npr jason heller', 'olivia laing', 'lonely city', 'exploration of loneliness', 'others experiences in new york city', 'rumpus', 'review', 'lonely city', 'related posts']


In [44]:
from nltk import ne_chunk

GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Transformer that collects named entities from documents by running
    ne_chunk over every sentence and keeping the subtrees whose label
    is in `labels`.
    """
    def __init__(self, labels=GOODLABELS, **kwargs):
        # Extra keyword arguments are accepted but not stored.
        self.labels = labels

    def get_entities(self, document):
        """
        Returns the entities of one document as a list of lowercased,
        space-joined strings, in order of appearance.

        The document is iterated as paragraphs -> sentences; each
        sentence is passed to ne_chunk as-is.
        """
        found = []
        for paragraph in document:
            for sentence in paragraph:
                for node in ne_chunk(sentence):
                    # Children without a label() are not entity subtrees
                    # (presumably plain tagged tokens) and are skipped.
                    if not hasattr(node, 'label'):
                        continue
                    if node.label() not in self.labels:
                        continue
                    found.append(' '.join(leaf[0].lower() for leaf in node))
        return found

    def fit(self, documents, labels=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, documents):
        """Lazily yields one entity list per document (generator)."""
        yield from (self.get_entities(document) for document in documents)

In [51]:
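# One-time setup: resources used by `ne_chunk`. Uncomment and run once if
# they are not already installed (the download calls need `import nltk`).
# import nltk
# nltk.download('maxent_ne_chunker')
# nltk.download('words')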

In [52]:
if __name__ == '__main__':
    from reader import PickledCorpusReader

    # Relative corpus path (presumably resolved against the working
    # directory -- verify against PickledCorpusReader).
    corpus = PickledCorpusReader('../ATAwP/corpus')
    docs = corpus.docs()

# Keyphrase extraction is disabled in this cell; an earlier cell already
# runs it.
#     phrase_extractor = KeyphraseExtractor()
#     keyphrases = list(phrase_extractor.fit_transform(docs))
#     print(keyphrases[0])

    entity_extractor = EntityExtractor()
    # transform() is a generator; list() forces entity extraction for every
    # document before the first result is printed.
    entities = list(entity_extractor.fit_transform(docs))
    print(entities[0])

['lonely city', 'loneliness', 'laing', 'new york', 'lonely city', 'npr', 'jason heller', 'olivia laing', 'lonely city', 'new york city', 'rumpus', 'lonely city', 'related']
