# Entity Recognition



In [1]:
# Basic Imports
import os
from urllib import request
from bs4 import BeautifulSoup
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.chunk import conlltags2tree, tree2conlltags

In [2]:
# Loading the Stanford NER Tagger
stanf_ner = StanfordNERTagger("./models/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz", "./models/stanford-ner/stanford-ner.jar", encoding='utf8')

## Extracting Named Entities from WebSource Textdata

The pipeline includes:

1) Preprocessing of the Textsource
- Downloading the Text from source
- Extracting p and h tags from the DOM
- Cleaning and encoding the Text string

2) NLP Annotation
- Tokenizing the string into word tokens
- Adding Part-of-speech Tags to the Tokens
- Parsing a grammatical Tree from the POS annotated Tokens
- Identifying Named Entitities from the Parsetree
- Converting the Parsetree into BOI annotated Named Entities, to identify multiple word Entities

In [122]:
def parse_page(url, tokenize=True, ne_tag=True, export_tree=False):
    "Opens a url and parses the header and p tag enclosed text and performs NER"
    
    # Return a clean list of extraced NE's
    def structure_ne(ne_tree):
        ne = []
        for subtree in ne_tree:
            if type(subtree) == nltk.tree.Tree:
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne
    
    
    # Make sure tokenization is enabled when tagging
    if ne_tag: tokenize = True
    
    # Open the file and preprocess html text
    response = request.urlopen('http://www.newsweek.com/trump-not-popular-hillary-clinton-even-worse-approval-rating-polls-661070')
    raw = response.read()
    soup = BeautifulSoup(raw, "lxml")
    
    # Extract text from DOM
    title = soup.find('h1').get_text()
    paragraphs = [para.get_text() for para in soup.select('p')]

    if tokenize:
        text =  nltk.word_tokenize(title +" " + ' '.join(paragraphs))
    
    if ne_tag:
        # POS
        tagged_words = nltk.pos_tag(text)
        # Parse Tree
        ne_tagged = nltk.ne_chunk(tagged_words)
        if export_tree:
            return structure_ne(ne_tagged )
        else:
            # Return TExt
            return tree2conlltags(ne_tagged)
        

### Test the Named Entity Extraction 

In [123]:
# 
tokens = parse_page("http://www.newsweek.com/trump-administration-backing-baker-who-refused-make-wedding-cake-gay-couple-661554", 
                    tokenize=True, 
                    ne_tag=True,
                    export_tree=True)
tokens

[('Donald', 'PERSON'),
 ('Trump', 'ORGANIZATION'),
 ('Popular', 'PERSON'),
 ('Hillary Clinton Is', 'PERSON'),
 ('Donald', 'PERSON'),
 ('State', 'ORGANIZATION'),
 ('Clinton', 'PERSON'),
 ('Mediaite', 'ORGANIZATION'),
 ('Trump', 'PERSON'),
 ('Hillary Clinton', 'PERSON'),
 ('American', 'GPE'),
 ('Trump', 'PERSON'),
 ('Donald Trump', 'PERSON'),
 ('Democratic', 'ORGANIZATION'),
 ('Hillary Clinton', 'PERSON'),
 ('Washington University', 'ORGANIZATION'),
 ('St. Louis', 'GPE'),
 ('Rick', 'PERSON'),
 ('NBC', 'ORGANIZATION'),
 ('Trump', 'PERSON'),
 ('Charlottesville', 'GPE'),
 ('Virginia—and', 'GPE'),
 ('U.S.', 'GPE'),
 ('Trump', 'PERSON'),
 ('FiveThirtyEight', 'ORGANIZATION'),
 ('FiveThirtyEight', 'ORGANIZATION'),
 ('Gerald Ford', 'PERSON'),
 ('Trump', 'PERSON'),
 ('Barack Obama', 'PERSON'),
 ('Obama', 'PERSON'),
 ('Clinton', 'PERSON'),
 ('Trump', 'PERSON'),
 ('Trump', 'PERSON'),
 ('James Comey', 'PERSON'),
 ('Hillary Clinton', 'PERSON'),
 ('Trump', 'PERSON'),
 ('Obama', 'PERSON'),
 ('Deferred 