In [2]:
import spacy

In [10]:
# Load spaCy's large English model and keep the pipeline in `nlp`.
# The model has to be installed once beforehand:
#   python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

### Entity Extraction

In [11]:
# Sample passage, taken from the Wikipedia page about London
text = """London is the capital and most populous city of England and the United Kingdom. 
Standing on the River Thames in the south east of the island of Great Britain, 
London has been a major settlement for two millennia. It was founded by the Romans, who named it Londinium"""

# Parse the text with spaCy. Calling nlp() runs the entire NLP pipeline
# and returns the processed document.
doc = nlp(text)

In [12]:
# Print every named entity that was detected in `doc`, one per line,
# formatted as "<entity text> (<entity label>)"
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
the River Thames (FAC)
Great Britain (GPE)

London (ORG)
two millennia (DATE)
Romans (NORP)
Londinium (PERSON)


### TO DO: Text Analytics NER

In [14]:
# Replace a token with "[REDACTED]" if it is (part of) a person's name
def replace_name_with_placeholder(token):
    """Return the text to emit for ``token``, hiding person names.

    Args:
        token: A spaCy ``Token`` (possibly a merged multi-word entity).

    Returns:
        ``"[REDACTED]"`` followed by the token's own trailing whitespace when
        the token carries the ``PERSON`` entity type; otherwise the token's
        original text including its trailing whitespace.
    """
    # ent_iob == 0 means no entity annotation has been set on the token.
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        # Reuse the token's real trailing whitespace instead of a hard-coded
        # space, so a name directly followed by punctuation does not turn
        # into "[REDACTED] ,".
        return "[REDACTED]" + token.whitespace_
    # Token.string was removed in spaCy v3; text_with_ws is its replacement.
    return token.text_with_ws

# Loop through all the entities in a document and redact the ones that are names
def scrub(text):
    """Return ``text`` with person names replaced by a placeholder.

    The text is parsed with the module-level ``nlp`` pipeline, each detected
    entity is merged into a single token, and every token is then passed
    through ``replace_name_with_placeholder`` before being joined back into
    one string.

    Args:
        text: The raw string to scrub.

    Returns:
        The re-assembled string with person-name tokens swapped out.
    """
    doc = nlp(text)
    # Merge each entity span into one token so that a multi-word name yields
    # a single placeholder. Span.merge() was removed in spaCy v3;
    # Doc.retokenize() is the supported API and applies the merges when the
    # with-block exits.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    return "".join(replace_name_with_placeholder(token) for token in doc)

# Demo input: two sentences, each mentioning a person by name
s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence".
In 1957, Noam Chomsky's Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

# Print the text returned by scrub() for the demo input
print(scrub(s))


In 1950, [REDACTED] published his famous article "Computing Machinery and Intelligence".
In 1957, [REDACTED] Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.



### Extracting Facts from Text

In [16]:
import textacy.extract
from pathlib import Path