# Natural Language Processing Practice with Tweets

In [39]:
import spacy  
import pandas as pd 
import os, glob

def printmd(string, color=None):
    """Render ``string`` as Markdown in the cell output, optionally colored.

    Parameters
    ----------
    string : str
        Markdown source to display.
    color : str, optional
        CSS color value (name or hex code) applied to the text. When omitted,
        the text is wrapped in a plain ``<span>`` so that no invalid
        ``color:None`` style is emitted.
    """
    if color is None:
        # No color requested: keep the span wrapper but skip the style attribute.
        colorstr = "<span>{}</span>".format(string)
    else:
        colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))
    
from IPython.display import Markdown, display


In [None]:
# Download spaCy's `en_core_web_lg` model, which the next cell loads by name.
# NOTE(review): `!python` is resolved by the shell, which may not be the
# interpreter backing this kernel — confirm the model lands in the right environment.
!python -m spacy download en_core_web_lg


In [20]:
nlp = spacy.load('en_core_web_lg') # load the language model


In [21]:
# Read the tweet archive into a DataFrame and preview its first rows.
tweets = '../all_djt_tweets.csv'
data = pd.read_csv(filepath_or_buffer=tweets)
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,0,Twitter for iPhone,Over 90% approval rating for your all time fav...,Mon Aug 27 00:39:38 +0000 2018,27040,106838.0,False,1.033877e+18
1,1,Twitter for iPhone,“Mainstream Media tries to rewrite history to ...,Sun Aug 26 22:01:33 +0000 2018,21346,76682.0,False,1.033837e+18
2,2,Twitter for iPhone,Fantastic numbers on consumer spending release...,Sun Aug 26 14:31:06 +0000 2018,18960,87334.0,False,1.033724e+18
3,3,Twitter for iPhone,"...And it will get, as I have always said, muc...",Sun Aug 26 14:27:16 +0000 2018,14963,62956.0,False,1.033723e+18
4,4,Twitter for iPhone,RT @realDonaldTrump: Social Media Giants are s...,Sun Aug 26 14:25:47 +0000 2018,50142,0.0,True,1.033722e+18


## Named Entity Recognition (NER)  
> spaCy can recognize various types of entities in a document, by asking the model for a prediction. 

In [24]:
def text_entries_explanation(text):
    """Print each named entity found in ``text`` with its label and the label's explanation.

    The text is run through the module-level ``nlp`` pipeline; one line is
    printed per entity in ``doc.ents``. Nothing is returned.
    """
    for entity in nlp(text).ents:
        label = entity.label_
        print(f'Entity: {entity}, Label: {label}, {spacy.explain(label)}')

In [26]:
# List the entities detected in the tweet stored at row 12.
text_entries_explanation(data.loc[12, 'text'])

Entity: FBI, Label: ORG, Companies, agencies, institutions, etc.
Entity: 3000, Label: CARDINAL, Numerals that do not fall under another type
Entity: 675,000, Label: CARDINAL, Numerals that do not fall under another type
Entity: Hillary Clinton, Label: PERSON, People, including fictional


In [30]:
# Highlight the entities of the tweet at row 4 inline in the notebook.
one_sentence = data.loc[4, 'text']
doc = nlp(one_sentence)
spacy.displacy.render(doc, jupyter=True, style='ent')

In [32]:
# Highlight the entities of the tweet at row 240 inline in the notebook.
one_sentence = data.loc[240, 'text']
doc = nlp(one_sentence)
spacy.displacy.render(doc, jupyter=True, style='ent')

In [33]:
# Highlight the entities of the tweet at row 450 inline in the notebook.
one_sentence = data.loc[450, 'text']
doc = nlp(one_sentence)
spacy.displacy.render(doc, jupyter=True, style='ent')

### Redacting Names  
A simple use case for NER is automatically redacting names.  
This can be useful, for example:  
- to ensure that your company data actually complies with GDPR  
- when journalists want to publish a large set of documents while still hiding the identity of their sources  

We do this redaction in two broad steps:  
   1. find all entities labelled `PERSON`  
   2. replace each of them with a filler such as `[REDACTED]`  


In [50]:
def redact_names(text):
    """Return ``text`` with every PERSON entity replaced by ``[REDACTED]``.

    Parameters
    ----------
    text : str
        The sentence or document to redact.

    Returns
    -------
    str
        The original text, whitespace included, with each span that the
        module-level ``nlp`` pipeline labels as ``PERSON`` replaced by the
        literal ``[REDACTED]``.
    """
    doc = nlp(text)

    # Merge each entity into a single token so a multi-word name yields
    # exactly one "[REDACTED]" marker instead of one per word.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)

    redacted_sentence = []
    for token in doc:
        if token.ent_type_ == "PERSON":
            # Keep the whitespace that followed the name so words do not fuse.
            redacted_sentence.append("[REDACTED]" + token.whitespace_)
        else:
            # Emit the token's own text (plus trailing whitespace), not its
            # entity label, so the non-name part of the sentence is preserved.
            redacted_sentence.append(token.text_with_ws)
    return "".join(redacted_sentence)

In [51]:
# Before: entity view of the untouched tweet at row 450.
printmd("**Before**", color="blue")
one_sentence = data.loc[450, 'text']
doc = nlp(one_sentence)
spacy.displacy.render(doc, jupyter=True, style='ent')

# After: the same tweet passed through redact_names, then rendered again.
printmd("**After**", color="blue")
one_sentence = redact_names(data.loc[450, 'text'])
doc = nlp(one_sentence)
spacy.displacy.render(doc, jupyter=True, style='ent')

printmd("Notice that `Obama W.H.` was removed", color="#6290c8")


<span style='color:blue'>**Before**</span>

<span style='color:blue'>**After**</span>



<span style='color:#6290c8'>Notice that `Obama W.H.` was removed</span>