<a href="https://colab.research.google.com/github/myrondza/Data-Science-Machine-Learning-Deep-Learning-AI-Guide-Algorithms/blob/master/Natural_Language_Processing_(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words=None,
    token_pattern='(?u)\\b\\w\\w+\\b',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,
    )

In [0]:
corpus = ['This is an example of NLP',
          'This is the first document.',
          'And the second one.',
          'Is this the first document?']

X = vectorizer.fit_transform(corpus)
X    

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [0]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['this', 'is', 'text', 'document', 'analyze'])

False

In [0]:
vectorizer.get_feature_names()
X.toarray()


array([[1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1]])

In [0]:
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [0]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])

True

In [0]:
X2 = bigram_vectorizer.fit_transform(corpus).toarray()
X2

array([[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        1, 0, 1]])

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
                 use_idf=True)

tfidf = transformer.fit_transform(X.toarray())
tfidf.toarray() 

array([[0.46714844, 0.        , 0.        , 0.46714844, 0.        ,
        0.25208067, 0.46714844, 0.46714844, 0.        , 0.        ,
        0.        , 0.25208067],
       [0.        , 0.        , 0.51741994, 0.        , 0.51741994,
        0.3935112 , 0.        , 0.        , 0.        , 0.        ,
        0.3935112 , 0.3935112 ],
       [0.        , 0.55121857, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.55121857, 0.55121857,
        0.29744623, 0.        ],
       [0.        , 0.        , 0.51741994, 0.        , 0.51741994,
        0.3935112 , 0.        , 0.        , 0.        , 0.        ,
        0.3935112 , 0.3935112 ]])

In [0]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names() == ([' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'])

counts.toarray().astype(int)

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [0]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
ngram_vectorizer.fit_transform(['jumpy fox'])

<1x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

## Spacy

In [0]:
!pip install spacy
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors

nlp = spacy.load("en")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts

for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'be', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [0]:
import spacy

nlp = spacy.load("en")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [0]:
import spacy

nlp = spacy.load("en")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [0]:
import re
import spacy
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''^[[("']''')
suffix_re = re.compile(r'''[])"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)

nlp = spacy.load("en")

nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(u"hello-world.")

print([t.text for t in doc])

['hello', '-', 'world.']


In [0]:
import spacy

nlp = spacy.load("en")
doc = nlp("I live in New York")

print("Before:", [token.text for token in doc])

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york"})
    
print("After:", [token.text for token in doc])

Before: ['I', 'live', 'in', 'New', 'York']
After: ['I', 'live', 'in', 'New York']


In [0]:
import spacy

nlp = spacy.load("en")
doc = nlp(u"This is a sentence. This is another sentence.")

for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [0]:
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()  # We only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)

pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"]  # Positive emoji
neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"]  # Negative emoji

# Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

# Function to label the sentiment
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment

matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern
matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern

# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])

doc = nlp(u"Hello world 😀 #MondayMotivation")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]  # Look up string ID
    span = doc[start:end]
    print(string_id, span.text)

HAPPY 😀
HASHTAG #MondayMotivation


In [0]:
import spacy

nlp = spacy.load("en")
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Acme Corp Inc.', 'ORG')]


In [0]:
import spacy
from spacy.pipeline import merge_entities
from spacy import displacy

nlp = spacy.load("en")

def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe(merge_entities)
nlp.add_pipe(extract_person_orgs)

doc = nlp("Alex Smith worked at Acme Corp Inc.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, options={'fine_grained': True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}


'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="61ef71ef3d4248f4996491a946b1ace0-0" class="displacy" width="750" height="224.5" direction="ltr" style="max-width: none; height: 224.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="50">Alex Smith</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">NNP</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="225">worked</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">VBD</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="400">at</tspan>\n    <tspan class="displac

In [0]:
import spacy
from spacy import displacy

text = """But Google is starting from behind. The company made a late push
into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""

nlp = spacy.load("en")
doc = nlp(text)
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [0]:
from spacy.matcher import Matcher, PhraseMatcher
import spacy
nlp = spacy.load("en") 
nlp = spacy.load("en_core_web_sm") 

def on_match(matcher, doc, id, matches):
    print('Matched!', matches)

matcher = PhraseMatcher(nlp.vocab)
assert "Food" not in matcher
matcher.add("Food", on_match,nlp(u"cuisine"),nlp(u"beverages"),nlp(u"fruits"),nlp(u"Bread rolls"),nlp(u"breakfasts"),nlp(u"custard"),nlp(u"fish"),nlp(u"Drinks"),nlp(u"Food"),nlp(u"snack"),nlp(u"Water"),nlp(u"food"),nlp(u"snacks"),nlp(u"lunch"),nlp(u"drinks"),nlp(u"dinner"),nlp(u"meal"),nlp(u"meals"),nlp(u"dairy"),nlp(u"icecream"),nlp(u"noodles"),nlp(u"beer"),nlp(u"wine"),nlp(u"Drink"),nlp(u"lamb"),nlp(u"water"))
assert "Food" in matcher

In [0]:
doc = nlp(u"The airlines provided us with great food food and snacks")
matches=matcher(doc)

print("Key for matched words : ",matches)

Matched! [(13829306678568089077, 6, 7), (13829306678568089077, 7, 8), (13829306678568089077, 9, 10)]
Matched! [(13829306678568089077, 6, 7), (13829306678568089077, 7, 8), (13829306678568089077, 9, 10)]
Matched! [(13829306678568089077, 6, 7), (13829306678568089077, 7, 8), (13829306678568089077, 9, 10)]
Key for matched words :  [(13829306678568089077, 6, 7), (13829306678568089077, 7, 8), (13829306678568089077, 9, 10)]


## NLTK

In [0]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer() 
  
# choose some words to be stemmed 
words = ["program", "programs", "programer", "programing", "programers"] 
  
for w in words: 
    print(w, " : ", ps.stem(w)) 

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [0]:
import nltk
nltk.download('punkt')

from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer() 
   
sentence = "Programers program with programing languages"
words = word_tokenize(sentence) 
   
for w in words: 
    print(w, " : ", ps.stem(w)) 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Programers  :  program
program  :  program
with  :  with
programing  :  program
languages  :  languag


In [0]:
import random 
from nltk.corpus import names 
import nltk 
nltk.download('names')
  
def gender_features(word): 
    return {'last_letter':word[-1]} 
  
# preparing a list of examples and corresponding class labels. 
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+
             [(name, 'female') for name in names.words('female.txt')]) 
  
random.shuffle(labeled_names) 
  
# we use the feature extractor to process the names data. 
featuresets = [(gender_features(n), gender)  
               for (n, gender)in labeled_names] 
  
# Divide the resulting list of feature 
# sets into a training set and a test set. 
train_set, test_set = featuresets[500:], featuresets[:500] 
  
# The training set is used to  
# train a new "naive Bayes" classifier. 
classifier = nltk.NaiveBayesClassifier.train(train_set) 
  
print(classifier.classify(gender_features('Myron'))) 
  
# output should be 'male' 
print(nltk.classify.accuracy(classifier, train_set)) 

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
male
0.7634336378291241


In [0]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     36.5 : 1.0
             last_letter = 'k'              male : female =     32.4 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'p'              male : female =     11.3 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0
             last_letter = 'd'              male : female =     10.1 : 1.0
             last_letter = 'm'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      8.6 : 1.0
             last_letter = 'r'              male : female =      6.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0


In [0]:
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('wordnet')  
lemmatizer = WordNetLemmatizer() 
  
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 
  
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a")) 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
rocks : rock
corpora : corpus
better : good


## TextBlob

In [0]:
from textblob import TextBlob
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
 
text = "The service was good"

blob = TextBlob(text)
blob.tags          

blob.noun_phrases   
for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
0.7


In [0]:
from textblob import Word
w = Word('falibility')
w.spellcheck()

[('fallibility', 1.0)]

In [0]:
en_blob = TextBlob(u'Good Morning')
print(en_blob.translate(to='es'))
print(en_blob.translate(to='fr'))
print(en_blob.translate(to='zh-CN'))
print(en_blob.translate(to='ar'))

Buenos días
Bonjour
早上好
صباح الخير


In [0]:
b = TextBlob(u"بسيط هو أفضل من مجمع")
b.detect_language()

'ar'

In [0]:
blob = TextBlob("Now is better than never.")
blob.ngrams(n=3)

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]

In [0]:
from textblob import Word
from textblob.wordnet import VERB
word = Word("octopus")
word.synsets
Word("hack").get_synsets(pos=VERB)

[Synset('chop.v.05'),
 Synset('hack.v.02'),
 Synset('hack.v.03'),
 Synset('hack.v.04'),
 Synset('hack.v.05'),
 Synset('hack.v.06'),
 Synset('hack.v.07'),
 Synset('hack.v.08')]

In [0]:
 Word("rain").definitions

['water falling in drops from vapor condensed in the atmosphere',
 'drops of fresh water that fall as precipitation from clouds',
 'anything happening rapidly or in quick successive',
 'precipitate as rain']

## Google Translation

In [0]:
#!pip install googletrans
from googletrans import Translator
translator = Translator(service_urls=[
      'translate.google.com',
      'translate.google.co.uk',
      'translate.google.co.in'
    ])
translations = translator.translate(['صباح الخير', '早上好', 'Buenos días'], dest='en')
for translation in translations:
    print(translation.origin, ' -> ', translation.text)

صباح الخير  ->  good morning
早上好  ->  Good morning
Buenos días  ->  Good Morning


In [0]:
from googletrans import Translator
translator = Translator(service_urls=[
      'translate.google.com',
      'translate.google.co.uk',
      'translate.google.co.in'
    ])
translations = translator.translate('안녕하세요', dest='ja')
print(translations)

Translated(src=ko, dest=ja, text=こんにちは, pronunciation=Kon'nichiwa, extra_data="{'translat...")


In [0]:
a = translator.detect('안녕하세요')
print(a)

Detected(lang=ko, confidence=1)


In [0]:
def google_translator(x):
    translator = Translator()
    translations=translator.translate(x, dest='en')
    return translations.text

In [0]:
a = google_translator('안녕하세요')
print(a)

Hi


## Sentiment Analysis

### Vader Algorithm

In [4]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentences = ["The food was good.",
             "The service was not very good!", 
             "Not bad at all",
             "The service was horrible"]

analyzer = SentimentIntensityAnalyzer()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<50} {}".format(sentence, str(vs)))

The food was good.-------------------------------- {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
The service was not very good!-------------------- {'neg': 0.368, 'neu': 0.632, 'pos': 0.0, 'compound': -0.4432}
Not bad at all------------------------------------ {'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'compound': 0.431}
The service was horrible-------------------------- {'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}


In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentences = ["The food was good. 😍",
             "The service was not very good 😩 ", 
             "Not bad at all 😀",
             "The service was horrible 😢"]

analyzer = SentimentIntensityAnalyzer()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<50} {}".format(sentence, str(vs)))

The food was good. 😍------------------------------ {'neg': 0.0, 'neu': 0.504, 'pos': 0.496, 'compound': 0.7096}
The service was not very good 😩 ------------------ {'neg': 0.246, 'neu': 0.564, 'pos': 0.19, 'compound': -0.1538}
Not bad at all 😀---------------------------------- {'neg': 0.0, 'neu': 0.428, 'pos': 0.572, 'compound': 0.6542}
The service was horrible 😢------------------------ {'neg': 0.623, 'neu': 0.377, 'pos': 0.0, 'compound': -0.765}


In [2]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentences = ["😀","🤣","😩","😭","😍","😒","😢"]

analyzer = SentimentIntensityAnalyzer()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<50} {}".format(sentence, str(vs)))

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 2.9MB/s 
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.2.1
😀------------------------------------------------- {'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}
🤣------------------------------------------------- {'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4939}
😩------------------------------------------------- {'neg': 0.677, 'neu': 0.323, 'pos': 0.0, 'compound': -0.2732}
😭------------------------------------------------- {'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}
😍------------------------------------------------- {'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound': 0.4588}
😒------------------------------------------------- {'neg': 0.0, 'neu': 1.0, 'pos': 0.