In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
example = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(example)
for ent in doc.ents:
    print(ent.text, ent.label_)

Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE


In [4]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'talk', 'say']


In [5]:
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE


In [6]:
for ent in doc.ents:
    print(ent.text, ent.label)

Sebastian 381
Google 383
2007 391
American 381
Recode 383
earlier this week 391


In [7]:
from spacy import displacy
displacy.render(doc,style='ent')

## Working on data from a website; cleaning data; performing some tasks

In [8]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [9]:
url = "https://blog.aureusanalytics.com/blog/5-natural-language-processing-techniques-for-extracting-information"

In [10]:
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
print(soup)

<!DOCTYPE doctype html>
<!-- start coded_template: id:5891637539 path:generated_layouts/5891637533.html --><!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en-us" > <![endif]--><!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en-us" >        <![endif]--><!--[if IE 8]>    <html class="no-js lt-ie9" lang="en-us" >               <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en-us"><!--<![endif]--><head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="Neeraja Vaidya" name="author"/>
<meta content="There are 5 common techniques used in information extraction. When dealing with information such as text, video, audio and photos, natural language understanding allows us to extract key data that will provide a greater understanding of the customer's sentiment. " name="description"/>
<meta content="HubSpot" name="generator"/>
<title>5 Natural Language Processing Techniques for Extracting Information</ti

In [11]:
new_text = soup.find_all('p')

In [12]:
new_text

[<p>The field of artificial intelligence has always envisioned machines being able to mimic the functioning and abilities of the human mind. Language is considered as one of the most significant achievements of humans that has accelerated the progress of humanity. So, it is not a surprise that there is plenty of work being done to integrate language into the field of artificial intelligence in the form of Natural Language Processing (NLP). Today we see the work being manifested in likes of Alexa and Siri.</p>,
 <p><!--more--></p>,
 <p>NLP primarily comprises of <a href="https://en.wikipedia.org/wiki/Natural-language_understanding" rel="noopener" target="_blank">n<span>atural language understanding</span></a> (human to machine) and <a href="https://medium.com/@AutomatedInsights/the-ultimate-guide-to-natural-language-generation-bdcb457423d6" rel="noopener" target="_blank">n<span>atural language generation</span></a> (machine to human). This article will mainly deal with natural language 

In [13]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [14]:
processed_text = remove_html_tags(str(new_text))

In [15]:
processed_text

'[The field of artificial intelligence has always envisioned machines being able to mimic the functioning and abilities of the human mind. Language is considered as one of the most significant achievements of humans that has accelerated the progress of humanity. So, it is not a surprise that there is plenty of work being done to integrate language into the field of artificial intelligence in the form of Natural Language Processing (NLP). Today we see the work being manifested in likes of Alexa and Siri., , NLP primarily comprises of natural language understanding\xa0(human to machine) and natural language generation\xa0(machine to human). This article will mainly deal with natural language understanding (NLU). In recent years there has been a surge in unstructured data in the form of text, videos, audio and photos. NLU aids in extracting valuable information from text such as social media data, customer surveys, and complaints., Consider the text snippet below from a customer review of

In [16]:
# Converting text to object understand by spacy
specyTXT = nlp(processed_text)

In [17]:
# Detecting sentences
sentences = list(specyTXT.sents)
len(sentences)
for sentence in sentences:
    print (sentence)

[The field of artificial intelligence has always envisioned machines being able to mimic the functioning and abilities of the human mind.
Language is considered as one of the most significant achievements of humans that has accelerated the progress of humanity.
So, it is not a surprise that there is plenty of work being done to integrate language into the field of artificial intelligence in the form of Natural Language Processing (NLP).
Today we see the work being manifested in likes of Alexa and Siri.
, , NLP primarily comprises of natural language understanding (human to machine) and natural language generation (machine to human).
This article will mainly deal with natural language understanding (NLU).
In recent years there has been a surge in unstructured data in the form of text, videos, audio and photos.
NLU aids in extracting valuable information from text such as social media data, customer surveys, and complaints.
, Consider the text snippet below from a customer review of a fi

In [18]:
# Tokenization
tokens = [token.text for token in specyTXT]
tokens[:50]

['[',
 'The',
 'field',
 'of',
 'artificial',
 'intelligence',
 'has',
 'always',
 'envisioned',
 'machines',
 'being',
 'able',
 'to',
 'mimic',
 'the',
 'functioning',
 'and',
 'abilities',
 'of',
 'the',
 'human',
 'mind',
 '.',
 'Language',
 'is',
 'considered',
 'as',
 'one',
 'of',
 'the',
 'most',
 'significant',
 'achievements',
 'of',
 'humans',
 'that',
 'has',
 'accelerated',
 'the',
 'progress',
 'of',
 'humanity',
 '.',
 'So',
 ',',
 'it',
 'is',
 'not',
 'a',
 'surprise']

In [19]:
# removing stop words and punctuation symbols
txt_rm_sw_ps =[token for token in specyTXT if not token.is_stop and not token.is_punct]
txt_rm_sw_ps[:50]

[field,
 artificial,
 intelligence,
 envisioned,
 machines,
 able,
 mimic,
 functioning,
 abilities,
 human,
 mind,
 Language,
 considered,
 significant,
 achievements,
 humans,
 accelerated,
 progress,
 humanity,
 surprise,
 plenty,
 work,
 integrate,
 language,
 field,
 artificial,
 intelligence,
 form,
 Natural,
 Language,
 Processing,
 NLP,
 Today,
 work,
 manifested,
 likes,
 Alexa,
 Siri,
 NLP,
 primarily,
 comprises,
 natural,
 language,
 understanding,
  ,
 human,
 machine,
 natural,
 language,
 generation]

In [20]:
# word frequency
from collections import Counter
word_freq = Counter(txt_rm_sw_ps)

In [21]:
# 5 commonly occurring words
common_words = word_freq.most_common(5)
print (common_words)

# after reading specyTXT it seems that language is used more than once but only this output i got

[(field, 1), (artificial, 1), (intelligence, 1), (envisioned, 1), (machines, 1)]


In [22]:
# Lemmatization
for token,i in zip(specyTXT,range(100)):
    print (token, token.lemma_)  
#printing first 100 lemmas

[ [
The the
field field
of of
artificial artificial
intelligence intelligence
has have
always always
envisioned envision
machines machine
being be
able able
to to
mimic mimic
the the
functioning functioning
and and
abilities ability
of of
the the
human human
mind mind
. .
Language language
is be
considered consider
as as
one one
of of
the the
most most
significant significant
achievements achievement
of of
humans human
that that
has have
accelerated accelerate
the the
progress progress
of of
humanity humanity
. .
So so
, ,
it -PRON-
is be
not not
a a
surprise surprise
that that
there there
is be
plenty plenty
of of
work work
being be
done do
to to
integrate integrate
language language
into into
the the
field field
of of
artificial artificial
intelligence intelligence
in in
the the
form form
of of
Natural Natural
Language Language
Processing Processing
( (
NLP NLP
) )
. .
Today today
we -PRON-
see see
the the
work work
being be
manifested manifest
in in
likes like
of of
Alexa Alexa
and 