In [1]:
#spaCy's main advantages are the set of features it offers, its ease of use, and the fact
#that the library is kept up to date.

In [2]:
import spacy
# nlp = spacy.load('en_core_web_sm')

In [3]:
import en_core_web_sm


In [4]:
nlp = en_core_web_sm.load()
#nlp is the language model instance returned by the imported en_core_web_sm package.
#This model is for the English language; the cells below call nlp(text) to process text with it.

In [5]:
nlp.pipe_names
#Shows the names of the components that make up the processing pipeline.
#Every text passed to nlp goes through all of these components; if only one of them is needed, the others can be disabled.

['tagger', 'parser', 'ner']

In [6]:
#Sample passage used by the sentence, token, stop-word and POS cells below.
#It is written as four string fragments, but they are joined into one single string.
about_text = ''.join([
    'Ram has a work to do but he is into something',
    ' he is working for a London-based Fintech',
    ' company. He is interested in learning',
    ' Natural Language Processing.',
])
about_doc = nlp(about_text)

#Sentence detection (next cell): the passage contains only two full stops, yet spaCy returns three sentences,
#because it also starts a new sentence before "but". So the boundaries are not decided by (.) alone.

In [7]:
#Author's impression: on this example NLTK does a better job than spaCy at sentence tokenization.
#Collect the sentence spans of the document so they can be displayed and counted.
sentences = [sentence for sentence in about_doc.sents]
sentences

[Ram has a work to do,
 but he is into something he is working for a London-based Fintech company.,
 He is interested in learning Natural Language Processing.]

In [8]:
#Number of sentences that were collected from about_doc.sents in the previous cell.
len(sentences)

3

In [9]:
#Tokenization: iterating over the document yields its tokens one by one; print the text of each.
token_texts = (word.text for word in about_doc)
for text in token_texts:
    print(text)

Ram
has
a
work
to
do
but
he
is
into
something
he
is
working
for
a
London
-
based
Fintech
company
.
He
is
interested
in
learning
Natural
Language
Processing
.


In [10]:
#Stop words
#The stop-word set is predefined per language (nothing is calculated here). It is read from the
#loaded pipeline's language defaults instead of spacy.lang.en.stop_words.STOP_WORDS, because that
#attribute chain only resolves when the spacy.lang.en submodule happens to have been imported
#already (here: as a side effect of loading the English model).
stopwords = nlp.Defaults.stop_words
len(stopwords)

326

In [11]:
#so the stopwords are removed.
list=[]
for token in about_doc:
    if not token.is_stop:
        list.append(token)
        

In [12]:
list

[Ram,
 work,
 working,
 London,
 -,
 based,
 Fintech,
 company,
 .,
 interested,
 learning,
 Natural,
 Language,
 Processing,
 .]

In [13]:
#pos tagging
for token in list:
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)

Ram --> PROPN
work --> NOUN
working --> VERB
London --> PROPN
- --> PUNCT
based --> VERB
Fintech --> PROPN
company --> NOUN
. --> PUNCT
interested --> ADJ
learning --> VERB
Natural --> PROPN
Language --> PROPN
Processing --> PROPN
. --> PUNCT


In [24]:
#Lemmatization
#Parse the passage and bind the processed result to `file` directly.
file = nlp(""" Lemmatization is a way of dealing with the fact that while words like connect, connection, connecting, connected, etc.etc. aren’t exactly the same, they all have the same essential meaning: connect. The differences in spelling have grammatical functions in spoken language, but for machine processing, those differences can be confusing, so we need a way to change all the words that are forms of the word connect into the word connect itself.""")
#Print every token next to its lemma (base form).
for tok in file:
    print (tok, "--->", tok.lemma_)
#Personal pronouns have no clear base form (unlike verbs and common nouns),
#so -PRON- is used as the lemma for all of them (see "they" in the output).

  --->  
Lemmatization ---> Lemmatization
is ---> be
a ---> a
way ---> way
of ---> of
dealing ---> deal
with ---> with
the ---> the
fact ---> fact
that ---> that
while ---> while
words ---> word
like ---> like
connect ---> connect
, ---> ,
connection ---> connection
, ---> ,
connecting ---> connect
, ---> ,
connected ---> connect
, ---> ,
etc.etc ---> etc.etc
. ---> .
are ---> be
n’t ---> not
exactly ---> exactly
the ---> the
same ---> same
, ---> ,
they ---> -PRON-
all ---> all
have ---> have
the ---> the
same ---> same
essential ---> essential
meaning ---> meaning
: ---> :
connect ---> connect
. ---> .
The ---> the
differences ---> difference
in ---> in
spelling ---> spelling
have ---> have
grammatical ---> grammatical
functions ---> function
in ---> in
spoken ---> spoken
language ---> language
, ---> ,
but ---> but
for ---> for
machine ---> machine
processing ---> processing
, ---> ,
those ---> those
differences ---> difference
can ---> can
be ---> be
confusing ---> confusing
, --->

In [15]:
#detecting word frequency
from collections import Counter

#The same passage as in the lemmatization cell is parsed again here and bound to `file`.
file = nlp(""" Lemmatization is a way of dealing with the fact that while words like connect, connection, connecting, connected, etc.etc. aren’t exactly the same, they all have the same essential meaning: connect. The differences in spelling have grammatical functions in spoken language, but for machine processing, those differences can be confusing, so we need a way to change all the words that are forms of the word connect into the word connect itself.""")

In [16]:
# Remove stop words and punctuation symbols in list comprehension format
words = [token.text for token in file if not token.is_stop and not token.is_punct]
#there are simple functions in spacy

In [17]:
freq=Counter(words)

In [18]:
freq

Counter({' ': 1,
         'Lemmatization': 1,
         'way': 2,
         'dealing': 1,
         'fact': 1,
         'words': 2,
         'like': 1,
         'connect': 4,
         'connection': 1,
         'connecting': 1,
         'connected': 1,
         'etc.etc': 1,
         'exactly': 1,
         'essential': 1,
         'meaning': 1,
         'differences': 2,
         'spelling': 1,
         'grammatical': 1,
         'functions': 1,
         'spoken': 1,
         'language': 1,
         'machine': 1,
         'processing': 1,
         'confusing': 1,
         'need': 1,
         'change': 1,
         'forms': 1,
         'word': 2})

In [19]:
#Pick out the words that occur exactly twice and print them.
#The loop variable is called `count` so it does not reuse the name of the `freq` Counter.
freqwords = [word for word, count in freq.items() if count == 2]
print(freqwords)

['way', 'words', 'differences', 'word']


In [22]:
#named entity recognition
#theory given in the document
doc = nlp("China spent over $71 billion in India in the year of 2018")
#doc.ents holds the entities found in the text; print each one's text together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

China GPE
over $71 billion MONEY
India GPE
the year of 2018 DATE
