In [1]:
!pip install spacy

!python -m spacy download en

Collecting spacy
  Downloading spacy-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 270 kB/s eta 0:00:01
[?25hCollecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4
  Downloading pydantic-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 189 kB/s eta 0:00:01
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (490 kB)
[K     |████████████████████████████████| 490 kB 5.4 MB/s eta 0:00:01
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting preshed<3.1.0,>=3.0.2
  D

Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Tokenization

In [13]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

In [14]:
type(my_doc)

spacy.tokens.doc.Doc

In [15]:
# Create list of word tokens
token_list = []

for token in my_doc:
    token_list.append(token.text)

print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [16]:
my_doc.sents

<generator at 0x7f8872d1b0e0>

In [17]:
nlp.add_pipe('sentencizer')

my_doc = nlp(text)

In [18]:
sents_list = []

for sent in my_doc.sents:
    sents_list.append(sent.text)
    
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


## Remove Stop Words

In [19]:
filtered_tokens = []

# filtering stop words and punctuations
for word in my_doc:
    if word.is_stop==False:
        if word.is_punct==False:
            filtered_tokens.append(word)

print("Filtered Sentence:",filtered_tokens)

Filtered Sentence: [learning, data, science, discouraged, 
, Challenges, setbacks, failures, journey, got]


## Lemmatization

In [22]:
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

# Implementing lemmatization
lem = nlp("run runs running runner")

# finding lemma for each word
for word in lem:
    print(word.text,"==>" ,word.lemma_)

run ==> run
runs ==> run
running ==> run
runner ==> runner


## Complete Process

In [24]:
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

my_doc = nlp(text)

filtered_tokens = []

# filtering stop words and punctuations
for word in my_doc:
    if word.is_stop==False:
        if word.is_punct==False:
            filtered_tokens.append(word)

print("Filtered Sentence:",filtered_tokens)

normalized_tokens=[]

for token in filtered_tokens:
    normalized_tokens.append(token.lemma_)
    
print("Lemmatized Tokens:",normalized_tokens)

Filtered Sentence: [learning, data, science, discouraged, 
, Challenges, setbacks, failures, journey, got]
Lemmatized Tokens: ['learn', 'data', 'science', 'discourage', '\n', 'challenge', 'setback', 'failure', 'journey', 'get']


In [26]:
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

my_doc = nlp(text)

filtered_tokens = []

# filtering stop words and punctuations
for word in my_doc:
    if word.is_stop==False:
        if word.is_punct==False:
            filtered_tokens.append(word.lemma_)
            
filtered_tokens 

['learn',
 'data',
 'science',
 'discourage',
 '\n',
 'challenge',
 'setback',
 'failure',
 'journey',
 'get']

## PoS Tagging

In [27]:
for word in my_doc:
    print(word.text,word.pos_)

When SCONJ
learning VERB
data NOUN
science NOUN
, PUNCT
you PRON
should AUX
n't PART
get AUX
discouraged VERB
! PUNCT

 SPACE
Challenges NOUN
and CCONJ
setbacks NOUN
are AUX
n't PART
failures NOUN
, PUNCT
they PRON
're AUX
just ADV
part NOUN
of ADP
the DET
journey NOUN
. PUNCT
You PRON
've AUX
got VERB
this PRON
! PUNCT


## Entity Detection

In [29]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm   

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load() 

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

In [30]:
entities=[(i, i.label_, i.label) for i in nytimes.ents]

print(entities)

[(New York City, 'GPE', 384), (Tuesday, 'DATE', 391), (At least 285, 'CARDINAL', 397), (September, 'DATE', 391), (Brooklyn, 'GPE', 384), (four, 'CARDINAL', 397), (Zip, 'PERSON', 380), (Bill de Blasio, 'PERSON', 380), (Tuesday, 'DATE', 391), (Orthodox, 'NORP', 381), (6 months old, 'DATE', 391), (up to $1,000, 'MONEY', 394)]


In [31]:
#for visualization of Entity detection importing displacy from spacy:
from spacy import displacy

displacy.render(nytimes, style = "ent",jupyter = True)

## Dependency Parsing

In [32]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm   

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load() 

#  "nlp" Object is used to create documents with linguistic annotations.
docp = nlp ("In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [33]:
#for visualization of Entity detection importing displacy from spacy:
from spacy import displacy

displacy.render(nytimes, style = "dep",jupyter = True)

## Word Vector Representation

In [34]:
# import en_core_web_sm small spacy model
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Object is used to create documents with linguistic annotations.
mango = nlp(u'mango')

print(mango.vector.shape)

(96,)


In [35]:
mango.vector

array([ 0.67121375,  0.547875  ,  1.1662204 , -0.8862695 , -0.7446909 ,
        1.4388278 , -0.0310275 ,  0.14206484,  1.4838524 , -1.1957114 ,
        0.08386576, -0.911056  ,  0.05417725, -0.10735506, -1.0579314 ,
        0.1266487 , -0.2706933 ,  1.569655  ,  0.9489083 , -0.14418542,
        0.3911935 , -0.14170304, -0.7126999 ,  1.6709299 ,  0.7659299 ,
        1.1870233 ,  0.08993122,  0.7558858 ,  0.2850564 ,  1.481607  ,
       -0.04087245, -0.34370816, -0.64876366, -1.1423168 , -0.4228799 ,
        1.2094326 , -0.03956398,  0.2813167 ,  0.54050326,  0.3834293 ,
       -0.26618952,  0.14527136, -1.0663483 , -0.21301082, -0.00739029,
       -2.210084  ,  0.540864  ,  1.0819434 ,  0.66898596, -1.1011103 ,
        0.07047433, -1.638876  , -0.7076828 , -1.7068787 ,  0.42300764,
       -0.5747217 , -1.5162356 , -1.0353892 , -0.18434292, -0.32043636,
       -0.588754  , -0.7868452 , -0.9719215 , -0.7482023 , -0.24716026,
       -0.25685   , -0.19451576,  1.3188727 , -0.08707494, -1.34

In [36]:
doc1 = nlp("I like apple.")

doc2 = nlp("I like orange.")

doc1.similarity(doc2) # Cosine Similarity

  doc1.similarity(doc2)


0.8289199322701423