In [1]:
import spacy

In [2]:
# Load the small English pipeline once; the cells below reuse `nlp`.
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x1bb0cf01520>

### Lemmatization
* Lemmatization converts a word to its root (dictionary) form, called the lemma.
* spaCy provides a robust, built-in solution for this.

In [3]:
# Parse a sample sentence, then print the lemma of each token on its own line.
text = 'He loved walking in the rain.'
nile = nlp(text)

for lemma in (tok.lemma_ for tok in nile):
    print(lemma)

he
love
walk
in
the
rain
.


### String to Hashes

In [4]:
# NOTE(review): `lines` is never read afterwards; presumably processing the
# sentence registers its words in nlp.vocab so the hash -> string lookup in
# the next cell succeeds -- confirm before removing.
lines = nlp('I love reading')

# Look up the hash stored for the word 'reading'.
vocab_strings = nlp.vocab.strings
hash_no = vocab_strings['reading']
hash_no

6597542354426545486

In [5]:
# Reverse lookup: index the string store with the hash from the previous
# cell to get the original string back.
string = nlp.vocab.strings[hash_no]
string

'reading'

In [6]:
gf1 = nlp('My girlfriend bought me a shirt')
gf2 = nlp('I bought a shirt today')

# Print every word of the first sentence next to its hash in the string store.
for word in (tok.text for tok in gf1):
    hash_val = nlp.vocab.strings[word]
    print(word, '--', hash_val)

My -- 16248400320781648534
girlfriend -- 4668896997415073472
bought -- 5204146470106475914
me -- 18197037023634208128
a -- 11901859001352538922
shirt -- 682501080797652416


In [7]:
# Same listing for the second sentence, so the hashes of repeated words
# can be compared with the previous output.
for tok in gf2:
    print(tok.text, '--', nlp.vocab.strings[tok.text])

I -- 4690420944186131903
bought -- 5204146470106475914
a -- 11901859001352538922
shirt -- 682501080797652416
today -- 11042482332948150395


### We can clearly see that the same word gets the same hash value in both sentences; each string is stored only once, which saves memory.

### Lexical Attributes
* Lexical attributes such as `is_punct`, `is_space`, `like_num` and `like_email` are token-level flags that are useful in text preprocessing.

In [8]:
line = '2022 was my revolution year.'
doc = nlp(line)

# Keep only the tokens flagged as number-like and print them.
numeric_tokens = [tok for tok in doc if tok.like_num]
for tok in numeric_tokens:
    print(tok)

2022


In [9]:
salary = "Stark's passive income monthly 3000$ where Emily's monthly income 3500$, But she earn much more than sara, 3200$"

sal = nlp(salary)

# Print each number-like token that is immediately followed by a '$' token.
last_index = len(sal) - 1
for token in sal:
    if not token.like_num:
        continue
    # A number that is the final token has no successor; indexing one past
    # the end of the Doc would raise IndexError, so skip it.
    if token.i >= last_index:
        continue
    if sal[token.i + 1].text == '$':
        print(token.text)

3000
3500
3200


In [10]:
bio = """name: Shakil age: 25 email: shakil1emperor@gmail.com
         name: Chamiya age: 23 email: chamiya.binte@gmail.com"""

bio_doc = nlp(bio)

# Show only the tokens that look like e-mail addresses.
for address in (tok.text for tok in bio_doc if tok.like_email):
    print(address)

shakil1emperor@gmail.com
chamiya.binte@gmail.com


### Part-of-speech tagging

In [11]:
line = 'Shakil and Chamiya both likes to play football. Chamiya wants to talk about human life while shakil like human behaviour.'
new_line = nlp(line)

# Show each token next to its part-of-speech tag (token.pos_).
for tok in new_line:
    word, tag = tok.text, tok.pos_
    print(word, '--->', tag)

Shakil ---> PROPN
and ---> CCONJ
Chamiya ---> PROPN
both ---> PRON
likes ---> VERB
to ---> PART
play ---> VERB
football ---> NOUN
. ---> PUNCT
Chamiya ---> PROPN
wants ---> VERB
to ---> PART
talk ---> VERB
about ---> ADP
human ---> ADJ
life ---> NOUN
while ---> SCONJ
shakil ---> NOUN
like ---> ADP
human ---> ADJ
behaviour ---> NOUN
. ---> PUNCT


In [12]:
# The tag 'SCONJ' from the output above is not self-explanatory.
# spacy.explain gives a short description of a tag label.

spacy.explain('SCONJ')

'subordinating conjunction'

### Using POS tags to clean noisy text

In [13]:
text = """There are so many good movies like 12 angry man, the bucket list etc where we can
          learn many thins about life i.e. we can saw many beautiful i.e. shots about life etc."""

raw_text = nlp(text)

# First list the tokens tagged 'X' (the junk values that will be dropped) ...
junk_tokens = [tok for tok in raw_text if tok.pos_ == 'X']
for tok in junk_tokens:
    print(tok.text)

# ... then build the token list without them and display it.
print('After remove this junk values: ')
new_text = [tok for tok in raw_text if tok.pos_ != 'X']
new_text

etc
i.e.
i.e.
etc
After remove this junk values: 


[There,
 are,
 so,
 many,
 good,
 movies,
 like,
 12,
 angry,
 man,
 ,,
 the,
 bucket,
 list,
 where,
 we,
 can,
 
           ,
 learn,
 many,
 thins,
 about,
 life,
 we,
 can,
 saw,
 many,
 beautiful,
 shots,
 about,
 life,
 .]

### We can also visualize which types of tokens are present in our text

In [16]:
from spacy import displacy

text = 'M loves reading, writing is her hobby and arguing with me is her favorite thing to do'
doc = nlp(text)

# Render the 'dep' visualization of the parsed sentence inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
)

### Named-Entity recognition

In [27]:
entity = """Tony Stark owns StarkEnterprize.
            Victor Timely owns the company TimeInc"""
entity_doc = nlp(entity)

# Show the named entities the pipeline found in the text.
named_entities = entity_doc.ents
print(named_entities)

(Tony Stark, StarkEnterprize, Victor Timely, TimeInc)


In [28]:
# Pair every named entity with its category label.

for ent in entity_doc.ents:
    name, category = ent.text, ent.label_
    print(name, '--->', category)

Tony Stark ---> PERSON
StarkEnterprize ---> ORG
Victor Timely ---> PERSON
TimeInc ---> ORG


In [29]:
# Visualize the named entities of entity_doc inline, using displacy's 'ent' style.
displacy.render(entity_doc, style='ent', jupyter=True)

In [46]:
giant_company = """Steve Jobs creates Apple Inc. Bill Gates creates Microsoft Corp. 
Sergei Brinn and Larry Page creates Google LLC.
Jeff Bezos creates Amazon.com. Mark Zuckerberg creates Facebook.
Elon Musk creates SpaceX, corp."""

com_doc = nlp(giant_company)

# Collect the text of every entity labelled 'ORG'.
list_of_org = [ent.text for ent in com_doc.ents if ent.label_ == 'ORG']
print(list_of_org)

['Apple Inc.', 'Microsoft Corp.', 'Google LLC', 'Amazon.com', 'SpaceX, corp']
