In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline once; the cells below all reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x184c26b02e0>

### Lemmatization
* Lemmatization converts a word to its root (dictionary) form.
* spaCy provides a robust solution for this.

In [3]:
# Lemmatization demo: run one sentence through the pipeline and
# print the `lemma_` of every token, one per line.
text = "He loved walking in the rain."
nile = nlp(text)

print("\n".join(tok.lemma_ for tok in nile))

he
love
walk
in
the
rain
.


### String to Hashes

In [4]:
# Process a sentence containing the word we want to look up.
# NOTE(review): presumably this also makes sure 'reading' is present in the
# pipeline's string store for the reverse lookup in the next cell - confirm.
lines = nlp("I love reading")

# Indexing the string store with a string yields its integer hash.
hash_no = nlp.vocab.strings["reading"]
hash_no

6597542354426545486

In [5]:
# Reverse lookup: indexing the string store with the integer hash
# (`hash_no`, computed in the previous cell) gives back the text.
string = nlp.vocab.strings[hash_no]
string

'reading'

In [6]:
# Two sentences that share some words ('bought', 'a', 'shirt'),
# so their hashes can be compared across the two Docs.
gf1 = nlp("My girlfriend bought me a shirt")
gf2 = nlp("I bought a shirt today")

# Show each token of the first sentence next to its string-store hash.
for word in gf1:
    print(word.text, '--', nlp.vocab.strings[word.text])

My -- 16248400320781648534
girlfriend -- 4668896997415073472
bought -- 5204146470106475914
me -- 18197037023634208128
a -- 11901859001352538922
shirt -- 682501080797652416


In [7]:
# Same printout for the second sentence: token text followed by its hash.
for word in gf2:
    print(word.text, '--', nlp.vocab.strings[word.text])

I -- 4690420944186131903
bought -- 5204146470106475914
a -- 11901859001352538922
shirt -- 682501080797652416
today -- 11042482332948150395


### We can clearly see that the same word gets the same hash value in both sentences, which saves memory space.

### Lexical Attributes
* Attributes such as `is_punct` and `is_space` are lexical attributes used in text preprocessing.

In [8]:
# `like_num` demo: print only the tokens that look like a number.
line = "2022 was my revolution year."

doc = nlp(line)

for number_token in filter(lambda tok: tok.like_num, doc):
    print(number_token)

2022


In [10]:
salary = "Stark's passive income monthly 3000$ where Emily's monthly income 3500$, But she earn much more than sara, 3200$"

sal = nlp(salary)

# Print every number-like token that is immediately followed by a '$' token.
for token in sal:
    if not token.like_num:
        continue

    index_token = token.i + 1  # position of the token right after the number

    # A number can be the very last token of the Doc; without this bounds
    # check, sal[index_token] would raise IndexError in that case.
    if index_token >= len(sal):
        continue

    next_token = sal[index_token]
    if next_token.text == '$':
        print(token.text)

3000
3500
3200


In [11]:
# `like_email` demo: pull the e-mail-looking tokens out of a free-form text blob.
bio = """name: Shakil age: 25 email: shakil1emperor@gmail.com
         name: Chamiya age: 23 email: chamiya.binte@gmail.com"""

bio_doc = nlp(bio)

# Keep only tokens flagged as e-mail-like and print their text, one per line.
for address in (tok.text for tok in bio_doc if tok.like_email):
    print(address)

shakil1emperor@gmail.com
chamiya.binte@gmail.com
