In [5]:
#4) Strings to Hashes

# When a Doc is created, the words of that Doc are stored in the Vocab.

# Example: take 1000 text documents, each describing clothing items of different brands. Chances are that the words
#     "shirt" and "pants" will be very common. If spaCy stored the exact string every time the word "shirt" occurs,
#     a huge amount of memory would be wasted.

# But this doesn't happen. Why? Because spaCy hashes each string, i.e. converts it to a unique ID that is stored in the StringStore.
# The StringStore is a dictionary mapping hash values to strings, for example 10543432924755684266 -> box
# You can look up the hash value if you know the string, and vice versa. This mapping lives in nlp.vocab.strings, as shown below.

In [6]:
# Round trip through the StringStore: string -> hash value -> string
string_store = nlp.vocab.strings

word_hash = string_store["reduce"]   # look up the hash value of a word
print(word_hash)

word = string_store[word_hash]       # and recover the word from its hash value
print(word)

13655207319209475655
reduce


In [7]:
# Create two different docs that share a common word
doc1 = nlp('Raymond shirts are famous')
doc2 = nlp('I washed my shirts ')

# Print the hash value of every token, one headed section per doc
for heading, current_doc in (('-------DOC 1-------', doc1),
                             ('-------DOC 2-------', doc2)):
    print(heading)
    for token in current_doc:
        hash_value = nlp.vocab.strings[token.text]
        print(token.text, ' ', hash_value)

# You can verify that 'shirts' has the same hash value irrespective of which
# document it occurs in. This saves memory space.

-------DOC 1-------
Raymond   5945540083247941101
shirts   9181315343169869855
are   5012629990875267006
famous   17809293829314912000
-------DOC 2-------
I   4690420944186131903
washed   5520327350569975027
my   227504873216781231
shirts   9181315343169869855


In [8]:
#5) Lexical attributes

# The is_punct and is_space attributes used in Text Preprocessing are called
# 'lexical attributes'. Below are a few more significant ones.

# like_num: print every token that looks like a number
for token in doc:
    if not token.like_num:
        continue
    print(token)
        

54
19
2000
2009
15


In [9]:
# Find the number-like tokens that are immediately followed by a '%' token.
for token in doc:
    if not token.like_num:
        continue
    index_of_next_token = token.i + 1
    # A number-like token at the very end of the doc has no successor;
    # indexing doc one past the end would raise IndexError, so skip it.
    if index_of_next_token >= len(doc):
        continue
    next_token = doc[index_of_next_token]
    if next_token.text == '%':
        print(token.text)

19


In [10]:
#6) Detecting Email Addresses

# like_email: print the text of every token that looks like an e-mail address
for token in doc:
    if not token.like_email:
        continue
    print(token.text)

shank@rediffmail.com
parmar15@yahoo.com


In [11]:
#7) POS tagging helps you in dealing with text based problems.

# Tokens whose POS tag is 'X' are treated as junk here; this cell only lists them.
print('The junk values are..')
for token in doc:
    if token.pos_ != 'X':
        continue
    print(token.text)

The junk values are..
shank@rediffmail.com
parmar15@yahoo.com


In [12]:
print('After removing junk')
# Keep only the tokens whose POS tag is not 'X'.
# The result is a plain Python list of tokens.
clean_doc = [token for token in doc if token.pos_ != 'X']
print(clean_doc)

After removing junk
[The, economic, situation, of, the, country, is, on, edge, ,, as, the, stock, 
, market, crashed, causing, loss, of, millions.name, :, Shashank, age, :, 54, email, :, Citizens, who, had, their, main, 19, %, investment, 
, in, the, share, -, market, are, facing, a, great, loss, ., Many, 2000, companies, might, lay, off, 
, thousands, of, people, to, 2009, reduce, labor, cost, ,, name, :, pratham, parmar, age, :, 15, email, :, 
                 ]


In [13]:
# Map each integer POS id (token.pos) seen in doc to its string name (token.pos_)
all_tags = dict((token.pos, token.pos_) for token in doc)
print(all_tags)

{90: 'DET', 84: 'ADJ', 92: 'NOUN', 85: 'ADP', 87: 'AUX', 97: 'PUNCT', 98: 'SCONJ', 103: 'SPACE', 100: 'VERB', 96: 'PROPN', 93: 'NUM', 101: 'X', 95: 'PRON'}


In [14]:
# Visualise the dependency parse of doc (style='dep'); tokens are displayed
# together with their POS tags.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
)

In [15]:
#8) Named Entity Recognition: in "John works at Google", "John" and "Google" are the names of a person and a company.
# Such words are referred to as named entities. They are real-world objects like the name of a company, a place, etc.

# PERSON : Denotes names of people
# GPE : Denotes places like countries, cities, states.
# ORG : Denotes organizations or companies
# WORK_OF_ART : Denotes titles of books, films, songs and other arts
# PRODUCT : Denotes products such as vehicles, food items, furniture and so on.
# EVENT : Denotes historical events like wars, disasters, etc.
# LANGUAGE : All the recognized languages across the globe.
# etc.

# Bare last expression: the notebook displays the entities found in doc
doc.ents

(54, Citizens, 19%, 2000, thousands, 2009, pratham parmar age, 15)

In [16]:
# Print each entity's text next to its label
for entity in doc.ents:
    print(
        entity.text,
        '---> ',
        entity.label_,
    )

54 --->  CARDINAL
Citizens --->  PERSON
19% --->  PERCENT
2000 --->  DATE
thousands --->  CARDINAL
2009 --->  DATE
pratham parmar age --->  ORG
15 --->  CARDINAL


In [17]:
# Visualise the named entities of doc with displacy (style='ent')
displacy.render(
    doc,
    style='ent',
    jupyter=True,
)

In [18]:
# Extracting brand names with Named Entity Recognition

# Collect the text of every entity that has the label 'ORG'
list_of_org = [entity.text for entity in doc.ents if entity.label_ == 'ORG']

print(list_of_org)

['pratham parmar age']


In [19]:
#9)Rule based Matching
   #“Windows 8.0 has become outdated and slow. It’s better to update to Windows 10”. What if you want to extracts all versions
    #of Windows mentioned in the text ?
    
#     where you’ll need extract specific pattern type phrases from the text. This is called Rule-based matching.
# Rule-based matching in spacy allows you write your own rules to find or extract words and phrases in a text. 
# spacy supports three kinds of matching methods :

# Token Matcher
# Phrase Matcher
# Entity Ruler

In [None]:
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler



        
        
# The purpose of a word vector is to get a computer system to understand a word. Computers cannot understand text efficiently.
# For this reason, it is important to convert a word into numbers.