In [1]:
import spacy

In [2]:
nlp=spacy.load('en_core_web_sm')

In [4]:
#help(nlp)

In [32]:
text  = "Apple is looking for buying a U.K. startup for $1 billion"

In [33]:
doc = nlp(text)

In [34]:
# help(doc)

In [35]:
# Tokenization: print the text of every token in the processed doc
for token in doc:
    print(token.text)

Apple
is
looking
for
buying
a
U.K.
startup
for
$
1
billion


In [36]:
# Parts of speech: print each token's text next to its pos_ tag
for token in doc:
    print(token.text, token.pos_)

Apple PROPN
is AUX
looking VERB
for ADP
buying VERB
a DET
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM


In [42]:
# Token / POS listing with the token text padded to a 15-character column.
# A plain width of 15 replaces the nested `{15}` field, and the empty `:` spec
# on pos_ is dropped; the printed output is unchanged.
for token in doc:
    print(f'{token.text:15} {token.pos_}')

Apple           PROPN
is              AUX
looking         VERB
for             ADP
buying          VERB
a               DET
U.K.            PROPN
startup         NOUN
for             ADP
$               SYM
1               NUM
billion         NUM


In [43]:
#Visualization
from spacy import displacy

In [44]:
doc

Apple is looking for buying a U.K. startup for $1 billion

In [50]:
# Draw the dependency parse of doc; the options dict sets displaCy's 'distance' and 'compact' settings
displacy.render(doc,style="dep",options={'distance':100,'compact':True})

In [51]:
# Named Entity Recognition | NER

In [52]:
doc

Apple is looking for buying a U.K. startup for $1 billion

In [53]:
# List every named entity found in doc together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [54]:
displacy.render(doc,style='ent')

In [58]:
doc1 = nlp("Apple is looking for buying a U.K. startup for $1 billion on 2021")

In [59]:
displacy.render(doc1,style='ent')

In [62]:
# Sentence Segmentation

In [71]:
text = "Apple is looking for buying a U.K. startup. Government has given permission"

In [72]:
doc = nlp(text)

In [73]:
doc

Apple is looking for buying a U.K. startup. Government has given permission

In [74]:
# Iterate over the sentences detected in doc and print each one
for sent in doc.sents:
    print(sent)

Apple is looking for buying a U.K. startup.
Government has given permission


In [75]:
# Rule-based Matching (token patterns with spaCy's Matcher)

In [77]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [78]:
text = "Hello, world! hello world"

In [79]:
doc = nlp(text)

In [80]:
for token in doc:
    print(token)

Hello
,
world
!
hello
world


In [121]:
# Token pattern: "hello" (any casing), then an optional punctuation token, then "world" (any casing)
pattern = [{'LOWER':'hello'},{'IS_PUNCT':True,'OP':'?'},{'LOWER':'world'}]

In [122]:
matcher = Matcher(nlp.vocab)

In [123]:
# Register the pattern under the rule name 'id1'. Patterns are passed as a list:
# the older positional form `add(key, None, pattern)` is rejected by spaCy v3,
# while the list form is accepted by spaCy >= 2.2.2 as well.
matcher.add('id1', [pattern])

In [124]:
matches = matcher(doc)

In [125]:
matches

[(8767255518751734291, 0, 3), (8767255518751734291, 4, 6)]

In [126]:
# Each match is a (match_id, start, end) tuple: look up the rule name stored
# for the hash and slice the matched tokens out of doc
for match_id,start,end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id,span)

8767255518751734291 id1 Hello, world
8767255518751734291 id1 hello world


In [127]:
text = 'my phone number is 123. ohh its wrong one. correct one is 1234567890. call me'

In [128]:
import re

In [133]:
# \d{3}: first occurrence of three consecutive digits
re.search(r'\d{3}',text)

<re.Match object; span=(19, 22), match='123'>

In [135]:
# \d{3,9}: runs of 3 to 9 digits -- the 10-digit number is cut off after 9 digits
re.findall(r'\d{3,9}',text)

['123', '123456789']

In [136]:
# \d+: every run of one or more digits, taken as long as possible
re.findall(r'\d+',text)

['123', '1234567890']

In [137]:
# \w+: every run of word characters; spaces and punctuation act as separators
re.findall(r'\w+',text)

['my',
 'phone',
 'number',
 'is',
 '123',
 'ohh',
 'its',
 'wrong',
 'one',
 'correct',
 'one',
 'is',
 '1234567890',
 'call',
 'me']

In [144]:
# c...: a literal 'c' followed by any three characters (matches do not overlap)
re.findall(r'c...',text)

['corr', 'ct o', 'call']

In [145]:
# 2.: a '2' followed by any single character
re.findall(r'2.',text)

['23', '23']

In [149]:
# c.l: 'c' and 'l' with exactly one character between them
re.findall(r'c.l',text)

['cal']

In [151]:
# [^\d]+: runs of characters that are not digits
re.findall(r'[^\d]+',text)

['my phone number is ', '. ohh its wrong one. correct one is ', '. call me']

In [154]:
# [\D]+: \D is the non-digit shorthand, so this matches the same runs as [^\d]+
re.findall(r'[\D]+',text)

['my phone number is ', '. ohh its wrong one. correct one is ', '. call me']

In [158]:
###################################

In [161]:
texts = ['net income was $9.4 million compared to ther prior year of 2.7$ million',
       'revenue exceeds twelve billion dollars with a loss of $1b']

In [162]:
# NOTE(review): the same 'en_core_web_sm' model is already loaded into nlp at the
# top of the notebook; this second load looks redundant -- confirm before removing.
nlp = spacy.load('en_core_web_sm')

In [173]:
# %%timeit
# docs = nlp.pipe(texts,disable=['tagger','parser'])
# for doc in docs:
#     for ent in doc.ents:
#         print(ent.text,ent.label_)
#     print()

In [174]:
# %%timeit
# docs = nlp.pipe(texts)
# for doc in docs:
#     for ent in doc.ents:
#         print(ent.text,ent.label_)
#     print()

In [213]:
# Emoji used as positive / negative sentiment markers
pos_emoji = [
    "😀", "😃", "😂", "🤣", "😊", "😍",
]
neg_emoji = [
    "😞", "😔", "😭", "😫", "😡", "😒",
]

In [214]:
# One single-token pattern per positive emoji, matching on the exact token text (ORTH)
pos = [[{'ORTH': symbol}] for symbol in pos_emoji]

In [215]:
pos

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [216]:
# One single-token pattern per negative emoji, matching on the exact token text (ORTH)
neg = [[{'ORTH': symbol}] for symbol in neg_emoji]

In [217]:
neg

[[{'ORTH': '😞'}],
 [{'ORTH': '😔'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😫'}],
 [{'ORTH': '😡'}],
 [{'ORTH': '😒'}]]

In [218]:
matcher = Matcher(nlp.vocab)

In [219]:
def label_sentiment(matcher, doc, i, matches):
    """Adjust ``doc.sentiment`` according to which rule produced match ``i``.

    A 'happy' match adds 0.1 to the sentiment, a 'sad' match subtracts 0.1,
    and any other rule name leaves the sentiment untouched.

    Args:
        matcher: The matcher that fired; not used here.
        doc: Document whose ``sentiment`` attribute is updated in place and
            whose ``vocab.strings`` maps the match id back to a rule name.
        i: Index into ``matches`` of the match being handled.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    rule_hash, _, _ = matches[i]
    rule_name = doc.vocab.strings[rule_hash]
    # Signed step per rule name; names not listed here produce no change.
    step = {'happy': 0.1, 'sad': -0.1}.get(rule_name)
    if step is not None:
        doc.sentiment += step

In [220]:
# Register every positive-emoji pattern under 'happy'; label_sentiment runs on each match.
# Patterns go in as one list and the callback as on_match=, the form spaCy v3
# requires (also accepted by spaCy >= 2.2.2).
matcher.add("happy", pos, on_match=label_sentiment)

In [221]:
# Register every negative-emoji pattern under 'sad'; label_sentiment runs on each match.
# Patterns go in as one list and the callback as on_match=, the form spaCy v3
# requires (also accepted by spaCy >= 2.2.2).
matcher.add("sad", neg, on_match=label_sentiment)

In [222]:
# Rule 'HASHTAG': a '#' token followed by any ASCII-only token; no callback is attached.
# The pattern is wrapped in a list, the form spaCy v3 requires (also accepted by spaCy >= 2.2.2).
matcher.add('HASHTAG', [[{'TEXT':'#'},{'IS_ASCII':True}]])

In [223]:
doc = nlp("Hello guys 😀🤣 #kgptalkie")

In [224]:
matches = matcher(doc)

In [225]:
# For every match, resolve the rule name from the doc's string store and
# print it next to the text of the matched span
for match_id,start,end in matches:
    string_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id,span.text)

happy 😀
happy 🤣
HASHTAG #kgptalkie


In [226]:
doc.sentiment

0.20000000298023224