In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
doc = nlp('Hello World!')

In [6]:
doc

Hello World!

In [7]:
for token in doc:
    print(token)

Hello
World
!


In [8]:
# Three-token rule: an optional "hello", an optional punctuation token, and a
# mandatory "world". Because both leading tokens are optional, the matcher
# reports every overlapping span that ends on "world".
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [9]:
# Build a fresh matcher on the pipeline's vocabulary and register the rule.
# The patterns are passed as a list (second argument): this signature is
# accepted by spaCy >= 2.2.2 and is the only one spaCy 3 supports, whereas the
# older positional form `add(key, None, pattern)` fails on spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [10]:
doc = nlp("Hello, world!")

In [11]:
matches = matcher(doc)

In [12]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [13]:
for token in doc:
    print(token)

Hello
,
world
!


In [14]:
# One line per match: rule hash, rule name, token bounds and the matched text.
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]  # hash -> rule name
    print(f"{match_id} {string_id} {start} {end} {span.text}")

15578876784678163569 HelloWorld 0 3 Hello, world
15578876784678163569 HelloWorld 1 3 , world
15578876784678163569 HelloWorld 2 3 world


In [15]:
text = "my phone number is 1256. Ohh its wrong! Correct one is 1256348790. call me!"

In [16]:
import re

In [17]:
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1256348790'>

In [18]:
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1256'>

In [19]:
re.findall(r'\d{3,10}', text)

['1256', '1256348790']

In [20]:
re.findall(r'\w{4,}', text)

['phone', 'number', '1256', 'wrong', 'Correct', '1256348790', 'call']

In [21]:
re.findall(r'c..', text)

['ct ', 'cal']

In [22]:
text = "this is cat but not that. i want hat and cat both"

In [23]:
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [24]:
text = 'hi thanks for watching <3'

In [25]:
re.findall(r'\d$', text)

['3']

In [26]:
text = '3 hi thanks for watching <3'

In [27]:
re.findall(r'^\d', text)

['3']

In [28]:
text

'3 hi thanks for watching <3'

In [29]:
re.findall(r'[^\d]+', text)

[' hi thanks for watching <']

In [30]:
text = 'hi 33 thanks for watching <3'

In [31]:
re.findall(r'[^\D]+', text)

['33', '3']

In [84]:
text = "you can get free-videos on youtube"

In [85]:
re.findall(r'[\w]+-[\w]+', text)

['free-videos']

In [34]:
text = "Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google."

In [35]:
text

'Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google.'

In [36]:
# Exact four-token sequence "Google", "I", "/", "O" (verbatim token text).
pattern = [
    {"TEXT": "Google"},
    {"TEXT": "I"},
    {"TEXT": "/"},
    {"TEXT": "O"},
]

In [37]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match at index ``i``.

    Args:
        matcher: The matcher that produced the match (unused).
        doc: The document being matched; sliced with the match bounds.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first, last = matches[i]
    print(doc[first:last].text)

In [38]:
matcher = Matcher(nlp.vocab)

In [39]:
# Register the rule using the list-of-patterns signature with the callback as
# the `on_match` keyword. spaCy >= 2.2.2 accepts this form and spaCy 3 requires
# it; the older `add(key, callback, pattern)` call fails on spaCy 3.
matcher.add('Google', [pattern], on_match=callback_method)

In [40]:
doc = nlp(text)

In [41]:
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 6, 10), (11578853341595296054, 10, 14)]

In [42]:
# "Google" is required; each of "I", "/", "O" is optional, so the matcher also
# reports the shorter prefixes ("Google", "Google I", "Google I/").
pattern = [
    {"TEXT": "Google"},
    {"TEXT": "I", "OP": "?"},
    {"TEXT": "/", "OP": "?"},
    {"TEXT": "O", "OP": "?"},
]

In [43]:
def callback_method(matcher, doc, i, matches):
    """on_match hook that prints the matched span's text.

    This re-declares the callback from the earlier cell with the same
    behaviour: look up match ``i`` and print the text it covers in ``doc``.

    Args:
        matcher: Matcher invoking the hook (not used).
        doc: Document the match was found in.
        i: Position of the current match within ``matches``.
        matches: All ``(match_id, start, end)`` tuples found so far.
    """
    _match_id, span_start, span_end = matches[i]
    matched_span = doc[span_start:span_end]
    print(matched_span.text)

In [44]:
# Start from an empty matcher so only the optional-token rule is active, then
# register it with the list-of-patterns signature and the callback passed as
# `on_match` (accepted by spaCy >= 2.2.2, required by spaCy 3).
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [45]:
doc = nlp(text)

In [46]:
matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


[(11578853341595296054, 0, 1),
 (11578853341595296054, 6, 7),
 (11578853341595296054, 6, 8),
 (11578853341595296054, 6, 9),
 (11578853341595296054, 6, 10),
 (11578853341595296054, 10, 11),
 (11578853341595296054, 10, 12),
 (11578853341595296054, 10, 13),
 (11578853341595296054, 10, 14),
 (11578853341595296054, 23, 24)]

In [47]:
!python -m spacy info

[1m

spaCy version    2.3.2                         
Location         C:\Users\subru\Anaconda3\lib\site-packages\spacy
Platform         Windows-10-10.0.18362-SP0     
Python version   3.7.1                         
Models                                         



In [48]:
matcher = Matcher(nlp.vocab)

In [49]:
matched_sents = []

In [50]:
# "facebook" (case-insensitive), a token whose lemma is "be", any number of
# adverbs (including none), then an adjective.
pattern = [
    {"LOWER": "facebook"},
    {"LEMMA": "be"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]

In [52]:
def callback_method_fb(matcher, doc, i, matches):
    """Record the sentence containing match ``i`` in ``matched_sents``.

    The appended entry has the sentence text plus a single ``'MATCH'`` entity
    whose character offsets are relative to the start of that sentence, i.e.
    the dict layout passed to ``displacy.render(..., manual=True)`` later on.

    Args:
        matcher: Matcher invoking the hook (not used).
        doc: Document the match was found in.
        i: Position of the current match within ``matches``.
        matches: All ``(match_id, start, end)`` tuples.

    Side effects:
        Appends one dict to the module-level ``matched_sents`` list.
    """
    _, start, end = matches[i]
    span = doc[start:end]
    sent = span.sent

    # Shift document-level character offsets so they index into sent.text.
    highlight = {
        'start': span.start_char - sent.start_char,
        'end': span.end_char - sent.start_char,
        'label': 'MATCH',
    }
    matched_sents.append({'text': sent.text, 'ents': [highlight]})

In [53]:
# List-of-patterns signature with the callback as `on_match`: accepted by
# spaCy >= 2.2.2 and required by spaCy 3, unlike `add(key, callback, pattern)`.
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [54]:
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")

In [55]:
matches = matcher(doc)

In [56]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [57]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '– Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [58]:
displacy.render(matched_sents, style='ent', manual = True)

In [59]:
# Phone number shaped like "(ddd) dddd-dddd"; the dash between the two
# four-digit groups is optional.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "dddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]

In [60]:
# Fresh matcher holding only the phone-number rule. Patterns are passed as a
# list, the signature accepted by spaCy >= 2.2.2 and required by spaCy 3
# (the positional `None, pattern` form fails on spaCy 3).
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", [pattern])

In [61]:
doc = nlp("Call me at (123) 4560-7890")

In [62]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '-', '7890']


In [63]:
matches = matcher(doc)
matches

[(7978097794922043545, 3, 9)]

In [64]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 4560-7890


In [91]:
# Single-token rule: the token text must contain "<chars>@<chars>", where the
# allowed characters are ASCII letters, digits, "_", "." and "-".
# The hyphen sits last in each character class so it is unambiguously a
# literal "-" rather than reading as a "9-_" range; the set of matched
# characters is unchanged in Python. A raw string keeps the regex free of
# string-escape processing.
pattern = [{"TEXT": {"REGEX": r"[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.-]+"}}]

In [92]:
# Fresh matcher holding only the e-mail rule. Patterns are passed as a list,
# the signature accepted by spaCy >= 2.2.2 and required by spaCy 3
# (the positional `None, pattern` form fails on spaCy 3).
matcher = Matcher(nlp.vocab)
matcher.add("Email", [pattern])

In [93]:
text = "Email me at email2me@gmail.com and talk.me@gmail.com"

In [94]:
doc = nlp(text)

In [95]:
matches = matcher(doc)

In [96]:
matches

[(11010771136823990775, 3, 4), (11010771136823990775, 5, 6)]

In [90]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

In [72]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

In [73]:
pos_emoji

['😀', '😃', '😂', '🤣', '😊', '😍']

In [74]:
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [75]:
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [76]:
neg_patterns

[[{'ORTH': '😞'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😩'}],
 [{'ORTH': '😢'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😒'}]]

In [77]:
def label_sentiment(matcher, doc, i, matches):
    """Nudge ``doc.sentiment`` according to the rule that produced match ``i``.

    A match of the ``'HAPPY'`` rule adds 0.1 to ``doc.sentiment``; a match of
    the ``'SAD'`` rule subtracts 0.1; any other rule leaves it untouched.

    Args:
        matcher: Matcher invoking the hook (not used).
        doc: Document being matched; its ``sentiment`` attribute is updated
            and its ``vocab.strings`` resolves the match id to a rule name.
        i: Position of the current match within ``matches``.
        matches: All ``(match_id, start, end)`` tuples.
    """
    match_id, _, _ = matches[i]
    rule_name = doc.vocab.strings[match_id]
    if rule_name not in ('HAPPY', 'SAD'):
        return
    if rule_name == 'SAD':
        doc.sentiment -= 0.1
    else:
        doc.sentiment += 0.1

In [78]:
matcher = Matcher(nlp.vocab)

In [79]:
# Each rule gets its whole list of single-emoji patterns plus the shared
# sentiment callback. Passing the list directly with `on_match=` is the
# signature accepted by spaCy >= 2.2.2 and required by spaCy 3; the older
# `add(key, callback, *patterns)` form fails on spaCy 3.
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
matcher.add('SAD', neg_patterns, on_match=label_sentiment)

In [80]:
# Hashtag rule: a "#" token followed by one ASCII token, with no callback.
# The single pattern is wrapped in a list, the signature accepted by
# spaCy >= 2.2.2 and required by spaCy 3.
matcher.add('HASHTAG', [[{'TEXT': '#'}, {'IS_ASCII': True}]])

In [81]:
doc = nlp("Hello world 😀 #Emoji")

In [82]:
matches = matcher(doc)

In [83]:
# One line per match: the rule name followed by the matched text.
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = doc.vocab.strings[match_id]  # resolve the hash to the rule name
    print(f"{string_id} {span.text}")

HAPPY 😀
HASHTAG #Emoji


In [97]:
from spacy.matcher import PhraseMatcher

In [98]:
matcher = PhraseMatcher(nlp.vocab)

In [99]:
terms = ['BARAC OBAMA', 'ANGELA MERKEL', 'WASHINGTON D.C.']

In [100]:
# Turn every term into a Doc via nlp.make_doc; these Docs are the phrase
# patterns registered with the PhraseMatcher.
pattern = list(map(nlp.make_doc, terms))

In [101]:
pattern

[BARAC OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [102]:
# Register all phrase Docs under one key. The list is passed directly as the
# second argument, the signature accepted by spaCy >= 2.2.2 and required by
# spaCy 3; the older `add(key, None, *docs)` form fails on spaCy 3.
matcher.add('term', pattern)

In [103]:
doc = nlp("German Chancellor ANGELA MERKEL and US President BARAC OBAMA "
          "converse in the Oval Office inside the White House in WASHINGTON D.C.")

In [104]:
doc

German Chancellor ANGELA MERKEL and US President BARAC OBAMA converse in the Oval Office inside the White House in WASHINGTON D.C.

In [105]:
matches = matcher(doc)

In [106]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

ANGELA MERKEL
BARAC OBAMA
WASHINGTON D.C.


In [107]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 19, 21)]

In [108]:
from spacy.pipeline import EntityRuler

In [109]:
nlp = spacy.load('en_core_web_sm')

In [110]:
ruler = EntityRuler(nlp)

In [113]:
# EntityRuler rules: one phrase pattern (a plain string) labelled ORG and one
# token pattern (case-insensitive "san francisco") labelled GPE.
patterns = [
    {
        "label": "ORG",
        "pattern": "Google Inc",
    },
    {
        "label": "GPE",
        "pattern": [
            {"LOWER": "san"},
            {"LOWER": "francisco"},
        ],
    },
]

In [114]:
patterns

[{'label': 'ORG', 'pattern': 'Google Inc'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [115]:
ruler.add_patterns(patterns)

In [116]:
nlp.add_pipe(ruler)

In [117]:
doc = nlp("Google Inc is opening its first big office in San Francisco.")

In [118]:
doc

Google Inc is opening its first big office in San Francisco.

In [119]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Google Inc ORG
first ORDINAL
San Francisco GPE
