# Vocabulary Matching

In [1]:
import spacy

In [2]:
from spacy.matcher import Matcher

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
matcher = Matcher(nlp.vocab)

In [5]:
# SolarPower
# solar-power
# Solar Power
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [6]:
matcher.add('SOLARPower', None,pattern1,pattern2,pattern3)

In [7]:
doc = nlp(u"The Solar Power industry continues to grow a solarpower increases. Solar-Power is amazing.")

In [8]:
found_matches = matcher(doc)

In [9]:
print(found_matches)

[(1710097167353574299, 1, 3), (1710097167353574299, 8, 9), (1710097167353574299, 11, 14)]


In [10]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

1710097167353574299 SOLARPower 1 3 Solar Power
1710097167353574299 SOLARPower 8 9 solarpower
1710097167353574299 SOLARPower 11 14 Solar-Power


In [11]:
matcher.remove('SOLARPower') # To remove the matcher values #
# if we want to add new mathcher values and methods for same string, we need to first remove the string and update the matcher values

In [12]:
# solarpower, SolarPower
pattern1 = [{'LOWER':'solarpower'}]
# solar.power
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

In [13]:
matcher.add('SolarPower',None,pattern1,pattern2)

In [14]:
doc2 = nlp(u"Solar--Power is very useful solarpower")

In [15]:
found_matches2 = matcher(doc2)

In [16]:
print(found_matches2)

[(8656102463236116519, 0, 3), (8656102463236116519, 6, 7)]


In [17]:
for match_id, start, end in found_matches2:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 The Solar Power
8656102463236116519 SolarPower 6 7 grow


# Phrase Matching

In [18]:
from spacy.matcher import PhraseMatcher

In [19]:
matcher = PhraseMatcher(nlp.vocab)

In [20]:
with open('D:/learning videos/NLP - Natural Language Processing with Python/1. Introduction/UPDATED_NLP_COURSE/TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [21]:
phrase_list = ['voodo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [22]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [23]:
matcher.add('EconMatcher', None, *phrase_patterns)

In [24]:
found_matches = matcher(doc3)

In [25]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [26]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 
3680293220734633682 EconMatcher 49 53 
3680293220734633682 EconMatcher 61 65 
3680293220734633682 EconMatcher 673 677 
3680293220734633682 EconMatcher 2987 2991 


# POS Tagging

In [27]:
nlp = spacy.load("en_core_web_sm")

In [28]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [29]:
print(doc[4])

jumped


In [30]:
print(doc[4].tag_)

VBD


In [31]:
print(doc[4].pos_)

VERB


In [32]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_):{10}}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective 
brown      ADJ        JJ         adjective 
fox        PROPN      NNP        noun, proper singular
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective 
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [33]:
doc = nlp(u"I read books on NLP")

In [34]:
doc.text

'I read books on NLP'

In [35]:
word = doc[1]
word.text

'read'

In [36]:
token = word

In [37]:
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [38]:
doc1 = nlp(u"I read a book on NLP")

In [39]:
word1 = doc1[1]
print(word1.text)

read


In [40]:
print(f"{word1.text:{10}} {word1.pos_:{10}} {word1.tag_:{10}} {spacy.explain(word1.tag_)}")

read       VERB       VBD        verb, past tense


In [41]:
doc2 = nlp(u"I'm reading a book on NLP")

In [42]:
word2 = doc2[2]

In [43]:
print(word2.text)

reading


In [44]:
print(f"{word2.text:{10}} {word2.pos_:{10}} {word2.tag_:{10}} {spacy.explain(word2.tag_)}")

reading    VERB       VBG        verb, gerund or present participle


In [45]:
doc3 = nlp(u"I'll read a book on NLP")

In [46]:
word3 = doc3[2]

In [47]:
print(word3.text)

read


In [48]:
print(f"{word3.text:{10}} {word3.pos_:{10}} {word3.tag_:{10}} {spacy.explain(word3.tag_)}")

read       VERB       VB         verb, base form


## Displaying POS Tagging

In [49]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [50]:
from spacy import displacy

In [51]:
displacy.render(doc, style = 'dep', jupyter = True)

In [52]:
options = {'distance':110, 'compact':'True', 'color':'yellow', 'bg':'#09a3d5', 'font':'Times'  }

In [53]:
displacy.render(doc, style = 'dep', jupyter = True, options = options)

# Named Entity Recognition

In [54]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+ '-' + ent.label_ + '-' +str(spacy.explain(ent.label_)))
    else:
        print('No Entities Found')

In [55]:
doc = nlp(u"Hi how are you?")

In [56]:
show_ents(doc)

No Entities Found


## Adding a entity to the document.

### Single Entity

In [57]:
doc1 = nlp(u"Can I go to washington, DC next May to see the Washington Monument? ")

In [58]:
show_ents(doc1)

washington-GPE-Countries, cities, states
DC-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the Washington Monument-ORG-Companies, agencies, institutions, etc.


In [59]:
doc2 = nlp(u"I have shares worth 50000 rupees of Microsoft Corp.")

In [60]:
show_ents(doc2)

50000-CARDINAL-Numerals that do not fall under another type
Microsoft Corp.-ORG-Companies, agencies, institutions, etc.


In [61]:
doc_ent1 = nlp(u"Mahindra Corp to build a factory at U.K. worth $6 Billion.")

In [62]:
show_ents(doc_ent1)

Mahindra Corp-ORG-Companies, agencies, institutions, etc.
U.K.-GPE-Countries, cities, states
$6 Billion-MONEY-Monetary values, including unit


In [63]:
doc_ent2 = nlp(u"Tata to build a factory at U.K. worth $6 Billion.")

In [64]:
show_ents(doc_ent2)

U.K.-GPE-Countries, cities, states
$6 Billion-MONEY-Monetary values, including unit


#### In the doc_ent1 the spacy entities recognised the Mahindra Corp as a {ORG} which refers to a company.
#### In the doc_ent2 the spacy entities haven't recognied the Tata as a organisation.
#### In order to make the Tata as a organisation, we need to add it.

In [65]:
from spacy.tokens import Span

In [66]:
ORG = doc.vocab.strings[u"ORG"]

In [67]:
ORG

383

In [68]:
new_ent = Span(doc_ent2, 0, 1, label = ORG)

In [69]:
doc_ent2.ents = list(doc_ent2.ents) + [new_ent]

In [70]:
show_ents(doc_ent2)

Tata-ORG-Companies, agencies, institutions, etc.
U.K.-GPE-Countries, cities, states
$6 Billion-MONEY-Monetary values, including unit


### Multiple Entity Updation

In [71]:
doc = nlp(u"Our company developed a brand new vaccum cleaner."
          u"This new vaccum-cleaner is the best in market.")

In [72]:
show_ents(doc)

No Entities Found


In [73]:
from spacy.matcher import PhraseMatcher

In [74]:
matcher = PhraseMatcher(nlp.vocab)

In [75]:
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']

In [76]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [77]:
matcher.add('newproduct', None, *phrase_patterns)

In [78]:
found_matches3 = matcher(doc)

In [79]:
found_matches3

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [80]:
from spacy.tokens import Span

In [81]:
prod = doc.vocab.strings[u"PRODUCT"]

In [82]:
found_matches3

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [83]:
new_ents = [Span(doc, match[1], match[2], label = prod) for match in found_matches3]

In [84]:
doc.ents = list(doc.ents) + new_ents

In [85]:
show_ents(doc)

vaccum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vaccum-cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)


### How many times a category is repeated

In [86]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is reduced by 10 dollars.")

In [87]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10 dollars]

In [88]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

## Visualizing the Named Entity Recognisation.

In [89]:
from spacy import displacy

In [90]:
doc = nlp(u"Over the past 5 years the sale of Ipads from Apple company increased by 60% and made a profit of $60 million"
          u"By contrast, Sony only sold 35000 Walkman music players")

In [91]:
displacy.render(doc, style = 'ent', jupyter = True)

In [92]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style = 'ent', jupyter = True)

In [93]:
options = {'ents':['PRODUCT', 'ORG']}

In [94]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

# Sentence Segmentation

In [95]:
import spacy

In [96]:
nlp = spacy.load('en_core_web_sm')

In [97]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [98]:
for sent in doc.sents:   # sents is the spacy library to split the sentences #
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [99]:
doc.sents

<generator at 0x1ecf0f4b708>

#### We cant just print the spitted sentences with just using spacy library, but we need to use the iterator to print the splitted sentences.

In [100]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence]

In [101]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

#### The generated tokens will be "span" if we print the sentences by using the list type.

In [102]:
doc = nlp(u'"Management is doing the right thing; leadership is doing the right thing." -Peter Druker')

In [103]:
doc.text

'"Management is doing the right thing; leadership is doing the right thing." -Peter Druker'

In [104]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right thing; leadership is doing the right thing."


-Peter Druker




## Adding Segmentation Rule.

In [105]:
def set_custom_boundaries(doc):
    for token in doc:
        print(token)
        print(token.i)

In [106]:
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
thing
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
thing
13
.
14
"
15
-Peter
16
Druker
17


In [107]:
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

In [108]:
nlp.add_pipe(set_custom_boundaries, before = 'parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [109]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right thing." -Peter Druker')

In [110]:
for sent in doc.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right thing."
-Peter Druker


#### Now the sentence is getting seperated by '.' as well as with ';' {'dot' as well as 'semi-colon'}

## Changing Segmentation Rule.

In [111]:
nlp = spacy.load('en_core_web_sm')

In [112]:
mystring = u"This is a sentence. This is another.\n\nThis is a\nThis is third sentence."

In [113]:
print(mystring)

This is a sentence. This is another.

This is a
This is third sentence.


In [114]:
from spacy.pipeline import SentenceSegmenter

In [115]:
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True
    yield doc[start:]

In [116]:
sbd = SentenceSegmenter(nlp.vocab, strategy = split_on_newlines)

In [117]:
nlp.add_pipe(sbd)

In [118]:
doc = nlp(mystring)

In [119]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a

This is third sentence.


# Text Classification

In [120]:
import numpy as np
import pandas as pd

In [121]:
import spacy

In [122]:
nlp = spacy.load('en_core_web_sm')

In [123]:
df = pd.read_csv("D:/learning videos/NLP - Natural Language Processing with Python/1. Introduction/UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv", sep = '\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [124]:
len(df)

2000

In [125]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [126]:
df.label.unique()

array(['neg', 'pos'], dtype=object)

In [127]:
df.dropna(inplace = True)
len(df)

1965

In [128]:
df.label.value_counts()

neg    983
pos    982
Name: label, dtype: int64

In [129]:
blanks = []
for i, lb, rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
print(len(blanks), 'blanks:',blanks )

27 blanks: [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [130]:
df.drop(blanks, inplace=True)

len(df)

1938

In [131]:
df.label.value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [132]:
from sklearn.model_selection import train_test_split

In [133]:
x = df['review']
y = df['label']

In [134]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 0)

In [135]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [136]:
# Naive Bayes
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB())])
# Linear SVC
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC())])

In [137]:
text_clf_nb.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [138]:
predictions1 = text_clf_nb.predict(x_test)

In [139]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

In [140]:
print(confusion_matrix(y_test, predictions1))

[[264   5]
 [190 123]]


In [141]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

         neg       0.58      0.98      0.73       269
         pos       0.96      0.39      0.56       313

    accuracy                           0.66       582
   macro avg       0.77      0.69      0.64       582
weighted avg       0.79      0.66      0.64       582



In [142]:
print(accuracy_score(y_test,predictions1))

0.6649484536082474


In [143]:
text_clf_lsvc.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [144]:
predictions2 = text_clf_lsvc.predict(x_test)

In [145]:
print(confusion_matrix(y_test, predictions2))

[[232  37]
 [ 45 268]]


In [146]:
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

         neg       0.84      0.86      0.85       269
         pos       0.88      0.86      0.87       313

    accuracy                           0.86       582
   macro avg       0.86      0.86      0.86       582
weighted avg       0.86      0.86      0.86       582



In [147]:
print(accuracy_score(y_test,predictions2))

0.8591065292096219


### Predicting whether the review is Positive or Negative

In [148]:
review1 = ['how do films like mouse hunt get into theatres...']
review2 = ['this has been an extraordinary year for austra']

In [149]:
print(text_clf_lsvc.predict(['review1']))
print(text_clf_lsvc.predict(['review2']))

['neg']
['neg']
