In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')  

In [2]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [3]:
for token in doc:
    print(token.text, token.lemma_)


Apple Apple
is be
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion


In [4]:
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [5]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [6]:
dod = nlp("Apple is looking very tasty lets eat it")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))


Apple 0 5 ORG Companies, agencies, institutions, etc.
U.K. 27 31 GPE Countries, cities, states
$1 billion 44 54 MONEY Monetary values, including unit


In [7]:
spacy.explain('LOC')

'Non-GPE locations, mountain ranges, bodies of water'

In [52]:
from spacy.tokens import Span
doc = nlp("i live in india")
span = Span(doc,3,4,label='GPE')
span.text

'india'

In [9]:

from spacy import displacy
# displacy.render(doc,style="dep")
displacy.render(doc,style="ent")
# displacy.serve(doc,style="ent")

In [10]:
import spacy

nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)


dog True 75.254234 False
cat True 63.188496 False
banana True 31.620354 False
afskfsd False 0.0 True


In [12]:
print(nlp.pipe_names)


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [3]:
import spacy
nlp = spacy.blank("en")
doc = nlp("Iron loved the pav which cost $25 per plate")
for token in doc:
    print(token)

Iron
loved
the
pav
which
cost
$
25
per
plate


In [8]:
tt = doc[1:5]
type(tt)

spacy.tokens.span.Span

In [14]:
t1 = doc[-3]
t1.like_num

True

In [16]:
for token in doc:
    print(token, "==>","index :",token.i,
         "is_alpha :",token.is_alpha,
         "is_currenecy :",token.is_currency,
         "is_number :",token.like_num)

Iron ==> index : 0 is_alpha : True is_currenecy : False is_number : False
loved ==> index : 1 is_alpha : True is_currenecy : False is_number : False
the ==> index : 2 is_alpha : True is_currenecy : False is_number : False
pav ==> index : 3 is_alpha : True is_currenecy : False is_number : False
which ==> index : 4 is_alpha : True is_currenecy : False is_number : False
cost ==> index : 5 is_alpha : True is_currenecy : False is_number : False
$ ==> index : 6 is_alpha : False is_currenecy : True is_number : False
25 ==> index : 7 is_alpha : False is_currenecy : False is_number : True
per ==> index : 8 is_alpha : True is_currenecy : False is_number : False
plate ==> index : 9 is_alpha : True is_currenecy : False is_number : False


In [18]:
from spacy.symbols import ORTH
nlp.tokenizer.add_special_case("gimme",[
    {ORTH:"gim"},
    {ORTH:"me"}
])
doc = nlp("gimme double cheese extra large healty pizza")
token = [token.text for token in doc]
tokenb

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healty', 'pizza']

In [20]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x20657f09cd0>

In [21]:
nlp.pipe_names

['sentencizer']

In [25]:
doc = nlp("Iron loved the pav. which cost $25 per plate")
for sentence in doc.sents:
    print(sentence)

Iron loved the pav.
which cost $25 per plate


In [28]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
url= []
ft = nlp(text)
for gg in ft:
    if gg.like_url:
        url.append(gg)

print(url)

[http://www.data.gov/, http://www.science, http://data.gov.uk/., http://www3.norc.org/gss+website/, http://www.europeansocialsurvey.org/.]


In [29]:
nlp.pipe_names

['sentencizer']

In [30]:
nlp = spacy.load("en_core_web_sm")

In [31]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x20658db4f50>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x20658db48f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2065928fb50>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2065919ad10>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x206591a2590>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2065928fe60>)]

In [34]:
ar = nlp.get_pipe('attribute_ruler')
ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"BROTHER"})
doc = nlp("Bro , you wanna go ! Brah , don't say no!")
for token in doc:
    print(token.text,"|",token.lemma_)

Bro | BROTHER
, | ,
you | you
wanna | wanna
go | go
! | !
Brah | BROTHER
, | ,
do | do
n't | not
say | say
no | no
! | !


In [35]:
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

doc = nlp(earnings_text)

filtered_tokens = []

for token in doc:
    if token.pos_ not in ["SPACE", "PUNCT", "X"]:
        filtered_tokens.append(token)

In [36]:
filtered_tokens[:10]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter]

In [37]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 13,
 92: 46,
 100: 24,
 90: 9,
 85: 16,
 93: 16,
 97: 27,
 98: 1,
 84: 20,
 103: 10,
 87: 6,
 99: 5,
 89: 12,
 86: 3,
 94: 3,
 95: 2}

In [38]:
for k,v in count.items():
    print(doc.vocab[k].text, "|",v)

PROPN | 13
NOUN | 46
VERB | 24
DET | 9
ADP | 16
NUM | 16
PUNCT | 27
SCONJ | 1
ADJ | 20
SPACE | 10
AUX | 6
SYM | 5
CCONJ | 12
ADV | 3
PART | 3
PRON | 2


In [47]:
doc = nlp("Tesla inc is going to acquire Twitter inc for $45 billion")
for ent in doc.ents:
    print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

Tesla inc | ORG | Companies, agencies, institutions, etc.
Twitter inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [48]:
from spacy import displacy
displacy.render(doc,style="ent")

In [49]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [51]:
doc = nlp("Michael Bloomberg founded Bloomberg inc in 1982")
for ent in doc.ents:
    print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

Michael Bloomberg | PERSON | People, including fictional
Bloomberg inc | ORG | Companies, agencies, institutions, etc.
1982 | DATE | Absolute or relative dates or periods


In [57]:
import spacy
nlp = spacy.load('en_core_web_md')  # Medium model with GloVe embeddings
apple_vector = nlp("apple").vector
# print(apple_vector)
print(nlp("apple").similarity(nlp("apple")))

1.0


In [69]:
import pandas as pd
import numpy as np
df = pd.read_csv('spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# df["Category"] = encoder.fit_transform(df["Category"])
# cv = CountVectorizer(decode_error="ignore")
# df["Message"].fillna("", inplace=True)
# X = cv.fit_transform(df["Message"])

In [61]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [70]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df["det_Category"] = encoder.fit_transform(df["Category"])

In [71]:
df["det_Category"]

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: det_Category, Length: 5572, dtype: int32

In [72]:
df.shape

(5572, 3)

In [73]:
df.head()

Unnamed: 0,Category,Message,det_Category
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [75]:
x = df['Message']
y = df['det_Category']

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [77]:
X_train[:4]

466     great princess! I love giving and receiving or...
4096    I ain't answerin no phone at what is actually ...
908     I.ll give her once i have it. Plus she said gr...
5320                 But we havent got da topic yet rite?
Name: Message, dtype: object

In [78]:
y_train[:4]

466     0
4096    0
908     0
5320    0
Name: det_Category, dtype: int32

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59581 stored elements and shape (4457, 7800)>

In [80]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [81]:
X_train_cv.shape

(4457, 7800)

In [83]:
v.get_feature_names_out()[2222]

'deduct'

In [88]:
v.vocabulary_

{'great': 3259,
 'princess': 5433,
 'love': 4248,
 'giving': 3171,
 'and': 964,
 'receiving': 5653,
 'oral': 4997,
 'doggy': 2409,
 'style': 6585,
 'is': 3773,
 'my': 4678,
 'fave': 2808,
 'position': 5341,
 'how': 3545,
 'about': 763,
 'you': 7759,
 'enjoy': 2618,
 'making': 4348,
 'lt': 4271,
 'gt': 3283,
 'times': 6948,
 'per': 5163,
 'night': 4793,
 'ain': 884,
 'answerin': 988,
 'no': 4811,
 'phone': 5203,
 'at': 1117,
 'what': 7527,
 'actually': 813,
 'pretty': 5421,
 'reasonable': 5640,
 'hour': 3537,
 'but': 1588,
 'sleepy': 6251,
 'll': 4183,
 'give': 3168,
 'her': 3436,
 'once': 4957,
 'have': 3381,
 'it': 3785,
 'plus': 5278,
 'she': 6098,
 'said': 5910,
 'grinule': 3270,
 'greet': 3265,
 'whenever': 7535,
 'we': 7469,
 'speak': 6387,
 'havent': 3383,
 'got': 3226,
 'da': 2150,
 'topic': 7031,
 'yet': 7746,
 'rite': 5826,
 'welcome': 7505,
 'to': 6980,
 'uk': 7170,
 'mobile': 4559,
 'date': 2182,
 'this': 6895,
 'msg': 4630,
 'free': 3006,
 'calling': 1630,
 '08719839835': 1

In [89]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [90]:
np.where(X_train_np[0]!=0)

(array([ 763,  964, 2409, 2618, 2808, 3171, 3259, 3283, 3545, 3773, 4248,
        4271, 4348, 4678, 4793, 4997, 5163, 5341, 5433, 5653, 6585, 6948,
        7759], dtype=int64),)

In [95]:
X_train_np[0][2222]

0

In [96]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [97]:
X_test_cv = v.transform(X_test)

In [98]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       0.98      0.94      0.96       139

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [99]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])