In [1]:
# Load the spaCy "en_core_web_sm" pipeline once; `nlp` is reused by the cells below.
import spacy 
nlp = spacy.load("en_core_web_sm")

# **Tokenization**

In [2]:
# Sample sentence for the tokenization demo (names are written in lower case here).
sent = (
    "My name is sahana rao. "
    "i am natural language processing student"
)

In [3]:
# Run the pipeline over the sample sentence and print every token on one line,
# each followed by a " | " separator.
x = nlp(sent)
for token in x:
    print(f"{token.text} | ", end="")

My | name | is | sahana | rao | . | i | am | natural | language | processing | student | 

# **Word Frequencies**

In [4]:
from collections import Counter

In [5]:
text = """I am Sahana Rao. I study in college. I am in Bangalore. I am in PES College in bangalore. People are very energetic. Bangalore has the highest number of IT companies."""
doc = nlp(text)
# Keep only content tokens: drop stop words and punctuation.
# NOTE(review): counting is case-sensitive, so 'Bangalore' and 'bangalore' are
# tallied as different words — lower-case the tokens if that is not intended.
words = [token.text for token in doc if not (token.is_stop or token.is_punct)]
word_freq = Counter(words)
# The five most frequent remaining tokens as (token, count) pairs.
common_words = word_freq.most_common(5)
print(common_words)

[('Bangalore', 2), ('Sahana', 1), ('Rao', 1), ('study', 1), ('college', 1)]


# **Part-of-speech tagging**

In [6]:
import spacy
text = """I am Sahana Rao. I study in college. I am in Bangalore. I am in PES College in bangalore. People are very energetic. Bangalore has the highest number of IT companies."""
# Keep the parsed Doc under its own name rather than rebinding `text`, so `text`
# stays a plain string and the two kinds of object are not confused.
pos_doc = nlp(text)
for w in pos_doc:
    # One line per token: its text and its part-of-speech label (PRON, AUX, PROPN, ...).
    print(w.text, w.pos_)

I PRON
am AUX
Sahana PROPN
Rao PROPN
. PUNCT
I PRON
study VERB
in ADP
college NOUN
. PUNCT
I PRON
am AUX
in ADP
Bangalore PROPN
. PUNCT
I PRON
am AUX
in ADP
PES PROPN
College PROPN
in ADP
bangalore PROPN
. PUNCT
People NOUN
are AUX
very ADV
energetic ADJ
. PUNCT
Bangalore PROPN
has VERB
the DET
highest ADJ
number NOUN
of ADP
IT PROPN
companies NOUN
. PUNCT


# **N-grams**

In [7]:
def n_grams(tokens, n):
    """Return every contiguous run of ``n`` items from ``tokens``.

    Args:
        tokens: A sliceable sequence, e.g. a list of token strings.
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``tokens``, each of length ``n``, ordered by
        starting position. Empty when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A zero or negative window would otherwise yield empty or truncated
        # slices instead of n-grams, hiding the caller's mistake.
        raise ValueError("n must be a positive integer")
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]
text = """I am Sahana Rao. I study in college. I am in Bangalore. I am in PES College in bangalore. People are very energetic. Bangalore has the highest number of IT companies."""
# Tokenize with the pipeline and keep only each token's text.
tokens = [token.text for token in nlp(text)]
# Show every window of three consecutive tokens (punctuation tokens included).
print(n_grams(tokens, 3))

[['I', 'am', 'Sahana'], ['am', 'Sahana', 'Rao'], ['Sahana', 'Rao', '.'], ['Rao', '.', 'I'], ['.', 'I', 'study'], ['I', 'study', 'in'], ['study', 'in', 'college'], ['in', 'college', '.'], ['college', '.', 'I'], ['.', 'I', 'am'], ['I', 'am', 'in'], ['am', 'in', 'Bangalore'], ['in', 'Bangalore', '.'], ['Bangalore', '.', 'I'], ['.', 'I', 'am'], ['I', 'am', 'in'], ['am', 'in', 'PES'], ['in', 'PES', 'College'], ['PES', 'College', 'in'], ['College', 'in', 'bangalore'], ['in', 'bangalore', '.'], ['bangalore', '.', 'People'], ['.', 'People', 'are'], ['People', 'are', 'very'], ['are', 'very', 'energetic'], ['very', 'energetic', '.'], ['energetic', '.', 'Bangalore'], ['.', 'Bangalore', 'has'], ['Bangalore', 'has', 'the'], ['has', 'the', 'highest'], ['the', 'highest', 'number'], ['highest', 'number', 'of'], ['number', 'of', 'IT'], ['of', 'IT', 'companies'], ['IT', 'companies', '.']]


# **Spelling correction**

In [8]:
!pip install contextualSpellCheck

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contextualSpellCheck
  Downloading contextualSpellCheck-0.4.3-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting editdistance==0.6.0
  Downloading editdistance-0.6.0-cp39-cp39-manylinux2010_x86_64.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.0.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.2 MB/s[0m 

In [9]:
import spacy
import contextualSpellCheck

# Reload the pipeline and register the contextual spell-check component on it.
nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)
doc = nlp('I study in colege.')

print(len(doc._.suggestions_spellCheck)) # => number of misspellings found: 1 for this sentence
print(doc._.suggestions_spellCheck)      # => {colege: 'college'}
print(doc._.outcome_spellCheck) # => corrected sentence: I study in college.

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

1
{colege: 'college'}
I study in college.


# **Noun phrase extraction**

In [10]:
text = """I am Sahana Rao. I study in college. I am in Bangalore. I am in PES College in bangalore. People are very energetic. Bangalore has the highest number of IT companies."""
doc = nlp(text)

# Lemmas of the tokens tagged NOUN (proper nouns carry a different tag and are skipped).
noun_lemmas = [token.lemma_ for token in doc if token.pos_ == "NOUN"]
print("Nouns:", noun_lemmas)

# Text of each noun chunk found in the document.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

Nouns: ['college', 'People', 'number', 'company']
Noun phrases: ['I', 'Sahana Rao', 'I', 'college', 'I', 'Bangalore', 'I', 'PES College', 'bangalore', 'People', 'Bangalore', 'the highest number', 'IT companies']


# **WordNet integration**

In [11]:
!pip install spacy-wordnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-wordnet
  Downloading spacy_wordnet-0.1.0-py2.py3-none-any.whl (652 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m652.1/652.1 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk<3.6,>=3.3
  Downloading nltk-3.5.zip (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434691 sha256=393c8db62880aa5d712461ca15f194d81914279408d08a23e7422517ce5f27f4
  Stored in directory: /root/.cache/pip/wheels/13/ae/bb/5e2a232ebaa1d2f38dd5f587e9fc4cf6ccb12758d14dac14d8
Successfully built nltk
Installing collected packages: nltk, spacy-wordnet
  Attempting u

In [12]:
import nltk
# Download the NLTK 'wordnet' package (the recorded output shows it going to /root/nltk_data).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
import spacy
# NOTE(review): WordnetAnnotator is never referenced by name in this cell; presumably
# the import is what makes the "spacy_wordnet" factory available to add_pipe —
# confirm before removing it.
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

# Start from a freshly loaded pipeline and insert the WordNet component right after the tagger.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacy_wordnet", after='tagger')

<spacy_wordnet.wordnet_annotator.WordnetAnnotator at 0x7f8894fbbdc0>

In [14]:
# Parse the word "bank" and keep its first token for the WordNet lookups below.
bank_doc = nlp('bank')
token = bank_doc[0]

In [15]:
# The `wordnet` extension on a spaCy token links it to the NLTK WordNet interface,
# giving access to synsets and lemmas. synsets() lists the synsets found for the token.
token._.wordnet.synsets()

[Synset('bank.n.01'),
 Synset('depository_financial_institution.n.01'),
 Synset('bank.n.03'),
 Synset('bank.n.04'),
 Synset('bank.n.05'),
 Synset('bank.n.06'),
 Synset('bank.n.07'),
 Synset('savings_bank.n.02'),
 Synset('bank.n.09'),
 Synset('bank.n.10')]

In [16]:
# Lemmas from the token's WordNet synsets (both noun and verb senses appear in the output).
token._.wordnet.lemmas()

[Lemma('bank.n.01.bank'),
 Lemma('depository_financial_institution.n.01.depository_financial_institution'),
 Lemma('depository_financial_institution.n.01.bank'),
 Lemma('depository_financial_institution.n.01.banking_concern'),
 Lemma('depository_financial_institution.n.01.banking_company'),
 Lemma('bank.n.03.bank'),
 Lemma('bank.n.04.bank'),
 Lemma('bank.n.05.bank'),
 Lemma('bank.n.06.bank'),
 Lemma('bank.n.07.bank'),
 Lemma('bank.n.07.cant'),
 Lemma('bank.n.07.camber'),
 Lemma('savings_bank.n.02.savings_bank'),
 Lemma('savings_bank.n.02.coin_bank'),
 Lemma('savings_bank.n.02.money_box'),
 Lemma('savings_bank.n.02.bank'),
 Lemma('bank.n.09.bank'),
 Lemma('bank.n.09.bank_building'),
 Lemma('bank.n.10.bank'),
 Lemma('bank.v.01.bank'),
 Lemma('bank.v.02.bank'),
 Lemma('bank.v.03.bank'),
 Lemma('bank.v.04.bank'),
 Lemma('bank.v.05.bank'),
 Lemma('deposit.v.02.deposit'),
 Lemma('deposit.v.02.bank'),
 Lemma('bank.v.07.bank'),
 Lemma('trust.v.01.trust'),
 Lemma('trust.v.01.swear'),
 Lemma('tr

In [17]:
# WordNet domains associated with the token. The list is not de-duplicated
# (e.g. 'banking' appears several times in the output).
token._.wordnet.wordnet_domains()

['geology',
 'skiing',
 'geography',
 'diplomacy',
 'book_keeping',
 'administration',
 'numismatics',
 'politics',
 'betting',
 'banking',
 'insurance',
 'social',
 'money',
 'finance',
 'post',
 'law',
 'commerce',
 'enterprise',
 'time_period',
 'industry',
 'economy',
 'tax',
 'philately',
 'exchange',
 'money',
 'finance',
 'betting',
 'card',
 'transport',
 'skiing',
 'town_planning',
 'money',
 'banking',
 'astronomy',
 'aviation',
 'basketball',
 'gas',
 'basketball',
 'transport',
 'aviation',
 'transport',
 'cycling',
 'banking',
 'economy',
 'book_keeping',
 'enterprise',
 'volleyball',
 'card',
 'betting',
 'banking',
 'badminton',
 'finance',
 'banking',
 'exchange',
 'banking',
 'economy']

# **Named entity recognition (NER)**

In [18]:
# Sample sentence for the NER demo (the name is capitalised here).
sent = (
    "My name is Sahana Rao. "
    "I am natural language processing student"
)

In [19]:
# Run the pipeline and list each named entity: its text, its start and end
# character offsets, and its entity label.
x = nlp(sent)
for ent in x.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Sahana Rao 11 21 PERSON


# **Lemmatization**

In [20]:
# Sample text for the lemmatization demo.
sent1 = (
    "The kids are playing in the ground. "
    "What a wonderful sight seeing and discovery. "
    "The seminar is going on,"
)

In [21]:
# Print each token's text next to its lemma.
x = nlp(sent1)
for y in x:
    print(y.text, y.lemma_)

The the
kids kid
are be
playing play
in in
the the
ground ground
. .
What what
a a
wonderful wonderful
sight sight
seeing seeing
and and
discovery discovery
. .
The the
seminar seminar
is be
going go
on on
, ,


# **Dependency parsing using spaCy**

In [22]:
# Sentence used for the dependency-parsing demo.
sent1 = "The kids are playing in the ground."

In [23]:
# Parse the sentence; the next cell reads the dependency labels from `doc`.
doc = nlp(sent1)

In [24]:
for token in doc:
    # Print the token and its dependency label (dep_), e.g. nsubj, ROOT, pobj.
    print(f"{token.text} -- {token.dep_}")

The -- det
kids -- nsubj
are -- aux
playing -- ROOT
in -- prep
the -- det
ground -- pobj
. -- punct


# **Rule-based parsing**

In [25]:
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
m = Matcher(nlp.vocab)

# One-token pattern on the LOWER attribute: it matches the word "ground" in any
# capitalisation (the sample text below contains both "ground" and "GROUND").
pattern = [{'LOWER': 'ground'}]

# Register the pattern under a key that describes what it actually matches.
m.add('GROUND', [pattern])

# Define the sentence to search
sent1 = "The kids are playing in the ground. GROUND looks big"
# Apply the matcher to the parsed document
doc = nlp(sent1)
matches = m(doc)

# Each match is a (match_id, start, end) triple; start and end slice `doc`
# to give the matched span, which is what gets printed.
for match_id, start, end in matches:
    print(doc[start:end])

ground
GROUND


# **Word similarity**

In [26]:
# Download the en_core_web_md model, then load it in place of the small pipeline.
# NOTE(review): presumably switched because the similarity demo below relies on
# word vectors — confirm.
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [27]:
words = "happy glad"

tokens = nlp(words)

# Per token: its text, whether it has a vector, the vector's norm, and the
# out-of-vocabulary flag.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# Compare the first two tokens of the parsed text.
token1 = tokens[0]
token2 = tokens[1]

print("Similarity:", token1.similarity(token2))

happy True 47.0769 False
glad True 47.30086 False
Similarity: 0.6186302304267883
