In [3]:
from bs4 import BeautifulSoup
import requests
import spacy
from spacy import displacy
import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize

We are using the Ratan Tata Wikipedia page as the source of the paragraphs we will extract.

In [4]:
# Address of the Wikipedia article that the following cells download and parse.
url = "https://en.wikipedia.org/wiki/Ratan_Tata"

In [5]:
# Download the article. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (403, 404, ...)
# here instead of letting an error page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

We will extract HTML Content from this page.

In [6]:
# Parse the downloaded response body into a searchable tree using "html.parser".
soup = BeautifulSoup(markup=page.content, features="html.parser")

Now we extract the text of a few specific paragraphs (indices 4 to 6) from the article body.

In [7]:
# Locate the main article container. find() returns None when nothing matches
# (e.g. if Wikipedia changes its markup), so fail with a clear message here
# rather than with an AttributeError on the next line.
tag = soup.find("div", class_="mw-content-ltr mw-parser-output")
if tag is None:
    raise ValueError(
        "Article body <div class='mw-content-ltr mw-parser-output'> not found; "
        "the page markup may have changed."
    )

p = tag.find_all("p")
# The notebook works on paragraphs 4, 5 and 6 of the article body, so make sure
# they exist before indexing.
if len(p) < 7:
    raise IndexError(
        f"Expected at least 7 <p> tags in the article body, found {len(p)}."
    )
text = "".join(p[i].get_text() for i in range(4, 7))

# Strip newlines so the text is one continuous string.
# NOTE: `str` shadows the built-in type; the name is kept because later cells read it.
str = text.replace("\n","")

We can't do stemming with the spaCy library, because spaCy provides lemmatization rather than stemming.
For stemming we will use the nltk library.

In [8]:
# Create the NLTK Porter stemmer instance used by the stemming cell below.
nlt = PorterStemmer()

Now we will split the extracted text into word tokens using word_tokenize.

In [9]:
# Split the extracted text into a list of word and punctuation tokens.
li = word_tokenize(text=str)

Now we stem each token using the stemmer's stem method.

In [10]:
# Print every token next to the stem the Porter stemmer produces for it.
for token in li:
    print(f"{token}  |  {nlt.stem(token)}")

Ratan  |  ratan
Tata  |  tata
was  |  wa
born  |  born
in  |  in
Bombay  |  bombay
,  |  ,
now  |  now
Mumbai  |  mumbai
,  |  ,
during  |  dure
the  |  the
British  |  british
Raj  |  raj
,  |  ,
into  |  into
a  |  a
Parsi  |  parsi
Zoroastrian  |  zoroastrian
family  |  famili
,  |  ,
on  |  on
28  |  28
December  |  decemb
1937  |  1937
.  |  .
[  |  [
8  |  8
]  |  ]
He  |  he
is  |  is
the  |  the
son  |  son
of  |  of
Naval  |  naval
Tata  |  tata
,  |  ,
who  |  who
was  |  wa
born  |  born
in  |  in
Surat  |  surat
and  |  and
later  |  later
adopted  |  adopt
into  |  into
the  |  the
Tata  |  tata
family  |  famili
,  |  ,
and  |  and
Sooni  |  sooni
Tata  |  tata
,  |  ,
the  |  the
niece  |  niec
of  |  of
Tata  |  tata
group  |  group
founder  |  founder
Jamsetji  |  jamsetji
Tata  |  tata
.  |  .
Tata  |  tata
's  |  's
biological  |  biolog
grandfather  |  grandfath
,  |  ,
Hormusji  |  hormusji
Tata  |  tata
,  |  ,
was  |  wa
a  |  a
member  |  member
of  |  of
the  |

Now we will create a spaCy pipeline object, run it on the text, and then iterate over the resulting tokens with a for loop to print their part-of-speech tags.

In [11]:
# Load the small English pipeline and run it over the extracted text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(str)

# Print the part-of-speech tag of each token, skipping anything that is not
# purely alphabetic (punctuation, numbers, ...).
alpha_tokens = (tok for tok in doc if tok.is_alpha)
for tok in alpha_tokens:
    print(f"{tok.text}  |  {tok.pos_}")
        

Ratan  |  PROPN
Tata  |  PROPN
was  |  AUX
born  |  VERB
in  |  ADP
Bombay  |  PROPN
now  |  ADV
Mumbai  |  PROPN
during  |  ADP
the  |  DET
British  |  PROPN
Raj  |  PROPN
into  |  ADP
a  |  DET
Parsi  |  ADJ
Zoroastrian  |  ADJ
family  |  NOUN
on  |  ADP
December  |  PROPN
He  |  PRON
is  |  AUX
the  |  DET
son  |  NOUN
of  |  ADP
Naval  |  PROPN
Tata  |  PROPN
who  |  PRON
was  |  AUX
born  |  VERB
in  |  ADP
Surat  |  PROPN
and  |  CCONJ
later  |  ADV
adopted  |  VERB
into  |  ADP
the  |  DET
Tata  |  PROPN
family  |  NOUN
and  |  CCONJ
Sooni  |  PROPN
Tata  |  PROPN
the  |  DET
niece  |  NOUN
of  |  ADP
Tata  |  PROPN
group  |  NOUN
founder  |  NOUN
Jamsetji  |  PROPN
Tata  |  PROPN
Tata  |  PROPN
biological  |  ADJ
grandfather  |  NOUN
Hormusji  |  PROPN
Tata  |  PROPN
was  |  AUX
a  |  DET
member  |  NOUN
of  |  ADP
the  |  DET
Tata  |  PROPN
family  |  NOUN
by  |  ADP
blood  |  NOUN
In  |  ADP
when  |  SCONJ
Tata  |  PROPN
was  |  AUX
his  |  PRON
parents  |  NOUN
separated  | 

Now we will iterate over the tokens with a for loop and print the lemma of each one.

In [12]:
# Print each alphabetic token alongside its lemma (dictionary base form).
for tok in doc:
    if not tok.is_alpha:
        continue
    print(f"{tok.text}  |  {tok.lemma_}")

Ratan  |  Ratan
Tata  |  Tata
was  |  be
born  |  bear
in  |  in
Bombay  |  Bombay
now  |  now
Mumbai  |  Mumbai
during  |  during
the  |  the
British  |  British
Raj  |  Raj
into  |  into
a  |  a
Parsi  |  parsi
Zoroastrian  |  zoroastrian
family  |  family
on  |  on
December  |  December
He  |  he
is  |  be
the  |  the
son  |  son
of  |  of
Naval  |  Naval
Tata  |  Tata
who  |  who
was  |  be
born  |  bear
in  |  in
Surat  |  Surat
and  |  and
later  |  later
adopted  |  adopt
into  |  into
the  |  the
Tata  |  Tata
family  |  family
and  |  and
Sooni  |  Sooni
Tata  |  Tata
the  |  the
niece  |  niece
of  |  of
Tata  |  Tata
group  |  group
founder  |  founder
Jamsetji  |  Jamsetji
Tata  |  Tata
Tata  |  Tata
biological  |  biological
grandfather  |  grandfather
Hormusji  |  Hormusji
Tata  |  Tata
was  |  be
a  |  a
member  |  member
of  |  of
the  |  the
Tata  |  Tata
family  |  family
by  |  by
blood  |  blood
In  |  in
when  |  when
Tata  |  Tata
was  |  be
his  |  his
parents  |

Now we will do Named Entity Recognition on the processed text.
Here we use displacy to render the recognized entities in a readable, highlighted form; no explicit loop is needed.

In [13]:
# Render the document with displacy's "ent" style to show its named entities.
displacy.render(doc,style="ent")