In [34]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy 

In [35]:
# Demo sentence for tokenization and stop-word removal.
text = "Natural Language Processing is fun and powerful!"

# Split the sentence into tokens (punctuation such as '!' becomes its own token).
tokens = word_tokenize(text)

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep a token only when its lowercase form is not a stop word;
# the token's original casing is preserved in the result.
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))

print("Original :", tokens)
print("Filtered: ", filtered_tokens)

Original : ['Natural', 'Language', 'Processing', 'is', 'fun', 'and', 'powerful', '!']
Filtered:  ['Natural', 'Language', 'Processing', 'fun', 'powerful', '!']


<h3>Lemmatization and stemming</h3>

In [36]:
# One stemmer and one lemmatizer instance; later cells reuse both names.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Small word list used to contrast the two normalisation approaches.
words = ["running", "playing", "better", "studies"]

# Stemming: show each word next to the stem produced by the Porter stemmer.
print("Stemming :")
for word in words:
    print(f"{word} -> {stemmer.stem(word)}")

# Lemmatization: every word is looked up as a verb (pos='v').
print("\nLemmatization : ")
for word in words:
    print(f"{word} -> {lemmatizer.lemmatize(word, pos='v')}")

Stemming :
running -> run
playing -> play
better -> better
studies -> studi

Lemmatization : 
running -> run
playing -> play
better -> better
studies -> study


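`lemmatizer.lemmatize(word, pos='v')`
With the `pos` argument you are telling the lemmatizer:

"Hey, treat this word as a verb when lemmatizing."

The accepted `pos` values are:
'n'	= noun
'v'	= verb
'a'	= adjective
'r'	= adverb

This is why "better" is returned unchanged above: it is looked up as a verb. Lemmatizing it as an adjective (`pos='a'`) gives "good".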

In [37]:
# Load spaCy's "en_core_web_sm" pipeline and run it over one example sentence.
# NOTE(review): "billon" is kept exactly as written; it is probably a typo for
# "billion" — confirm before changing, because the tagger output depends on it.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billon.")

# One row per token: text left-aligned in 12 columns, POS tag in 10, then the lemma.
for token in doc:
    print("{:<12} | POS: {:<10} | Lemma: {}".format(token.text, token.pos_, token.lemma_))


Apple        | POS: PROPN      | Lemma: Apple
is           | POS: AUX        | Lemma: be
looking      | POS: VERB       | Lemma: look
at           | POS: ADP        | Lemma: at
buying       | POS: VERB       | Lemma: buy
U.K.         | POS: PROPN      | Lemma: U.K.
startup      | POS: VERB       | Lemma: startup
for          | POS: ADP        | Lemma: for
$            | POS: SYM        | Lemma: $
1            | POS: NUM        | Lemma: 1
billon       | POS: NOUN       | Lemma: billon
.            | POS: PUNCT      | Lemma: .


In [43]:
# Load the news article from disk as UTF-8 text.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
path = "D:/NLP/Day1/newsArticle.txt"
with open(path, encoding='utf-8') as f:
    data = f.read()

# Tokenize the full article. This rebinds `tokens` and `stop_words`, which an
# earlier cell also assigns.
tokens = word_tokenize(data)
stop_words = set(stopwords.words('english'))

# Drop every token whose lowercase form is an English stop word.
filtered_news_article = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)

In [48]:
# Stemmer output for the first 15 filtered article tokens.
print("Stemming:")
for word in filtered_news_article[:15]:
    print(word, "->", stemmer.stem(word))

# Lemmatizer output for the same 15 tokens, each looked up as a verb (pos='v').
print("\n Lemmatization:")
for word in filtered_news_article[:15]:
    print(word, "->", lemmatizer.lemmatize(word, pos='v'))

# spaCy POS tags and lemmas. Only the first 30 filtered tokens are re-joined
# with single spaces and parsed, to keep the printed table short.
print("\n spaCy POS Tagging + Lemmas:")
doc = nlp(" ".join(filtered_news_article[:30]))
for token in doc:
    print("{:<12} | POS: {:<10} | Lemma: {}".format(token.text, token.pos_, token.lemma_))

Stemming:
India -> india
’ -> ’
home -> home
season -> season
: -> :
Kolkata -> kolkata
, -> ,
Delhi -> delhi
swap -> swap
Tests -> test
West -> west
Indies -> indi
South -> south
Africa -> africa
understood -> understood

 Lemmatization:
India -> India
’ -> ’
home -> home
season -> season
: -> :
Kolkata -> Kolkata
, -> ,
Delhi -> Delhi
swap -> swap
Tests -> Tests
West -> West
Indies -> Indies
South -> South
Africa -> Africa
understood -> understand

 spaCy POS Tagging + Lemmas:
India        | POS: PROPN      | Lemma: India
’            | POS: PUNCT      | Lemma: '
home         | POS: NOUN       | Lemma: home
season       | POS: NOUN       | Lemma: season
:            | POS: PUNCT      | Lemma: :
Kolkata      | POS: PROPN      | Lemma: Kolkata
,            | POS: PUNCT      | Lemma: ,
Delhi        | POS: PROPN      | Lemma: Delhi
swap         | POS: VERB       | Lemma: swap
Tests        | POS: PROPN      | Lemma: Tests
West         | POS: PROPN      | Lemma: West
Indies       | POS: PR