In [1]:
!pip install -q nltk


In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag


In [4]:
text = """
The Indian government launched a new digital health initiative.
Citizens are encouraged to use online services for better access.
"""


In [6]:
nltk.download('punkt_tab')
sentences = sent_tokenize(text)
print("Sentences:")
sentences

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentences:


['\nThe Indian government launched a new digital health initiative.',
 'Citizens are encouraged to use online services for better access.']

In [7]:
words = word_tokenize(text)
print("Words:")
words


Words:


['The',
 'Indian',
 'government',
 'launched',
 'a',
 'new',
 'digital',
 'health',
 'initiative',
 '.',
 'Citizens',
 'are',
 'encouraged',
 'to',
 'use',
 'online',
 'services',
 'for',
 'better',
 'access',
 '.']

In [8]:
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words if w.lower() not in stop_words and w.isalpha()]

filtered_words


['Indian',
 'government',
 'launched',
 'new',
 'digital',
 'health',
 'initiative',
 'Citizens',
 'encouraged',
 'use',
 'online',
 'services',
 'better',
 'access']

In [9]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(w) for w in filtered_words]

stemmed_words


['indian',
 'govern',
 'launch',
 'new',
 'digit',
 'health',
 'initi',
 'citizen',
 'encourag',
 'use',
 'onlin',
 'servic',
 'better',
 'access']

In [10]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]

lemmatized_words


['Indian',
 'government',
 'launched',
 'new',
 'digital',
 'health',
 'initiative',
 'Citizens',
 'encouraged',
 'use',
 'online',
 'service',
 'better',
 'access']

In [12]:
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = pos_tag(filtered_words)
pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('Indian', 'JJ'),
 ('government', 'NN'),
 ('launched', 'VBD'),
 ('new', 'JJ'),
 ('digital', 'JJ'),
 ('health', 'NN'),
 ('initiative', 'NN'),
 ('Citizens', 'NNP'),
 ('encouraged', 'VBD'),
 ('use', 'NN'),
 ('online', 'NN'),
 ('services', 'NNS'),
 ('better', 'JJR'),
 ('access', 'NN')]