[NLTK Library](https://www.nltk.org/)

## Downloading Required Resources

In [7]:
import nltk
# Fetch the Punkt tokenizer data under both package names ('punkt' and
# 'punkt_tab'); presumably the tokenizer cells below rely on one of them,
# depending on the installed NLTK version -- TODO confirm which is required.
nltk.download('punkt')
# Kept as the final bare expression: its return value is what the cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Tokenization: Word Tokenizer

In [8]:
from nltk.tokenize import word_tokenize

# Sample text; the regex and sentence tokenizer cells reuse this variable.
sentence = "Natural Language Processing, for speech and text."

# Word-level tokenization: punctuation marks come out as their own tokens.
tokens = word_tokenize(sentence)
tokens

['Natural', 'Language', 'Processing', ',', 'for', 'speech', 'and', 'text', '.']

## Tokenization: Custom Regex Tokenizer (Using Gaps)

In [9]:
from nltk.tokenize import RegexpTokenizer

# gaps=True makes the pattern describe the *separators* (runs of whitespace)
# rather than the tokens themselves, so punctuation stays glued to the
# neighbouring word.
tokenizer = RegexpTokenizer(pattern=r'\s+', gaps=True)

tokens = tokenizer.tokenize(text=sentence)
tokens

['Natural', 'Language', 'Processing,', 'for', 'speech', 'and', 'text.']

## Tokenization: Sentence Tokenizer

In [10]:
from nltk.tokenize import sent_tokenize

# Sentence-level segmentation of the sample text; the sample holds a single
# sentence, so the result is a one-element list.
sentences = sent_tokenize(text=sentence)
sentences

['Natural Language Processing, for speech and text.']

## Stemming with Porter Stemmer

In [11]:
from nltk.stem import PorterStemmer

# Inflected and prefixed forms built on "process".
words = ['processed', 'processing', 'processes', 'preprocessed']

stemmer = PorterStemmer()

# Stem every word, preserving input order.
stems = list(map(stemmer.stem, words))
stems

['process', 'process', 'process', 'preprocess']

## Lemmatization with WordNet Lemmatizer (Verb)

In [12]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus before the lemmatizer is first used
# (`nltk` itself is imported in the resource-download cell).
nltk.download('wordnet')

# Inflected forms of the verb "begin".
words = ["beginning", "began", "begun", "begins"]
lemmatizer = WordNetLemmatizer()

# Lemmatize each form as a verb.
lemmas = [
    lemmatizer.lemmatize(form, pos=wordnet.VERB)
    for form in words
]
lemmas

[nltk_data] Downloading package wordnet to /root/nltk_data...


['begin', 'begin', 'begin', 'begin']

In [13]:
# Same word list as the verb cell, now lemmatized as nouns for comparison.
# Reuses `lemmatizer`, `words` and `wordnet` from the verb cell.
lemmas = list(map(lambda form: lemmatizer.lemmatize(form, pos=wordnet.NOUN), words))
lemmas

['beginning', 'began', 'begun', 'begin']