In [8]:
import nltk

## Sample Text and Sentence Tokenization

In [9]:
from nltk.tokenize import sent_tokenize

# Sample paragraph used by the rest of the notebook. It is assembled from
# adjacent string literals so that the trailing spaces and the eight-space
# continuation indent embedded in the text are explicit rather than hidden
# inside a triple-quoted block.
text = (
    "Machine learning algorithms build a model based on sample data, \n"
    "        known as training data, in order to make predictions or decisions \n"
    "        without being explicitly programmed to do so. Machine learning algorithms \n"
    "        are used in a wide variety of applications, such as in medicine, email \n"
    "        filtering, speech recognition, agriculture, and computer vision, where \n"
    "        it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks."
)

# Split the paragraph into sentences and print the resulting list.
tokenized_text = sent_tokenize(text)
print(tokenized_text)

['Machine learning algorithms build a model based on sample data, \n        known as training data, in order to make predictions or decisions \n        without being explicitly programmed to do so.', 'Machine learning algorithms \n        are used in a wide variety of applications, such as in medicine, email \n        filtering, speech recognition, agriculture, and computer vision, where \n        it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.']


## Word Tokenization

In [10]:
from nltk.tokenize import word_tokenize
# Break the sample paragraph into word and punctuation tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so', '.', 'Machine', 'learning', 'algorithms', 'are', 'used', 'in', 'a', 'wide', 'variety', 'of', 'applications', ',', 'such', 'as', 'in', 'medicine', ',', 'email', 'filtering', ',', 'speech', 'recognition', ',', 'agriculture', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'is', 'difficult', 'or', 'unfeasible', 'to', 'develop', 'conventional', 'algorithms', 'to', 'perform', 'the', 'needed', 'tasks', '.']


## Frequency Distribution

In [11]:
from nltk.probability import FreqDist
# Build a frequency distribution over the word tokens; printing it shows
# the number of distinct samples and the total number of outcomes.
fdist = FreqDist(tokenized_word)
print(fdist)

<FreqDist with 55 samples and 76 outcomes>


In [12]:
# Display the two most frequent tokens together with their counts.
fdist.most_common(2)

[(',', 8), ('to', 4)]

## Stopwords

In [14]:
import nltk
# Fetch the stopwords corpus into the local NLTK data directory.
nltk.download('stopwords')

from nltk.corpus import stopwords

# Hold the English stopwords in a set for the membership checks done later.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
print(stop_words)

{'shouldn', 've', 'now', "you've", 'had', 'more', 'the', 'y', 'where', 'each', 'should', 'other', 'that', 'her', 'not', 'a', 'he', 'how', 'our', 'very', 'o', 'out', 'm', 'these', "you'll", 'those', "it's", 'hadn', 'his', 'itself', 'from', 'did', 'you', "you're", 'too', 'mightn', "shouldn't", 'your', 'against', 'needn', 'between', 'with', 'during', 'down', 'himself', 'been', 'own', "hadn't", 'is', 'or', 'yours', 'of', 'all', 're', 'll', 'does', 'haven', 'couldn', "mustn't", 'am', "isn't", 'up', 'doing', 'until', 'over', "didn't", 'yourselves', 'if', 'because', 'then', "haven't", 'having', 'has', 'its', 'so', 'hasn', 'below', "won't", 'were', 'no', 'being', 'me', 'doesn', 'wouldn', 'again', 'why', 'shan', 'above', 'have', 'ain', 'ours', 'can', "wouldn't", "that'll", 'on', "weren't", 'will', "she's", 'do', 'just', "needn't", 's', 'it', 'off', "doesn't", 'ma', 'as', 'to', "should've", 'about', 'be', 'into', 'such', 'but', "you'd", "shan't", 'hers', "hasn't", 'further', 'in', "couldn't", "m

[nltk_data] Downloading package stopwords to /home/admin1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Stopwords Removal

In [18]:
# Remove stopwords from the token list.
# The entries in stop_words are lowercase, so each token is lowercased for
# the membership test only; a case-sensitive test would let capitalised
# stopwords (e.g. a sentence-initial "The") slip through. Tokens that are
# kept retain their original casing.
filtered_sent = [w for w in tokenized_word if w.lower() not in stop_words]
print("Tokenized Sentence:",tokenized_word)
print("\nFilterd Sentence:",filtered_sent)

Tokenized Sentence: ['Machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so', '.', 'Machine', 'learning', 'algorithms', 'are', 'used', 'in', 'a', 'wide', 'variety', 'of', 'applications', ',', 'such', 'as', 'in', 'medicine', ',', 'email', 'filtering', ',', 'speech', 'recognition', ',', 'agriculture', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'is', 'difficult', 'or', 'unfeasible', 'to', 'develop', 'conventional', 'algorithms', 'to', 'perform', 'the', 'needed', 'tasks', '.']

Filterd Sentence: ['Machine', 'learning', 'algorithms', 'build', 'model', 'based', 'sample', 'data', ',', 'known', 'training', 'data', ',', 'order', 'make', 'predictions', 'decisions', 'without', 'explicitly', 'programmed', '.', 'Machine', 'learning', 'algorithms', 'used', 'wide', 'variety', 'applic

## Stemming

In [21]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

# Reduce every token that survived stopword removal to its Porter stem.
stemmed_words = [ps.stem(token) for token in filtered_sent]

print("Filtered Sentence:", filtered_sent)
print("\nStemmed Sentence:", stemmed_words)

Filtered Sentence: ['Machine', 'learning', 'algorithms', 'build', 'model', 'based', 'sample', 'data', ',', 'known', 'training', 'data', ',', 'order', 'make', 'predictions', 'decisions', 'without', 'explicitly', 'programmed', '.', 'Machine', 'learning', 'algorithms', 'used', 'wide', 'variety', 'applications', ',', 'medicine', ',', 'email', 'filtering', ',', 'speech', 'recognition', ',', 'agriculture', ',', 'computer', 'vision', ',', 'difficult', 'unfeasible', 'develop', 'conventional', 'algorithms', 'perform', 'needed', 'tasks', '.']

Stemmed Sentence: ['machin', 'learn', 'algorithm', 'build', 'model', 'base', 'sampl', 'data', ',', 'known', 'train', 'data', ',', 'order', 'make', 'predict', 'decis', 'without', 'explicitli', 'program', '.', 'machin', 'learn', 'algorithm', 'use', 'wide', 'varieti', 'applic', ',', 'medicin', ',', 'email', 'filter', ',', 'speech', 'recognit', ',', 'agricultur', ',', 'comput', 'vision', ',', 'difficult', 'unfeas', 'develop', 'convent', 'algorithm', 'perform',

## Lemmatization

In [28]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the WordNet corpus before the lemmatizer is used.
nltk.download('wordnet')

lem = WordNetLemmatizer()
stem = PorterStemmer()

# Lemmatize each entry as a verb ("v").
# NOTE(review): the input is the stemmer output (stemmed_words), not the
# original tokens, so most entries are already truncated stems -- confirm
# this is the intended comparison.
lemma_word_list = [lem.lemmatize(stemmed, "v") for stemmed in stemmed_words]

print("Lemmatized Word:", lemma_word_list)
print("\nStemmed Word:", stemmed_words)

Lemmatized Word: ['machin', 'learn', 'algorithm', 'build', 'model', 'base', 'sampl', 'data', ',', 'know', 'train', 'data', ',', 'order', 'make', 'predict', 'decis', 'without', 'explicitli', 'program', '.', 'machin', 'learn', 'algorithm', 'use', 'wide', 'varieti', 'applic', ',', 'medicin', ',', 'email', 'filter', ',', 'speech', 'recognit', ',', 'agricultur', ',', 'comput', 'vision', ',', 'difficult', 'unfeas', 'develop', 'convent', 'algorithm', 'perform', 'need', 'task', '.']

Stemmed Word: ['machin', 'learn', 'algorithm', 'build', 'model', 'base', 'sampl', 'data', ',', 'known', 'train', 'data', ',', 'order', 'make', 'predict', 'decis', 'without', 'explicitli', 'program', '.', 'machin', 'learn', 'algorithm', 'use', 'wide', 'varieti', 'applic', ',', 'medicin', ',', 'email', 'filter', ',', 'speech', 'recognit', ',', 'agricultur', ',', 'comput', 'vision', ',', 'difficult', 'unfeas', 'develop', 'convent', 'algorithm', 'perform', 'need', 'task', '.']


[nltk_data] Downloading package wordnet to /home/admin1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## POS Tagging

In [30]:
# Tokenize the sample paragraph again; these tokens feed the POS tagger.
tokens = nltk.word_tokenize(text)
print(tokens)

['Machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so', '.', 'Machine', 'learning', 'algorithms', 'are', 'used', 'in', 'a', 'wide', 'variety', 'of', 'applications', ',', 'such', 'as', 'in', 'medicine', ',', 'email', 'filtering', ',', 'speech', 'recognition', ',', 'agriculture', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'is', 'difficult', 'or', 'unfeasible', 'to', 'develop', 'conventional', 'algorithms', 'to', 'perform', 'the', 'needed', 'tasks', '.']


In [32]:
import nltk
# Fetch the tagger model before calling pos_tag.
nltk.download("averaged_perceptron_tagger")

# Pair every token with its part-of-speech tag; the bare expression is the
# cell's displayed output.
nltk.pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/admin1/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Machine', 'NN'),
 ('learning', 'VBG'),
 ('algorithms', 'JJ'),
 ('build', 'VB'),
 ('a', 'DT'),
 ('model', 'NN'),
 ('based', 'VBN'),
 ('on', 'IN'),
 ('sample', 'NN'),
 ('data', 'NNS'),
 (',', ','),
 ('known', 'VBN'),
 ('as', 'IN'),
 ('training', 'NN'),
 ('data', 'NNS'),
 (',', ','),
 ('in', 'IN'),
 ('order', 'NN'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('predictions', 'NNS'),
 ('or', 'CC'),
 ('decisions', 'NNS'),
 ('without', 'IN'),
 ('being', 'VBG'),
 ('explicitly', 'RB'),
 ('programmed', 'VBN'),
 ('to', 'TO'),
 ('do', 'VB'),
 ('so', 'RB'),
 ('.', '.'),
 ('Machine', 'NNP'),
 ('learning', 'VBG'),
 ('algorithms', 'NNS'),
 ('are', 'VBP'),
 ('used', 'VBN'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('wide', 'JJ'),
 ('variety', 'NN'),
 ('of', 'IN'),
 ('applications', 'NNS'),
 (',', ','),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('in', 'IN'),
 ('medicine', 'NN'),
 (',', ','),
 ('email', 'NN'),
 ('filtering', 'NN'),
 (',', ','),
 ('speech', 'NN'),
 ('recognition', 'NN'),
 (',', ','),
 ('agriculture', 'NN'),
 (',', 