# NLP Preprocessing Steps
### -> Tokenization
### -> Stemming
### -> Lemmatization
### -> Removing stop words
### -> Part-of-Speech Tagging

In [1]:
import nltk
import string 

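# One-time setup: uncomment and run these lines to download the NLTK resources
# this notebook relies on (tokenizer models, WordNet, stop-word list, POS tagger).
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')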

# Tokenization & Removing Stopwords

In [2]:
# input text
# NOTE: the excerpt has no space after the period in "2030.And" and "that.In",
# so those survive tokenization as the single tokens '2030.and' and 'that.in'.
text = """He said, “When you have that dream once 
it’s a dream; when you have it twice it becomes a desire. And when you see it for 
the third time consecutively, it becomes a passion, an aim and a goal”, and that is 
the passion with which I want to see this fantasy that I have for India 2030.And Abraham 
Lincoln also was a dreamer and you know, but he said one thing that makes most sense in trying 
to achieve this goal that I have dreamed for my nation. He said, “If I have six hours to cut 
down a tree, then I would spend the first four hours sharpening the axe.” There’s a great 
philosophy in that.In this era of instant gratification we just keep thinking we can achieve 
all these goals by just tweaking this, tweaking that, it’s not true. I really believe that a 
missionary zeal is required to make that quantum change, that can make 2030 of what I’m dreaming 
about right now. And just let’s look at India as a country, what a unique nation! Seriously. 
Thousands of years old of culture and tradition, many many invasions, being ruled for many years 
and we still somehow managed to maintain our identity."""

# tokenize the text
tokens = nltk.word_tokenize(text)

# convert to lower case so the stop-word comparison is case-insensitive
lowercased_tokens = [token.lower() for token in tokens]

# The stop words come back as a list; keep that list under its original name
# and build a set once so each membership test below is O(1).
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)

# remove stop words (tokens are already lowercased, so no second .lower() is needed)
filtered_tokens = [token for token in lowercased_tokens if token not in stopword_set]

# Remove all punctuation tokens.
# A token is dropped when EVERY character in it is punctuation. Testing
# `token in string.punctuation` would be a substring check against one long
# string, which lets multi-character tokens such as '--' or '...' slip through.
# The curly quotes used in the text are not part of string.punctuation, so they
# are added explicitly.
punctuation_chars = set(string.punctuation) | set("‘’“”")
filtered_tokens = [
    token
    for token in filtered_tokens
    if not all(char in punctuation_chars for char in token)
]
filtered_tokens

['said',
 'dream',
 'dream',
 'twice',
 'becomes',
 'desire',
 'see',
 'third',
 'time',
 'consecutively',
 'becomes',
 'passion',
 'aim',
 'goal',
 'passion',
 'want',
 'see',
 'fantasy',
 'india',
 '2030.and',
 'abraham',
 'lincoln',
 'also',
 'dreamer',
 'know',
 'said',
 'one',
 'thing',
 'makes',
 'sense',
 'trying',
 'achieve',
 'goal',
 'dreamed',
 'nation',
 'said',
 'six',
 'hours',
 'cut',
 'tree',
 'would',
 'spend',
 'first',
 'four',
 'hours',
 'sharpening',
 'axe.',
 'great',
 'philosophy',
 'that.in',
 'era',
 'instant',
 'gratification',
 'keep',
 'thinking',
 'achieve',
 'goals',
 'tweaking',
 'tweaking',
 'true',
 'really',
 'believe',
 'missionary',
 'zeal',
 'required',
 'make',
 'quantum',
 'change',
 'make',
 '2030',
 'dreaming',
 'right',
 'let',
 'look',
 'india',
 'country',
 'unique',
 'nation',
 'seriously',
 'thousands',
 'years',
 'old',
 'culture',
 'tradition',
 'many',
 'many',
 'invasions',
 'ruled',
 'many',
 'years',
 'still',
 'somehow',
 'managed',


# Stemming

In [3]:
# Reduce every filtered token to its Porter stem (e.g. 'becomes' -> 'becom').
stemmer = nltk.stem.PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

['said',
 'dream',
 'dream',
 'twice',
 'becom',
 'desir',
 'see',
 'third',
 'time',
 'consecut',
 'becom',
 'passion',
 'aim',
 'goal',
 'passion',
 'want',
 'see',
 'fantasi',
 'india',
 '2030.and',
 'abraham',
 'lincoln',
 'also',
 'dreamer',
 'know',
 'said',
 'one',
 'thing',
 'make',
 'sens',
 'tri',
 'achiev',
 'goal',
 'dream',
 'nation',
 'said',
 'six',
 'hour',
 'cut',
 'tree',
 'would',
 'spend',
 'first',
 'four',
 'hour',
 'sharpen',
 'axe.',
 'great',
 'philosophi',
 'that.in',
 'era',
 'instant',
 'gratif',
 'keep',
 'think',
 'achiev',
 'goal',
 'tweak',
 'tweak',
 'true',
 'realli',
 'believ',
 'missionari',
 'zeal',
 'requir',
 'make',
 'quantum',
 'chang',
 'make',
 '2030',
 'dream',
 'right',
 'let',
 'look',
 'india',
 'countri',
 'uniqu',
 'nation',
 'serious',
 'thousand',
 'year',
 'old',
 'cultur',
 'tradit',
 'mani',
 'mani',
 'invas',
 'rule',
 'mani',
 'year',
 'still',
 'somehow',
 'manag',
 'maintain',
 'ident']

# Lemmatization

In [6]:
# Map every filtered token to its WordNet lemma.
# lemmatize() receives only the token, with no part-of-speech hint, so the
# library default applies (noun, per the NLTK docs). Plural nouns are reduced
# ('hours' -> 'hour') while verb forms such as 'becomes' come back unchanged.
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
lemmatized_tokens

['said',
 'dream',
 'dream',
 'twice',
 'becomes',
 'desire',
 'see',
 'third',
 'time',
 'consecutively',
 'becomes',
 'passion',
 'aim',
 'goal',
 'passion',
 'want',
 'see',
 'fantasy',
 'india',
 '2030.and',
 'abraham',
 'lincoln',
 'also',
 'dreamer',
 'know',
 'said',
 'one',
 'thing',
 'make',
 'sense',
 'trying',
 'achieve',
 'goal',
 'dreamed',
 'nation',
 'said',
 'six',
 'hour',
 'cut',
 'tree',
 'would',
 'spend',
 'first',
 'four',
 'hour',
 'sharpening',
 'axe.',
 'great',
 'philosophy',
 'that.in',
 'era',
 'instant',
 'gratification',
 'keep',
 'thinking',
 'achieve',
 'goal',
 'tweaking',
 'tweaking',
 'true',
 'really',
 'believe',
 'missionary',
 'zeal',
 'required',
 'make',
 'quantum',
 'change',
 'make',
 '2030',
 'dreaming',
 'right',
 'let',
 'look',
 'india',
 'country',
 'unique',
 'nation',
 'seriously',
 'thousand',
 'year',
 'old',
 'culture',
 'tradition',
 'many',
 'many',
 'invasion',
 'ruled',
 'many',
 'year',
 'still',
 'somehow',
 'managed',
 'mainta

# Part-of-Speech Tagging

In [7]:
# Tag each lemmatized token with its part of speech; the result is a list of
# (token, tag) pairs.
# NOTE(review): the input is the lowercased, stop-word-filtered, lemmatized list,
# not the original sentence tokens, so the tagger sees no real sentence context.
# That is presumably why e.g. 'india' is tagged 'RB' in the output. Consider
# tagging `tokens` instead; confirm before changing.
pos_tokens = nltk.pos_tag(lemmatized_tokens)
pos_tokens

[('said', 'VBD'),
 ('dream', 'NN'),
 ('dream', 'NN'),
 ('twice', 'RB'),
 ('becomes', 'VBZ'),
 ('desire', 'JJ'),
 ('see', 'VB'),
 ('third', 'JJ'),
 ('time', 'NN'),
 ('consecutively', 'RB'),
 ('becomes', 'VBZ'),
 ('passion', 'NN'),
 ('aim', 'NN'),
 ('goal', 'NN'),
 ('passion', 'NN'),
 ('want', 'VBP'),
 ('see', 'NN'),
 ('fantasy', 'JJ'),
 ('india', 'RB'),
 ('2030.and', 'CD'),
 ('abraham', 'NN'),
 ('lincoln', 'NN'),
 ('also', 'RB'),
 ('dreamer', 'NN'),
 ('know', 'NNS'),
 ('said', 'VBD'),
 ('one', 'CD'),
 ('thing', 'NN'),
 ('make', 'VBP'),
 ('sense', 'NN'),
 ('trying', 'VBG'),
 ('achieve', 'JJ'),
 ('goal', 'NN'),
 ('dreamed', 'VBD'),
 ('nation', 'NN'),
 ('said', 'VBD'),
 ('six', 'CD'),
 ('hour', 'NN'),
 ('cut', 'NN'),
 ('tree', 'NN'),
 ('would', 'MD'),
 ('spend', 'VB'),
 ('first', 'RB'),
 ('four', 'CD'),
 ('hour', 'NN'),
 ('sharpening', 'VBG'),
 ('axe.', 'JJ'),
 ('great', 'JJ'),
 ('philosophy', 'NN'),
 ('that.in', 'NN'),
 ('era', 'NN'),
 ('instant', 'JJ'),
 ('gratification', 'NN'),
 ('keep'