### Sample Sentences

In [8]:
sentence1 = "I will walk 500 miles and I would walk 500 more. Just to be the man who walks " + \
            "a thousand miles to fall down at your door!"
sentence2 = "I played the play playfully as the players were playing in the play with playfullness"

### Tokenization

In [9]:
!pip install nltk
from nltk import word_tokenize, sent_tokenize

print('Tokenized words:', word_tokenize(sentence1))
print('\nTokenized sentences:', sent_tokenize(sentence1))

Tokenized words: ['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', '.', 'Just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'to', 'fall', 'down', 'at', 'your', 'door', '!']

Tokenized sentences: ['I will walk 500 miles and I would walk 500 more.', 'Just to be the man who walks a thousand miles to fall down at your door!']



[notice] A new release of pip available: 22.3 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### POS Tagging

In [10]:
from nltk import pos_tag

token = word_tokenize(sentence1) + word_tokenize(sentence2)
tagged = pos_tag(token)                 

print("Tagging Parts of Speech:", tagged)

Tagging Parts of Speech: [('I', 'PRP'), ('will', 'MD'), ('walk', 'VB'), ('500', 'CD'), ('miles', 'NNS'), ('and', 'CC'), ('I', 'PRP'), ('would', 'MD'), ('walk', 'VB'), ('500', 'CD'), ('more', 'JJR'), ('.', '.'), ('Just', 'NNP'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('man', 'NN'), ('who', 'WP'), ('walks', 'VBZ'), ('a', 'DT'), ('thousand', 'NN'), ('miles', 'NNS'), ('to', 'TO'), ('fall', 'VB'), ('down', 'RP'), ('at', 'IN'), ('your', 'PRP$'), ('door', 'NN'), ('!', '.'), ('I', 'PRP'), ('played', 'VBD'), ('the', 'DT'), ('play', 'NN'), ('playfully', 'RB'), ('as', 'IN'), ('the', 'DT'), ('players', 'NNS'), ('were', 'VBD'), ('playing', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('play', 'NN'), ('with', 'IN'), ('playfullness', 'NN')]


### Stop-Words Removal

In [11]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

token = word_tokenize(sentence1)
cleaned_token = []

for word in token:
    if word not in stop_words:
        cleaned_token.append(word)

print('Unclean version:', token)
print('\nCleaned version:', cleaned_token)

Unclean version: ['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', '.', 'Just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'to', 'fall', 'down', 'at', 'your', 'door', '!']

Cleaned version: ['I', 'walk', '500', 'miles', 'I', 'would', 'walk', '500', '.', 'Just', 'man', 'walks', 'thousand', 'miles', 'fall', 'door', '!']


### Stemming

In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

token = word_tokenize(sentence2)

stemmed = [stemmer.stem(word) for word in token]
print(" ".join(stemmed))

i play the play play as the player were play in the play with playful


### Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

token = word_tokenize(sentence2)

lemmatized_output = [lemmatizer.lemmatize(word) for word in token]
print(" ".join(lemmatized_output))

I played the play playfully a the player were playing in the play with playfullness


In [14]:
# Certainly! Let's go through each step in detail:

# Tokenization
# Tokenization is the process of breaking down text into individual words or sentences, referred to as tokens. In this code snippet, the word_tokenize function from NLTK (Natural Language Toolkit) is used to tokenize the input sentences.

# word_tokenize(sentence1) tokenizes the words in sentence1 and returns a list of individual words.
# sent_tokenize(sentence1) tokenizes the sentences in sentence1 and returns a list of individual sentences.
# POS Tagging
# POS (Part-of-Speech) tagging is the process of assigning grammatical tags to words based on their role and context in a sentence. The pos_tag function from NLTK is used to perform POS tagging on the tokens obtained from the previous step.

# pos_tag(token) takes a list of tokens as input and returns a list of tuples, where each tuple contains a word and its corresponding POS tag.
# Stop-Words Removal
# Stop words are commonly used words (e.g., "the", "is", "in") that do not carry significant meaning and are often removed from text during preprocessing. The NLTK library provides a predefined list of stop words for different languages. In this step, stop words are removed from the tokens obtained from the first step.

# stopwords.words('english') retrieves the list of English stop words from NLTK.
# The tokenized sentence is iterated, and each word is checked against the stop word list. If the word is not a stop word, it is added to the cleaned_token list.
# Stemming
# Stemming is the process of reducing words to their base or root form (e.g., "playing" to "play"). In this step, the Porter stemmer algorithm from NLTK is used to perform stemming on the tokens obtained from the input sentence.

# PorterStemmer() creates an instance of the PorterStemmer class.
# The tokenized sentence is iterated, and each word is stemmed using the stem() method of the stemmer object. The stemmed words are collected in the stemmed list.
# Lemmatization
# Lemmatization is the process of reducing words to their base or dictionary form (e.g., "playing" to "play"). In this step, the WordNet lemmatizer from NLTK is used to perform lemmatization on the tokens obtained from the input sentence.

# WordNetLemmatizer() creates an instance of the WordNetLemmatizer class.
# The tokenized sentence is iterated, and each word is lemmatized using the lemmatize() method of the lemmatizer object. The lemmatized words are collected in the lemmatized_output list.
# Each step in the code performs a specific text preprocessing task, such as tokenization, POS tagging, stop-word removal, stemming, or lemmatization. These tasks are commonly used in natural language processing and text analysis to transform raw text into a more structured format that can be used for further analysis or modeling.