<a href="https://colab.research.google.com/github/Tiwari666/NLP/blob/main/NLP_BASIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Top 14 NLTK preprocessing steps


#1. Tokenization

In [31]:
import nltk
nltk.download('punkt')

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

print("Tokens:", tokens)

Tokens: ['The', 'quick', 'brown', 'fox', 'is', 'jumping', 'over', 'the', 'lazy', 'dog', ',', 'which', 'is', 'located', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'because', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', "n't", 'move', 'quickly', 'enough', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#2. Lowercasing:
Converte all text to lowercase, making it case-insensitive.

In [32]:
# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# lowercase the tokens
lowercased_tokens = [token.lower() for token in tokens]

print("Lowercased tokens:", lowercased_tokens)

Lowercased tokens: ['the', 'quick', 'brown', 'fox', 'is', 'jumping', 'over', 'the', 'lazy', 'dog', ',', 'which', 'is', 'located', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'because', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', "n't", 'move', 'quickly', 'enough', '.']


#3. Remove punctuation
Removing punctuation marks simplifies the text and make it easier to process.

In [33]:
import nltk
import string

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# remove punctuation
filtered_tokens = [token for token in tokens if token not in string.punctuation]

print("Tokens without punctuation:", filtered_tokens)

Tokens without punctuation: ['The', 'quick', 'brown', 'fox', 'is', 'jumping', 'over', 'the', 'lazy', 'dog', 'which', 'is', 'located', 'at', 'http', '//example.com', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'because', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', "n't", 'move', 'quickly', 'enough']


#4. Remove stop words
Removing common words that do not add significant meaning to the text, such as “a,” “an,” and “the.”

To remove common stop words from a list of tokens using NLTK, one can use the nltk.corpus.stopwords.words() function to get a list of stopwords in a specific language and filter the tokens using this list. Example:

In [34]:
import nltk
nltk.download('stopwords')

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# get list of stopwords in English
stopwords = nltk.corpus.stopwords.words("english")

# remove stopwords
filtered_tokens = [token for token in tokens if token.lower() not in stopwords]

print("Tokens without stopwords:", filtered_tokens)

Tokens without stopwords: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog', ',', 'located', 'http', ':', '//example.com', ',', "n't", 'catch', 'fish', 'quick', 'lazy', 'dog', "n't", 'move', 'quickly', 'enough', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#5. Remove extra whitespace:

To remove extra white space within a text string with NLTK, one may employ string.strip() to eliminate leading and trailing whitespace, while string.replace() can be utilized to substitute multiple consecutive whitespace characters with a single space.

In [35]:
import nltk
import string

# input text with extra white space
text = "   The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough.   "

# remove leading and trailing white space
text = text.strip()

# replace multiple consecutive white space characters with a single space
text = " ".join(text.split())

print("Cleaned text:", text)

Cleaned text: The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough.


#6. Remove URLs
To remove URLs from a string of text using NLTK, one can use a regular expression pattern to identify URLs and replace them with an empty string. Here is an example of how to do this:

In [36]:
import nltk
import re

# input text with URLs
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# define a regular expression pattern to match URLs
pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# replace URLs with an empty string
cleaned_text = re.sub(pattern, "", text)

print("Text without URLs:", cleaned_text)

Text without URLs: The quick brown fox is jumping over the lazy dog, which is located at , but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough.


#7. Remove HTML code:
To remove HTML code from a string of text using NLTK, one can use a regular expression pattern to identify HTML tags and replace them with an empty string.

In [37]:
import nltk
import re

# input text with HTML code
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# define a regular expression pattern to match HTML tags
pattern = r"<[^>]+>"

# replace HTML tags with an empty string
cleaned_text = re.sub(pattern, "", text)

print("Text without HTML code:", cleaned_text)

Text without HTML code: The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough.


#8. Remove frequent words:

To remove frequent words (also known as “high-frequency words”) from a list of tokens using NLTK, one can use the nltk.FreqDist() function to calculate the frequency of each word and filter out the most common ones.

In [38]:
import nltk

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# calculate the frequency of each word
fdist = nltk.FreqDist(tokens)

# remove the most common words (e.g., the top 10% of words by frequency)
filtered_tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

print("Tokens without frequent words:", filtered_tokens)

Tokens without frequent words: ['The', 'quick', 'brown', 'fox', 'is', 'jumping', 'over', 'the', 'lazy', 'dog', ',', 'which', 'is', 'located', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'because', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', "n't", 'move', 'quickly', 'enough', '.']


#9)  Spelling correction
Correcting misspelt words is sometimes important so that the meaning of a sentence can be interpreted later in the processing.

To perform spelling correction on a list of tokens using NLTK, one can use the nltk.corpus.words.words() function to get a list of English words and the nltk.edit_distance() function to calculate the edit distance between a word and the words in the list.

In [39]:
import nltk
nltk.download('words')

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish beacuse it was too quiick and the lazy dog didn't move quickiy enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# get list of English words
words = nltk.corpus.words.words()

# correct spelling of each word
corrected_tokens = []
for token in tokens:
    # find the word with the lowest edit distance
    corrected_token = min(words, key=lambda x: nltk.edit_distance(x, token))
    corrected_tokens.append(corrected_token)

print("Corrected tokens:", corrected_tokens)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Corrected tokens: ['The', 'quick', 'brown', 'fox', 'is', 'bumping', 'over', 'the', 'lazy', 'dog', 'A', 'which', 'is', 'lobated', 'at', 'atap', 'A', 'example', 'A', 'but', 'it', 'did', 'nat', 'catch', 'the', 'fish', 'Aeacus', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', 'nat', 'move', 'quickie', 'enough', 'A']


#10. Stemming
Reducing words to their base form, such as converting “jumping” to “jump.”

To perform stemming on a list of tokens using NLTK, one can use the nltk.stem.PorterStemmer() function to create a stemmer object and the stem() method to stem each token.

In [40]:
import nltk

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# create stemmer object
stemmer = nltk.stem.PorterStemmer()

# stem each token
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['the', 'quick', 'brown', 'fox', 'is', 'jump', 'over', 'the', 'lazi', 'dog', ',', 'which', 'is', 'locat', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'becaus', 'it', 'wa', 'too', 'quick', 'and', 'the', 'lazi', 'dog', 'did', "n't", 'move', 'quickli', 'enough', '.']


#11. Lemmatization
A more complicated and accurate method of reducing words to their base form than stemming.

To perform lemmatization on a token list using NLTK, one can utilize the nltk.stem.WordNetLemmatizer() function to instantiate a lemmatizer object, followed by applying the lemmatize() method to each token for lemmatization.


The WordNet lemmatizer uses the WordNet database of English words to lemmatize the tokens, taking into account the part of speech and the context in which the word is used. One can specify the part of speech of the token using the pos argument of the lemmatize() method (e.g., "v" for verbs, etc, "n" for nouns, etc.).

In [30]:
import nltk
nltk.download('wordnet')


# input text
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."

# tokenize the text
tokens = nltk.word_tokenize(text)

# create lemmatizer object
lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize each token
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['Natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'deal', 'with', 'the', 'interaction', 'between', 'computer', 'and', 'human', '(', 'natural', ')', 'language', '.']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
import nltk

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# create stemmer object
stemmer = nltk.stem.PorterStemmer()

# stem each token
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['the', 'quick', 'brown', 'fox', 'is', 'jump', 'over', 'the', 'lazi', 'dog', ',', 'which', 'is', 'locat', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'becaus', 'it', 'wa', 'too', 'quick', 'and', 'the', 'lazi', 'dog', 'did', "n't", 'move', 'quickli', 'enough', '.']


#12. Part-of-speech tagging
Identifying the part of speech of each word in the text, such as noun, verb, or adjective.

To perform part of speech (POS) tagging on a list of tokens using NLTK, one can use the nltk.pos_tag() function to tag the tokens with their corresponding POS tags.

In [19]:
import nltk
nltk.download('averaged_perceptron_tagger')

# input text
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."

# tokenize the text
tokens = nltk.word_tokenize(text)

# tag the tokens with their POS tags
tagged_tokens = nltk.pos_tag(tokens)

print("Tagged tokens:", tagged_tokens)

Tagged tokens: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('that', 'IN'), ('deals', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('(', '('), ('natural', 'JJ'), (')', ')'), ('language', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [43]:
import nltk

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenize the text
tokens = nltk.word_tokenize(text)

# create stemmer object
stemmer = nltk.stem.PorterStemmer()

# stem each token
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['the', 'quick', 'brown', 'fox', 'is', 'jump', 'over', 'the', 'lazi', 'dog', ',', 'which', 'is', 'locat', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'becaus', 'it', 'wa', 'too', 'quick', 'and', 'the', 'lazi', 'dog', 'did', "n't", 'move', 'quickli', 'enough', '.']


#13. Named Entity Recognition
Extracting named entities from a text, like a person’s name.

See also  Link Prediction For Graph Neural Networks (GNN) Made Simple & 6 Powerful Tools
To perform named entity recognition (NER) on a list of tokens using NLTK, one can use the nltk.ne_chunk() function to identify and label named entities in the tokens.

In [46]:
import nltk
nltk.download('maxent_ne_chunker')

# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough. Kyal Smith works at Facebook in New York."

# tokenize the text
tokens = nltk.word_tokenize(text)

# tag the tokens with their part of speech
tagged_tokens = nltk.pos_tag(tokens)

# identify named entities
named_entities = nltk.ne_chunk(tagged_tokens)

print("Named entities:", named_entities)

Named entities: (S
  The/DT
  quick/JJ
  brown/NN
  fox/NN
  is/VBZ
  jumping/VBG
  over/IN
  the/DT
  lazy/JJ
  dog/NN
  ,/,
  which/WDT
  is/VBZ
  located/VBN
  at/IN
  http/NN
  :/:
  //example.com/NN
  ,/,
  but/CC
  it/PRP
  did/VBD
  n't/RB
  catch/VB
  the/DT
  fish/NN
  because/IN
  it/PRP
  was/VBD
  too/RB
  quick/JJ
  and/CC
  the/DT
  lazy/JJ
  dog/NN
  did/VBD
  n't/RB
  move/VB
  quickly/RB
  enough/RB
  ./.
  (PERSON Kyal/NNP Smith/NNP)
  works/VBZ
  at/IN
  (ORGANIZATION Facebook/NNP)
  in/IN
  (GPE New/NNP York/NNP)
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


#14. Normalization
Standardising words or phrases that have multiple possible forms or spellings (e.g. “American” and “US” could both be normalised to “United States”). This can be easily done with a list of synonyms or industry-specific terms.

#NLTK preprocessing example code for Sentiment Analysis:

In [2]:
import nltk

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

In [8]:
nltk.download('maxent_ne_chunker')
from nltk import ne_chunk

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


In [10]:
nltk.download('words')
from nltk.corpus import words

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [12]:
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [14]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [52]:
# input text
text = "The quick brown fox is jumping over the lazy dog, which is located at http://example.com, but it didn't catch the fish because it was too quick and the lazy dog didn't move quickly enough."

# tokenization
tokens = nltk.word_tokenize(text)
print("Tokens:", tokens)

# part-of-speech tagging
pos_tags = nltk.pos_tag(tokens)
print("POS tags:", pos_tags)

# named entity recognition
named_entities = nltk.ne_chunk(pos_tags)
print("Named entities:", named_entities)

# lemmatization
lemmatizer = nltk.WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmas:", lemmas)

# stopword removal
stopwords = nltk.corpus.stopwords.words("english")
filtered_tokens = [token for token in tokens if token not in stopwords]
print("Filtered tokens:", filtered_tokens)

# URL Removal --define a regular expression pattern to match URLs
pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# replace URLs with an empty string
cleaned_text = re.sub(pattern, "", text)

print("Text without URLs:", cleaned_text)

# text classification (example using a simple Naive Bayes classifier)
from nltk.classify import NaiveBayesClassifier

# training data (using a toy dataset for illustration purposes)
training_data = [("It was a great movie.", "pos"), ("I hated the book.", "neg"), ("The book was okay.", "pos")]

# extract features from the training data
def extract_features(text):
    features = {}
    for word in nltk.word_tokenize(text):
        features[word] = True
    return features

# create a list of feature sets and labels
feature_sets = [(extract_features(text), label) for (text, label) in training_data]

# train the classifier
classifier = NaiveBayesClassifier.train(feature_sets)



Tokens: ['The', 'quick', 'brown', 'fox', 'is', 'jumping', 'over', 'the', 'lazy', 'dog', ',', 'which', 'is', 'located', 'at', 'http', ':', '//example.com', ',', 'but', 'it', 'did', "n't", 'catch', 'the', 'fish', 'because', 'it', 'was', 'too', 'quick', 'and', 'the', 'lazy', 'dog', 'did', "n't", 'move', 'quickly', 'enough', '.']
POS tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), (',', ','), ('which', 'WDT'), ('is', 'VBZ'), ('located', 'VBN'), ('at', 'IN'), ('http', 'NN'), (':', ':'), ('//example.com', 'NN'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('catch', 'VB'), ('the', 'DT'), ('fish', 'NN'), ('because', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('too', 'RB'), ('quick', 'JJ'), ('and', 'CC'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('did', 'VBD'), ("n't", 'RB'), ('move', 'VB'), ('quickly', 'RB'), ('enough', 'RB'), ('.', '.')]
Named en

In [55]:
# test the classifier on a new example
test_text = "The movie was full of humors."
print("Sentiment:", classifier.classify(extract_features(test_text)))

Sentiment: pos


In [54]:
# test the classifier on a new example
test_text = "I don't like the movie."
print("Sentiment:", classifier.classify(extract_features(test_text)))

Sentiment: neg
