<a href="https://colab.research.google.com/github/RajarajachozhanVK/RajarajachozhanVK/blob/main/Text_Preprocessing_Techniques_with_NLTK_and_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Tokenization**

In [None]:
import nltk

In [None]:
import spacy

In [None]:
# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on.
# NOTE(review): recent NLTK releases (3.8.2 and later) look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested to keep the tokenization cells runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Blank English pipeline: created with spacy.blank, so no trained model is loaded.
nlp = spacy.blank("en")
sample_sentence = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (Natural) Language."
doc = nlp(sample_sentence)
# Show every token the pipeline produced, one labelled line each.
for token in doc:
    print("Tokens:", token)

Tokens: Natural
Tokens: language
Tokens: processing
Tokens: is
Tokens: a
Tokens: field
Tokens: of
Tokens: artificial
Tokens: intelligence
Tokens: that
Tokens: deals
Tokens: with
Tokens: the
Tokens: interaction
Tokens: between
Tokens: computers
Tokens: and
Tokens: human
Tokens: (
Tokens: Natural
Tokens: )
Tokens: Language
Tokens: .


In [None]:
# Load the small English pipeline and analyse one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# One row per token: surface text, part-of-speech tag, dependency label.
for token in doc:
    row = (token.text, token.pos_, token.dep_)
    print(*row)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [None]:
# Sample sentence to split into word-level tokens.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (Natural) Language."
# Tokenize with NLTK; punctuation marks come back as separate tokens.
tokens = nltk.word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'human', '(', 'Natural', ')', 'Language', '.']


**2. Lowercasing**

In [None]:
# Sample sentence containing mixed-case words.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (Natural) Language."
# Split into tokens first, then normalise the case of each token.
tokens = nltk.word_tokenize(text)
lowercased_tokens = list(map(str.lower, tokens))
print("Lowercased tokens:", lowercased_tokens)

Lowercased tokens: ['natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'language', '.']


**3. Remove punctuation**

In [None]:
import string

In [None]:
def remove_punctuations(text, tokenize=None):
    """Return ``text`` with every punctuation-only token removed.

    Parameters
    ----------
    text : str
        The text to clean.
    tokenize : callable, optional
        Function mapping a string to an iterable of token strings.
        Defaults to ``nltk.word_tokenize``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Tokens that contain at
        least one non-punctuation character (e.g. ``"U.K."``) are kept intact.
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize
    punctuation_chars = set(string.punctuation)
    # Check character by character: a substring test against
    # string.punctuation would let multi-character tokens such as "...",
    # "--" or the "``" / "''" quote tokens slip through. Empty tokens are
    # dropped as well, because all() of an empty sequence is True.
    words_without_punctuations = [
        word for word in tokenize(text)
        if not all(char in punctuation_chars for char in word)
    ]
    return " ".join(words_without_punctuations)

In [None]:
# Interactive check: read one line from the user and echo it back with
# punctuation tokens removed.
input_text = input("Enter a string: ")
text_without_punctuations = remove_punctuations(
    input_text
)
print(
    "String without punctuations:",
    text_without_punctuations,
)

Enter a string: Hello World!! @ Vel New3$
String without punctuations: Hello World Vel New3


**4. Remove stop words**

In [None]:
from nltk.corpus import stopwords
# Download the NLTK stop-word lists read below via stopwords.words('english').
nltk.download('stopwords')
def remove_stop_words(string, stop_words=None):
    """Return ``string`` with stop words removed.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list,
        read via ``stopwords.words('english')``.

    Returns
    -------
    str
        The surviving whitespace-separated words joined by single spaces.
        A word is dropped together with any punctuation attached to it.
    """
    # The ``string`` parameter shadows the standard-library module of the
    # same name inside this function, so import the constant directly.
    from string import punctuation

    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    # Compare each word without its leading/trailing punctuation so that
    # "from." is recognised as the stop word "from". Inner characters such
    # as the apostrophe in "don't" are left untouched.
    filtered_words = [
        word for word in string.split()
        if word.strip(punctuation).lower() not in stop_words
    ]
    new_string = ' '.join(filtered_words)
    return new_string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run the stop-word filter on a sample sentence and show it before and after.
input_string = "This is an example sentence to remove stop words from."
result = remove_stop_words(input_string)
for label, value in (("Original string:", input_string), ("Modified string:", result)):
    print(label, value)

Original string: This is an example sentence to remove stop words from.
Modified string: example sentence remove stop words from.


In [None]:
# Pipelines already loaded by _get_spacy_pipeline, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    """Load the named spaCy pipeline on first use and reuse it afterwards."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def remove_stop_words(string, nlp=None):
    """Return ``string`` with spaCy stop-word tokens removed.

    NOTE(review): this definition replaces the NLTK-based function of the
    same name defined earlier in the notebook; after this cell runs, every
    call uses the spaCy implementation.

    Parameters
    ----------
    string : str
        Text to filter.
    nlp : callable, optional
        A spaCy pipeline (or any callable returning tokens that expose
        ``text`` and ``is_stop``). Defaults to ``en_core_web_sm``, which is
        loaded once and cached instead of being reloaded on every call.

    Returns
    -------
    str
        The texts of the non-stop-word tokens joined by single spaces.
        Punctuation tokens are kept, so they end up space-separated.
    """
    if nlp is None:
        nlp = _get_spacy_pipeline()
    # Tokenize the string and keep only the tokens not flagged as stop words
    doc = nlp(string)
    filtered_words = [token.text for token in doc if not token.is_stop]
    # Join the filtered words back into a string
    return ' '.join(filtered_words)

In [None]:
# Apply the currently defined remove_stop_words to the sample sentence.
input_string = "This is an example sentence to remove stop words from."
result = remove_stop_words(input_string)
report = {"Original string:": input_string, "Modified string:": result}
for label, value in report.items():
    print(label, value)

Original string: This is an example sentence to remove stop words from.
Modified string: example sentence remove stop words .


**5. Remove extra whitespace**

In [None]:
# Sample text padded with leading, trailing and repeated inner spaces.
text = "  Natural   language processing   is   a field   of artificial intelligence   that deals with the interaction between computers and human   (natural)   language.   "
# Trim both ends, break the text on runs of whitespace, then rebuild it
# with exactly one space between consecutive words.
words_only = text.strip().split()
text = " ".join(words_only)
print("Cleaned text:", text)

Cleaned text: Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language.


**6. Remove URLs**

In [None]:
import re

In [None]:
# Sample text ending in a URL.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language. Check out this article for more information: https://www.veltech.edu.in/"
# URL pattern: an http/ftp/https scheme, a dotted host name, then an
# optional path/query part.
pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
url_regex = re.compile(pattern)
# Delete every match; whitespace around the URL is left as it was.
cleaned_text = url_regex.sub("", text)
print("Text without URLs:", cleaned_text)

Text without URLs: Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language. Check out this article for more information: 


**7. Remove HTML code**

In [None]:
# Sample text containing a pair of HTML tags.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language. <b>This is an example of bold text.</b>"
# A tag is "<", one or more characters other than ">", then ">".
pattern = r"<[^>]+>"
# Cut the text at every tag and glue the remaining pieces back together,
# which drops the tags while keeping the text between them.
pieces = re.split(pattern, text)
cleaned_text = "".join(pieces)
print("Text without HTML code:", cleaned_text)

Text without HTML code: Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language. This is an example of bold text.


**8. Remove frequent words**

In [None]:
# input text
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."
# tokenize the text
tokens = nltk.word_tokenize(text)
# calculate the frequency of each word, ignoring case so that "Natural"
# and "natural" are counted as the same word
fdist = nltk.FreqDist(token.lower() for token in tokens)
# Share of the distinct words (ranked by frequency) treated as "frequent".
FREQUENT_FRACTION = 0.1
# Pick the top 10% of distinct words by frequency (at least one word).
# Comparing each count against a fraction of the total token count, as this
# cell did before, never removed anything from a short text like this one.
num_frequent = max(1, int(len(fdist) * FREQUENT_FRACTION))
frequent_words = {word for word, _count in fdist.most_common(num_frequent)}
# remove the most common words, keeping the original casing of the rest
filtered_tokens = [token for token in tokens if token.lower() not in frequent_words]
print("Tokens without frequent words:", filtered_tokens)

Tokens without frequent words: ['Natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'language', '.']


**9. Spelling correction**

In [None]:
# Download the NLTK 'words' corpus (the English word list read via nltk.corpus.words).
nltk.download('words')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
text = "Natural langage processing is a field of artificial intelligece that deals with the interaction between computers and human (naturl) langage."
# tokenize the text
tokens = nltk.word_tokenize(text)
# get list of English words
words = nltk.corpus.words.words()

# Index the vocabulary once: a set for fast "already spelled correctly"
# checks, and length buckets so a token is only scored against words that
# could plausibly be its closest match. Each bucket keeps the word's position
# in ``words`` so ties are resolved in list order.
known_words = set(words)
words_by_length = {}
for position, word in enumerate(words):
    words_by_length.setdefault(len(word), []).append((position, word))
longest_word = max(words_by_length, default=0)


def correct_token(token):
    """Return the vocabulary word closest to ``token`` by edit distance.

    Tokens that are not purely alphabetic (punctuation, numbers, "U.K.") and
    tokens already in the vocabulary, as written or lower-cased, are returned
    unchanged. Otherwise the word with the smallest edit distance is
    returned; ties go to the word that appears first in ``words``.
    """
    if not token.isalpha() or token in known_words or token.lower() in known_words:
        return token
    best = None  # (distance, position in ``words``, word)
    gap = 0
    # A word whose length differs from the token's by ``gap`` is at least
    # ``gap`` edits away, so once ``gap`` exceeds the best distance found no
    # remaining bucket can match or beat it and the search can stop.
    while gap <= max(longest_word, len(token)) and (best is None or gap <= best[0]):
        for length in {len(token) - gap, len(token) + gap}:
            for position, word in words_by_length.get(length, ()):
                candidate = (nltk.edit_distance(word, token), position, word)
                if best is None or candidate < best:
                    best = candidate
        gap += 1
    return best[2] if best is not None else token


# correct spelling of each word, scoring every distinct token only once
corrections = {token: correct_token(token) for token in set(tokens)}
corrected_tokens = [corrections[token] for token in tokens]
print("Corrected tokens:", corrected_tokens)

KeyboardInterrupt: 

**10. Stemming**

In [None]:
# Sample sentence whose words will be reduced to their stems.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."
# Split the sentence into tokens.
tokens = nltk.word_tokenize(text)
# Porter stemmer instance applied to every token below.
stemmer = nltk.stem.PorterStemmer()
stemmed_tokens = list(stemmer.stem(word) for word in tokens)
print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['natur', 'languag', 'process', 'is', 'a', 'field', 'of', 'artifici', 'intellig', 'that', 'deal', 'with', 'the', 'interact', 'between', 'comput', 'and', 'human', '(', 'natur', ')', 'languag', '.']


**11. Lemmatization**

In [None]:
# Download the WordNet data used by nltk.stem.WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sample sentence whose words will be mapped to their lemmas.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."
# Split the sentence into tokens.
tokens = nltk.word_tokenize(text)
# WordNet-based lemmatizer; each token is passed without a part-of-speech hint.
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['Natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'deal', 'with', 'the', 'interaction', 'between', 'computer', 'and', 'human', '(', 'natural', ')', 'language', '.']


In [None]:
import spacy
# Load the small English pipeline and pull out its lemmatizer component.
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")
# Report which mode the lemmatizer runs in (the recorded run printed 'rule').
print(lemmatizer.mode)
doc = nlp("I was reading the paper.")
# Collect the lemma of every token in order.
lemmas = [token.lemma_ for token in doc]
print(lemmas)
# Recorded result: ['I', 'be', 'read', 'the', 'paper', '.']

rule
['I', 'be', 'read', 'the', 'paper', '.']


**12. Part-of-speech tagging**

In [None]:
# Fetch the averaged-perceptron model used by nltk.pos_tag.
# NOTE(review): recent NLTK releases (3.8.2 and later) look up the
# 'averaged_perceptron_tagger_eng' resource instead of the pickled
# 'averaged_perceptron_tagger' model, so both are requested to keep the
# tagging cells runnable on either version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import spacy

# --- NLTK: tag every token of the sample sentence ---
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language."
tokens = nltk.word_tokenize(text)
# pos_tag pairs each token with its part-of-speech tag.
tagged_tokens = nltk.pos_tag(tokens)
print("Tagged tokens:", tagged_tokens)

# --- spaCy: print several per-token annotations ---
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Token attributes printed for each token, in column order.
TOKEN_FIELDS = ("text", "lemma_", "pos_", "tag_", "dep_", "shape_", "is_alpha", "is_stop")
for token in doc:
    print(*(getattr(token, field) for field in TOKEN_FIELDS))

Tagged tokens: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('that', 'IN'), ('deals', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('(', '('), ('natural', 'JJ'), (')', ')'), ('language', 'NN'), ('.', '.')]
Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


**13. Named Entity Recognition**

In [None]:
# Fetch the named-entity chunker model used by nltk.ne_chunk.
# NOTE(review): recent NLTK releases (3.8.2 and later) look up the
# 'maxent_ne_chunker_tab' resource instead of the pickled
# 'maxent_ne_chunker' model, so both are requested to keep the NER cell
# runnable on either version.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [None]:
# Sample text ending with a sentence that mentions a person, a company and a place.
text = "Natural language processing is a field of artificial intelligence that deals with the interaction between computers and human (natural) language. John Smith works at Google in New York."
# Pipeline: tokens -> (token, POS tag) pairs -> tree with named-entity chunks.
tokens = nltk.word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)
named_entities = nltk.ne_chunk(
    tagged_tokens
)
print("Named entities:", named_entities)

Named entities: (S
  Natural/JJ
  language/NN
  processing/NN
  is/VBZ
  a/DT
  field/NN
  of/IN
  artificial/JJ
  intelligence/NN
  that/IN
  deals/NNS
  with/IN
  the/DT
  interaction/NN
  between/IN
  computers/NNS
  and/CC
  human/JJ
  (/(
  natural/JJ
  )/)
  language/NN
  ./.
  (PERSON John/NNP Smith/NNP)
  works/VBZ
  at/IN
  (ORGANIZATION Google/NNP)
  in/IN
  (GPE New/NNP York/NNP)
  ./.)


In [None]:
import spacy
# Run the small English pipeline over one sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# One row per detected entity: text, start/end character offsets, label.
for ent in doc.ents:
    entity_row = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_row)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
