## Lowercasing

In [1]:
text_lowercasing = "Text Preprocessing: CONVERTING ALL TEXT TO LOWERCASE."
lowercased_text = text_lowercasing.lower()
print("Lowercasing:", lowercased_text)

Lowercasing: text preprocessing: converting all text to lowercase.


## Removing Punctuations

In [2]:
import re

text_punctuation = "Text preprocessing involves removing punctuation, which can be noisy!!!"

# Removing Punctuation
no_punctuation_text = re.sub(r'[^\w\s]', '', text_punctuation) #^\w matches any non-word character. ^\s matches any non-whitespace character.
print("Removing Punctuation:", no_punctuation_text)

Removing Punctuation: Text preprocessing involves removing punctuation which can be noisy


## Stop Words

In [7]:
import nltk
import re
from nltk.corpus import stopwords

# nltk.download('punkt')
# nltk.download('stopwords')

text = "Text preprocessing is an important step in Natural Language Processing!"
stop_words = set(stopwords.words('english'))
filtered_text = [word for word in nltk.word_tokenize(text) if word.lower() not in stop_words]
print("Removing Stop Words:", filtered_text)


Removing Stop Words: ['Text', 'preprocessing', 'important', 'step', 'Natural', 'Language', 'Processing', '!']


## Special Characters

In [9]:
import re

text_special_characters = "Text preprocessing removes special characters, such as @#$%^&*(), and numbers 123."

no_special_characters_text = re.sub('[^A-Za-z ]+', '', text_special_characters)
print("Removing Special Characters and Numbers:", no_special_characters_text)


Removing Special Characters and Numbers: Text preprocessing removes special characters such as  and numbers 


## Stemming

In [11]:
from nltk.stem import PorterStemmer

# Sample text for demonstration
text_stemming = "Text preprocessing involves stemmer, stemming, stemmed words."

# Stemming
stemmer = PorterStemmer()
stemmed_text = ' '.join([stemmer.stem(word) for word in nltk.word_tokenize(text_stemming)])
print("Stemming:", stemmed_text)


Stemming: text preprocess involv stemmer , stem , stem word .


## Lemmatization

In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download NLTK resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# Function to get WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'  # Adjective
    elif tag.startswith('V'):
        return 'v'  # Verb
    elif tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return 'n'  # Noun (default)

# Sample text for demonstration
text = "Lemmatization is a more advanced text preprocessing technique than stemming."

# Tokenize the text
tokens = word_tokenize(text)

# Perform part-of-speech tagging
pos_tags = pos_tag(tokens)

# Lemmatization with part-of-speech tags
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]

print("Lemmatization:", ' '.join(lemmatized_words))


Lemmatization: Lemmatization be a more advanced text preprocessing technique than stem .


## URL remover

In [18]:
import re

html_text = "<p>This is an example <b>HTML</b> text with <a href='http://example.com'>URL</a>.</p>"

# Removing HTML Tags
no_html_text = re.sub('<[^<]+?>', '', html_text)
print("Removing HTML Tags:", no_html_text)


Removing HTML Tags: This is an example HTML text with URL.


## Handling Abbreviations

In [19]:
text_abbreviations = "Dr. Smith and Mr. Johnson are attending the conference."

# Handling Abbreviations
abbreviation_mapping = {"Dr.": "Doctor", "Mr.": "Mister"}
for abbreviation, expanded in abbreviation_mapping.items():
    text_abbreviations = text_abbreviations.replace(abbreviation, expanded)
print("Handling Abbreviations:", text_abbreviations)


Handling Abbreviations: Doctor Smith and Mister Johnson are attending the conference.


## Spell Checking

In [23]:
!pip install pyspellchecker
from spellchecker import SpellChecker

text_spell_checking = "This is an exampel sentence with misspellled words."

# Tokenize the text
tokens = text_spell_checking.split()

# Spell Checking
spell = SpellChecker()
corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in tokens]
corrected_text = ' '.join(corrected_words)
print("Spell Checking:", corrected_text)

Spell Checking: This is an example sentence with misspelled words


## Removing Rare Words


In [27]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Sample text for demonstration
text_rare_words = "This is a sample text with some repeated and rare words. Sample text."

# Tokenize the text
tokens = word_tokenize(text_rare_words)

# Removing Rare Words
word_frequency = Counter(tokens)
rare_words = [word for word, count in word_frequency.items() if count == 1]

print("Rare Words:", rare_words)


Rare Words: ['This', 'is', 'a', 'sample', 'with', 'some', 'repeated', 'and', 'rare', 'words', 'Sample']
