In [1]:
import re
import nltk
import spacy
from bs4 import BeautifulSoup


In [2]:
# Download the required NLTK packages

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/suhas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/suhas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/suhas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sample text for preprocessing
text = "Hello <b>world!</b> Visit us at https://example.com. I'm soooo happy 😊!!! How r u? It's gr8."

## 1. Lowercasing

In [5]:
def lowercase_text(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to normalise.

    Returns:
        The result of calling ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

text = lowercase_text(text)
print(text)

hello <b>world!</b> visit us at https://example.com. i'm soooo happy 😊!!! how r u? it's gr8.


## 2. Remove HTML Tags

In [6]:
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return only its text content.

    Args:
        text: A string that may contain HTML tags.

    Returns:
        The text extracted by BeautifulSoup's ``html.parser`` backend.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

text = remove_html_tags(text)
print(text)

hello world! visit us at https://example.com. i'm soooo happy 😊!!! how r u? it's gr8.


## 3. Remove URLs

In [7]:
def remove_urls(text):
    """Remove URLs from ``text``.

    A URL is a run of non-whitespace characters that starts, at a word
    boundary, with ``http://``, ``https://`` or ``www.`` (case-insensitive).
    Requiring the ``://`` / ``.`` and the word boundary keeps ordinary words
    that merely contain those letters (e.g. "awww", "httpd") intact.

    Punctuation glued to the end of a URL (such as a sentence-final period)
    is removed together with the URL, because it is part of the same
    non-whitespace run.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL replaced by the empty string. The
        whitespace around a removed URL is left untouched.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

text = remove_urls(text)
print(text)

# Alternatively, use the `preprocessor` package's clean() helper (left commented out below)

# import preprocessor as p

# text = "Hello <b>world!</b> Visit us at https://example.com. I'm soooo happy 😊!!! How r u? It's gr8."
# clean_text = p.clean(text)  # Removes URLs, mentions, hashtags, emojis, and special characters
# print(clean_text)


hello world! visit us at  i'm soooo happy 😊!!! how r u? it's gr8.


## 4. Remove Punctuation

In [8]:
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Besides ASCII punctuation this also drops other symbols such as emojis;
    underscores are kept because ``\\w`` matches them.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all non-word, non-whitespace characters removed.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

text = remove_punctuation(text)
print(text)

hello world visit us at  im soooo happy  how r u its gr8


## 5. Chat Word Treatment

In [9]:
# Lookup table mapping chat abbreviations to their expanded form.
chat_words_dict = {
    'u': 'you',
    'r': 'are',
    'gr8': 'great',
    'hw': 'how',
    'hpy': 'happy',
}


def treat_chat_word(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; each token that is an exact key of
    ``chat_words_dict`` is replaced by its value, all other tokens are kept
    unchanged. Matching is case-sensitive and whole-token only.

    Args:
        text: The string to process.

    Returns:
        The processed tokens joined by single spaces (runs of whitespace in
        the input therefore collapse to one space).
    """
    expanded = []
    for token in text.split():
        expanded.append(chat_words_dict.get(token, token))
    return " ".join(expanded)

text = treat_chat_word(text)
print(text)

hello world visit us at im soooo happy how are you its great


## 6. Spelling Correction (Simple Example)

In [15]:
textsp = "hello world! visit us at  i'm soooo hapy 😊!!! how r u? it's gr8."

def correct_spelling(text):
    """Fix a small, hard-coded set of misspellings in ``text``.

    The text is split on whitespace and each token that exactly matches a
    known misspelling is swapped for its correction. Tokens with attached
    punctuation (e.g. ``"hapy!"``) do not match and are left as they are.

    Args:
        text: The string to correct.

    Returns:
        The tokens, corrected where applicable, joined by single spaces.
    """
    corrections = {'hapy': 'happy', 'soooo': 'so'}
    fixed_words = map(lambda word: corrections.get(word, word), text.split())
    return " ".join(fixed_words)

text = correct_spelling(textsp)
print(text)

hello world! visit us at i'm so happy 😊!!! how r u? it's gr8.


## 7. Removing Stop Words

In [19]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is tokenised with NLTK's ``word_tokenize``; a token is discarded
    when its lowercase form appears in NLTK's English stop-word list.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces. Punctuation tokens
        produced by the tokenizer are kept as separate, space-delimited items.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in english_stops:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

text = remove_stop_words(text)
print(text)

hello world ! visit us 'm happy 😊 ! ! ! r u ? 's gr8 .


[nltk_data] Downloading package stopwords to /home/suhas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 8. Handle Emojis

In [20]:
def handle_emojis(text):
    """Replace known emojis in ``text`` with bracketed textual labels.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every occurrence of each emoji in the internal mapping
        replaced by its label; all other characters are left unchanged.
    """
    emoji_dict = {
        "😊": "[smiling_face]",
        "😢": "[sad_face]"
    }
    for emoji, meaning in emoji_dict.items():
        text = text.replace(emoji, meaning)
    # Return only after the loop so that *every* mapping entry is applied,
    # not just the first one.
    return text
    
text = handle_emojis(text)
print(text)    

hello world ! visit us 'm happy [smiling_face] ! ! ! r u ? 's gr8 .


## 9. Tokenization

In [24]:
from nltk.tokenize import word_tokenize

text = "He enjoys programming, playing, and running."

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenise.

    Returns:
        The token sequence produced by ``word_tokenize``; punctuation marks
        come back as separate tokens.
    """
    tokens = word_tokenize(text)
    return tokens

text = tokenize_text(text)
print(text)

['He', 'enjoys', 'programming', ',', 'playing', ',', 'and', 'running', '.']


## 10. Stemming

In [23]:
from nltk.stem import PorterStemmer

text = "he enjoys programming,playing and running"

def stemming_info(text):
    """Reduce every token of ``text`` to its Porter stem.

    Args:
        text: The string to stem.

    Returns:
        The stems of the tokens produced by ``word_tokenize``, joined by
        single spaces.
    """
    porter = PorterStemmer()
    stems = map(porter.stem, word_tokenize(text))
    return " ".join(stems)

text = stemming_info(text)
print(text)


he enjoy program , play and run


## 11. Lemmatization

In [26]:
from nltk.stem import WordNetLemmatizer

text = "he enjoys programming,playing and running"

# load spacy model for lemmatization
nlp = spacy.load('en_core_web_sm')

def lemmatization_info(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to lemmatize.

    Returns:
        The ``lemma_`` of every token in the parsed document, joined by
        single spaces.
    """
    lemmas = [tok.lemma_ for tok in nlp(text)]
    return " ".join(lemmas)

text = lemmatization_info(text)
print(text)

he enjoy programming , play and running


## Full Process in One Go

In [None]:
import spacy
from cleantext import clean
import preprocessor as p
from textblob import TextBlob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Sample text for preprocessing
text = "Hetllo <b>world!</b> Vdisit us at https://example.com. I'm soooo happy 😊!!! How r u? It's gr8."

def remove_html_tags(text):
    """Return the plain-text content of ``text`` with HTML tags removed.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The text extracted by BeautifulSoup using the ``html.parser`` backend.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    plain_text = parsed.get_text()
    return plain_text

# End-to-end demo: run the sample `text` through every cleaning stage in
# sequence, keeping each intermediate result in its own variable so it can be
# printed at the end.

# Step 0: Strip HTML markup first so later stages only see plain text.
html_text = remove_html_tags(text)
print("Text after removing HTML tags:")
print(html_text)

# Step 1: Clean the text with preprocessor (removes URLs, mentions, hashtags, emojis, and special characters)
# NOTE(review): the exact set of items removed depends on the preprocessor
# package's default options - confirm against its documentation.
clean_text = p.clean(html_text)

# Step 2: Remove punctuation
# NOTE(review): cleantext's clean() is called with only no_punct=True; any
# other normalisation it applies by default also takes effect here - confirm.
clean_text = clean(clean_text, no_punct=True)

# Step 3: Spell check (TextBlob.correct() returns a new blob; .string extracts its text)
textblb = TextBlob(clean_text)
clean_text = textblb.correct().string

# Step 4: Use spaCy to remove stop words
doc = nlp(clean_text)
cleaned_words = [token.text for token in doc if not token.is_stop]

# Join the cleaned words back into a string
cleaned_text = " ".join(cleaned_words)

# Step 5: Tokenization (NLTK tokenizer applied to the stop-word-free text)
tokenized_text = word_tokenize(cleaned_text)

# Step 6: Stemming (computed from the tokens of step 5)
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokenized_text]

# Join the stemmed words back into a string
stemmed_text = " ".join(stemmed_words)

# Step 7: Lemmatization
# Also computed from the step-5 tokens, not from the stems, so steps 6 and 7
# are two independent views of the same token list. The spaCy pipeline is
# invoked once per token here, so each word is processed in isolation and only
# the first token of each tiny document is used.
lemmatized_words = [nlp(word)[0].lemma_ for word in tokenized_text]

# Join the lemmatized words back into a string
lemmatized_text = " ".join(lemmatized_words)

# Report every intermediate result.
print("\nCleaned text with stop words removed:")
print(cleaned_text)

print("\nTokenized text:")
print(tokenized_text)

print("\nStemmed text:")
print(stemmed_text)

print("\nLemmatized text:")
print(lemmatized_text)
