# **03. Universal Text Preprocessing Pipeline**
---


### 🧑‍💼 **Shuvendu Pritam Das**  
*Data Science / ML Enthusiast*  

- **GitHub:** [SPritamDas](https://github.com/SPritamDas/My-Profile)  
- **LinkedIn:** [Shuvendu Pritam Das](https://www.linkedin.com/in/shuvendupritamdas/)  
- **Email:** shuvendupritamdas181@gmail.com  
---

In [1]:
!pip install emoji TextBlob nltk spacy

import pandas as pd
import numpy as np
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob
import emoji

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Ensure NLTK stopwords are downloaded
import nltk
nltk.download('stopwords')

# Initialize the stemmer
stemmer = PorterStemmer()

# Step 1: Lowercasing
def lower_case(text):
    return text.lower()

# Step 2: Remove HTML tags
def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# Step 3: Remove URLs
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

# Step 4: Remove punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Step 5: Remove extra whitespace
def remove_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

# Step 6: Correcting misspellings
def correct_spelling(text):
    return str(TextBlob(text).correct())

# Step 7: Handle emojis
def replace_emojis(text):
    return emoji.demojize(text)

# Step 8: Replace shorthand and abbreviations
def replace_shorthand(text):
    shorthand_dict = {
        "btw": "by the way",
        "lmao": "laughing my ass off",
        "omg": "oh my god",
        "brb": "be right back",
        "gr8": "great",
        "4": "for",
        "teh": "the",
        "u": "you",
        "r": "are",
        "c": "see",
        "b": "be"
    }
    for shorthand, full_form in shorthand_dict.items():
        text = re.sub(r'\b' + shorthand + r'\b', full_form, text)
    return text

# Step 9: Tokenization using spaCy
def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Step 10: POS tagging and Parsing
def pos_and_parse(text):
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    parse_tree = [(token.text, token.dep_, token.head.text) for token in doc]
    return pos_tags, parse_tree
author='SPritamDas'
# Full Preprocessing Pipeline
def preprocess(text, use_lowercase=True, use_html_removal=True, use_url_removal=True,
              use_punctuation_removal=True, use_whitespace_removal=True,
              use_spelling_correction=True, use_emoji_handling=True,
              use_shorthand_replacement=True, use_stop_words=False,
              use_lemmatization=False, use_stemming=False):

    # Step 1: Lowercasing
    if use_lowercase:
        text = lower_case(text)

    # Step 2: Remove HTML tags
    if use_html_removal:
        text = remove_html(text)

    # Step 3: Remove URLs
    if use_url_removal:
        text = remove_urls(text)

    # Step 4: Remove punctuation
    if use_punctuation_removal:
        text = remove_punctuation(text)

    # Step 5: Remove extra whitespace
    if use_whitespace_removal:
        text = remove_whitespace(text)

    # Step 6: Correcting misspellings
    if use_spelling_correction:
        text = correct_spelling(text)

    # Step 7: Handle emojis
    if use_emoji_handling:
        text = replace_emojis(text)

    # Step 8: Replace shorthand and abbreviations
    if use_shorthand_replacement:
        text = replace_shorthand(text)

    # Tokenization using spaCy
    tokens = tokenize(text)

    # Use NLTK for stop words
    if use_stop_words:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

    # Apply lemmatization if needed
    if use_lemmatization:
        tokens = [nlp(token)[0].lemma_ for token in tokens]  # Lemmatize each token

    # Apply stemming if needed
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]  # Stem each token

    # Get POS tags and parse tree
    pos_tags, parse_tree = pos_and_parse(text)

    return tokens, pos_tags, parse_tree

# Preprocess function for DataFrame input
def preprocess_dataframe(df, column_name, **kwargs):
    df['tokens'], df['pos_tags'], df['parse_tree'] = zip(
        *df[column_name].apply(lambda x: preprocess(x, **kwargs))
    )
    return df



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.DataFrame({
    'text': [
        "I love programming! 😊 Visit https://example.com for more info.",
        "Python is gr8 for NLP! #Python #NLP <html></html>",
        "I hate bugs. 😡 I can’t spell correctly, like teh or bugss.",
        "Btw, can't wait 4 the concert! BRB 😄",
        "OMG! That's so cool! LMAO 😆",
    ]
})

# Preprocess the DataFrame
processed_df = preprocess_dataframe(
    df,
    column_name='text',
    use_lowercase=True,
    use_html_removal=True,
    use_url_removal=True,
    use_punctuation_removal=True,
    use_whitespace_removal=True,
    use_spelling_correction=True,
    use_emoji_handling=True,
    use_shorthand_replacement=True,
    use_stop_words=True,
    use_lemmatization=True,
    use_stemming=True
)

# Display the results
processed_df[['text', 'tokens', 'pos_tags', 'parse_tree']]


Unnamed: 0,text,tokens,pos_tags,parse_tree
0,I love programming! 😊 Visit https://example.co...,"[love, program, visit]","[(i, PRON), (love, VERB), (programming, NOUN),...","[(i, nsubj, love), (love, ROOT, love), (progra..."
1,Python is gr8 for NLP! #Python #NLP <html></html>,"[patron, grm, nap, patron, nap]","[(patron, NOUN), (is, AUX), (grm, NOUN), (for,...","[(patron, nsubj, is), (is, ROOT, is), (grm, at..."
2,"I hate bugs. 😡 I can’t spell correctly, like t...","[hate, bag, spell, correctli, like, bursa]","[(i, PRON), (hate, VERB), (bags, NOUN), (i, PR...","[(i, nsubj, hate), (hate, ROOT, hate), (bags, ..."
3,"Btw, can't wait 4 the concert! BRB 😄","[bow, wait, concert, erb]","[(bow, NOUN), (can, AUX), (wait, VERB), (for, ...","[(bow, nsubj, wait), (can, aux, wait), (wait, ..."
4,OMG! That's so cool! LMAO 😆,"[org, cool, may]","[(org, NOUN), (that, PRON), (so, ADV), (cool, ...","[(org, ROOT, org), (that, meta, org), (so, adv..."
