# Basic NLP Text Preprocessing

## Sample Text

In [25]:
text = "Hello World!!! This is GREAT for learning NLP preprocessing... I'm really helpfull!"
print(text)

Hello World!!! This is GREAT for learning NLP preprocessing... I'm really helpfull!


In [26]:
text

"Hello World!!! This is GREAT for learning NLP preprocessing... I'm really helpfull!"

## 1. Basic Python String Operations

### Lowercasing

In [27]:
lowered = text.lower()
lowered

"hello world!!! this is great for learning nlp preprocessing... i'm really helpfull!"

### Strip Whitespace

In [28]:
messy_text = "  hello  world  "
cleaned = messy_text.strip()
cleaned

'hello  world'

### Split into Words

In [29]:
words = lowered.split()
words

['hello',
 'world!!!',
 'this',
 'is',
 'great',
 'for',
 'learning',
 'nlp',
 'preprocessing...',
 "i'm",
 'really',
 'helpfull!']

### Remove Punctuation

In [30]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
no_punt_list = []
for character in lowered:
    if character not in string.punctuation:
        no_punt_list.append(character)

no_punt = "".join(no_punt_list)
no_punt

'hello world this is great for learning nlp preprocessing im really helpfull'

## 2. Regular Expressions (Regex)

In [32]:
import re

### Remove Multiple Spaces
Replaces multiple consecutive spaces with single space. Text from web scraping often has inconsistent spacing that breaks tokenization.

In [33]:
messy_spacing = "hello    world   test"
fixed_spacing = re.sub(r'\s+', ' ', messy_spacing)
fixed_spacing

'hello world test'

### Remove Numbers
Removes all digits from text. Numbers often add noise unless they're part of meaningful entities like "COVID-19".

In [34]:
text_with_numbers = "I have 5 apples and 10 oranges"
no_numbers = re.sub(r'\d+', '', text_with_numbers)
no_numbers

'I have  apples and  oranges'

### Remove URLs and Emails
Removes web addresses and email addresses since these are usually not relevant for text analysis and create unique tokens.

In [35]:
web_text = "Visit https://example.com or email me at test@email.com"
no_urls = re.sub(r'http\S+|www\S+', '', web_text)
no_emails = re.sub(r'\S+@\S+', '', no_urls)
no_emails

'Visit  or email me at '

### Remove Special Characters
Keeps only alphabetic characters and spaces. Emojis, symbols, and special characters don't contribute to semantic meaning.

In [36]:
special_text = "Hello! 🚀 This has #hashtags and @mentions"
only_letters = re.sub(r'[^a-zA-Z\s]', '', special_text)
only_letters

'Hello  This has hashtags and mentions'

## 3. NLTK Operations

In [37]:
!pip install nltk



In [38]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/sudarshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Better Tokenization
NLTK intelligently splits text into tokens, handling contractions and punctuation better than simple split(). For example, "don't" becomes ["do", "n't"] instead of staying as one token.

In [39]:
from nltk.tokenize import word_tokenize

complex_text = "Don't split contractions badly! It's important."
basic_split = complex_text.split()
nltk_tokens = word_tokenize(complex_text.lower())

print(f"Basic split: {basic_split}")
print(f"NLTK tokens: {nltk_tokens}")

Basic split: ["Don't", 'split', 'contractions', 'badly!', "It's", 'important.']
NLTK tokens: ['do', "n't", 'split', 'contractions', 'badly', '!', 'it', "'s", 'important', '.']


### Remove Stop Words
Removes common function words like "the", "and", "is" that appear frequently but carry little meaning. These words don't help distinguish between different documents.

In [40]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text.lower())
filtered_tokens = [word for word in tokens if word not in stop_words]

print(f"Original tokens: {tokens}")
print(f"Without stop words: {filtered_tokens}")

Original tokens: ['hello', 'world', '!', '!', '!', 'this', 'is', 'great', 'for', 'learning', 'nlp', 'preprocessing', '...', 'i', "'m", 'really', 'helpfull', '!']
Without stop words: ['hello', 'world', '!', '!', '!', 'great', 'learning', 'nlp', 'preprocessing', '...', "'m", 'really', 'helpfull', '!']


### Stemming
Stemming reduces words to their root form by removing suffixes. It's a crude but fast way to group related words together. The Porter Stemmer uses a set of rules to chop off word endings. Sometimes it creates non-words like "studi" from "studies" or "happi" from "happiness".

In [41]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words_to_stem = ['running', 'runs', 'easily', 'studies', 'happiness']

print("Stemming examples:")
for word in words_to_stem:
    print(f"{word} → {stemmer.stem(word)}")

Stemming examples:
running → run
runs → run
easily → easili
studies → studi
happiness → happi


### Lemmatization
Lemmatization is more sophisticated than stemming. It reduces words to their dictionary base form (lemma) using vocabulary and morphological analysis. It always produces valid words and understands that "better" is the comparative form of "good", while "running" becomes "run".

In [42]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print("Lemmatization examples:")
for word in words_to_stem:
    print(f"{word} → {lemmatizer.lemmatize(word)}")

# Compare stemming vs lemmatization
print("\nStemming vs Lemmatization:")
comparison_words = ['better', 'running', 'studies', 'geese', 'feet']
for word in comparison_words:
    stem = stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word)
    print(f"{word} → Stem: {stem}, Lemma: {lemma}")

Lemmatization examples:
running → running
runs → run
easily → easily
studies → study
happiness → happiness

Stemming vs Lemmatization:
better → Stem: better, Lemma: better
running → Stem: run, Lemma: running
studies → Stem: studi, Lemma: study
geese → Stem: gees, Lemma: goose
feet → Stem: feet, Lemma: foot


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 4. Spelling Correction

TextBlob uses statistical models to detect and correct spelling mistakes. It's useful for social media text or OCR output, but be careful as it can sometimes change correct words unintentionally.

In [43]:
!pip install textblob



In [44]:
from textblob import TextBlob

misspelled = "I'm really helpfull with lerning"
corrected = str(TextBlob(misspelled).correct())
print(f"Original: {misspelled}")
print(f"Corrected: {corrected}")

Original: I'm really helpfull with lerning
Corrected: I'm really helpful with leaning


## 5. Handle Contractions

Expanding contractions normalizes different forms of the same words. "Don't" and "do not" should be treated the same way. This is especially important for sentiment analysis and other NLP tasks.

In [45]:
contractions = {
    "don't": "do not",
    "won't": "will not", 
    "can't": "cannot",
    "n't": " not",
    "'re": " are",
    "'ve": " have",
    "'ll": " will",
    "'m": " am"
}

def expand_contractions(text):
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text

contract_text = "I don't think you're ready, but we'll see"
expanded = expand_contractions(contract_text)
print(f"Original: {contract_text}")
print(f"Expanded: {expanded}")

Original: I don't think you're ready, but we'll see
Expanded: I do not think you are ready, but we will see


## 6. Direct Word Replacement

Sometimes you need to standardize domain-specific terms, expand abbreviations, or replace slang with formal equivalents. This is especially useful for social media text or technical documents.

In [46]:
replacements = {
    'u': 'you',
    'ur': 'your', 
    'omg': 'oh my god',
    'lol': 'laugh out loud',
    'AI': 'artificial intelligence',
    'ML': 'machine learning'
}

def replace_words(text):
    words = text.split()
    replaced_words = [replacements.get(word, word) for word in words]
    return ' '.join(replaced_words)

slang_text = "omg ur AI model is lol good"
replaced = replace_words(slang_text)
print(f"Original: {slang_text}")
print(f"Replaced: {replaced}")

Original: omg ur AI model is lol good
Replaced: oh my god your artificial intelligence model is laugh out loud good


## Complete Pipeline Function

In [47]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download required data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Setup preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Complete text preprocessing pipeline"""
    text = text.lower().strip()
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /home/sudarshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
original = "      This is very &&!Good Text. Visit www.testdoc.com for more info or mail us at a@gmail.com! or 981111111 "
cleaned = clean_text(original)

cleaned

'good text visit wwwtestdoccom info mail us agmailcom 981111111'