In [5]:
!pip install nltk spacy emoji
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
import nltk
import spacy
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# A. Tokenization

In [7]:
text = "Natural Language Processing is amazing! It helps computers understand human language."

# NLTK Tokenizer
nltk_tokens = word_tokenize(text)

# spaCy Tokenizer
doc = nlp(text)
spacy_tokens = [token.text for token in doc]

print("NLTK Tokens:")
print(nltk_tokens)

print("\nspaCy Tokens:")
print(spacy_tokens)


NLTK Tokens:
['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'helps', 'computers', 'understand', 'human', 'language', '.']

spaCy Tokens:
['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'helps', 'computers', 'understand', 'human', 'language', '.']


In [None]:
# B. Stopwords Removal

In [8]:
stop_words = set(stopwords.words("english"))

text = "This is a simple example showing how stopwords are removed from a sentence."

tokens = word_tokenize(text)

filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print("Before Removing Stopwords:")
print(tokens)

print("\nAfter Removing Stopwords:")
print(filtered_tokens)


Before Removing Stopwords:
['This', 'is', 'a', 'simple', 'example', 'showing', 'how', 'stopwords', 'are', 'removed', 'from', 'a', 'sentence', '.']

After Removing Stopwords:
['simple', 'example', 'showing', 'stopwords', 'removed', 'sentence', '.']


In [None]:
# C. Lemmatization & Stemming

In [14]:
text = "studies studying studied easily flying flies"

tokens = word_tokenize(text)

# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

print("Original Words:", tokens)
print("After Stemming:", stemmed)
print("After Lemmatization:", lemmatized)


Original Words: ['studies', 'studying', 'studied', 'easily', 'flying', 'flies']
After Stemming: ['studi', 'studi', 'studi', 'easili', 'fli', 'fli']
After Lemmatization: ['study', 'studying', 'studied', 'easily', 'flying', 'fly']


In [None]:
# D. Handling Punctuation, Special Characters, Emojis

In [10]:
text = "Hello!!! This is awesome ðŸ˜Š #NLP @user123 Let's learn!!!"

# Remove emojis
text_no_emoji = emoji.replace_emoji(text, replace='')

# Remove hashtags, mentions, punctuation
clean_text = re.sub(r'[@#]\w+', '', text_no_emoji)
clean_text = re.sub(r'[^\w\s]', '', clean_text)

print("Original Text:")
print(text)

print("\nCleaned Text:")
print(clean_text)


Original Text:
Hello!!! This is awesome ðŸ˜Š #NLP @user123 Let's learn!!!

Cleaned Text:
Hello This is awesome    Lets learn


In [None]:
# E. Lowercasing & Normalization

In [11]:
text = "  This   Is    A   MIXED Case   Sentence.   "

# Lowercase
text_lower = text.lower()

# Normalize spaces
normalized = re.sub(r'\s+', ' ', text_lower).strip()

print("Original:", text)
print("Normalized:", normalized)


Original:   This   Is    A   MIXED Case   Sentence.   
Normalized: this is a mixed case sentence.


In [None]:
# F. Regex for Text Cleaning

In [12]:
text = "Contact us at support@gmail.com or admin123@yahoo.com. Call 123456 today!"

# Extract emails
emails = re.findall(r'\S+@\S+', text)

# Remove numbers
no_numbers = re.sub(r'\d+', '', text)

# Replace multiple spaces with one
clean_spaces = re.sub(r'\s+', ' ', no_numbers)

print("Extracted Emails:", emails)
print("\nText Without Numbers:", no_numbers)
print("\nText With Clean Spaces:", clean_spaces)


Extracted Emails: ['support@gmail.com', 'admin123@yahoo.com.']

Text Without Numbers: Contact us at support@gmail.com or admin@yahoo.com. Call  today!

Text With Clean Spaces: Contact us at support@gmail.com or admin@yahoo.com. Call today!
