**NLP-PR-2**

2] To perform various preprocessing tasks in NLP:
Perform basic preprocessing tasks such as tokenization, stemming, lemmatization, and stop-word
removal, using built-in library functions as well as regular expressions.

In [8]:
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer


In [10]:
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer,
# the stopword lists and the WordNet corpus.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# 'punkt_tab' is additional data needed for tokenization. It is kept as the
# cell's final expression so that its return value is displayed.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Example sentence used as input by both the NLTK-based and the regex-based steps
text = (
    "Natural Language Processing (NLP) is evolving rapidly, "
    "enabling machines to understand human language."
)


In [4]:
# ============================
# 1. Using Inbuilt Libraries
# ============================
# Section banner for the library-based steps that follow: tokenization,
# stopword removal, stemming and lemmatization.
# NOTE(review): the banner names spaCy, but the cells in this section only call
# NLTK helpers — confirm whether spaCy steps were meant to be included.

print("\n🔹 Preprocessing Using NLTK & spaCy")



🔹 Preprocessing Using NLTK & spaCy


In [11]:
# Tokenization: split the sample sentence into tokens with NLTK's word
# tokenizer. As the recorded output shows, punctuation marks become tokens too.
tokens = nltk.word_tokenize(text)
print(f"\nTokens (NLTK): {tokens}")



Tokens (NLTK): ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'evolving', 'rapidly', ',', 'enabling', 'machines', 'to', 'understand', 'human', 'language', '.']


In [13]:
# Stopword removal: keep only tokens whose lowercase form is absent from NLTK's
# English stopword list. The comparison is case-insensitive, but kept tokens
# retain their original casing; punctuation tokens are not stopwords and stay.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print(f"\nFiltered Tokens (No Stopwords): {filtered_tokens}")



Filtered Tokens (No Stopwords): ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'evolving', 'rapidly', ',', 'enabling', 'machines', 'understand', 'human', 'language', '.']


In [14]:
# Stemming: map every stopword-filtered token to its Porter stem.
# The recorded output shows the stems come back lowercased ('Natural' -> 'natur').
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print(f"\nStemmed Tokens: {stemmed}")



Stemmed Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'evolv', 'rapidli', ',', 'enabl', 'machin', 'understand', 'human', 'languag', '.']


In [15]:
# Lemmatization: lowercase each stopword-filtered token, then look up its
# WordNet lemma.
# NOTE(review): no part of speech is passed, so the lemmatizer's default applies
# (noun, per the NLTK docs). In the recorded output only 'machines' changes, and
# '-ing' forms such as 'evolving' are left as they are.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(lowered) for lowered in map(str.lower, filtered_tokens)]
print(f"\nLemmatized Tokens: {lemmatized}")



Lemmatized Tokens: ['natural', 'language', 'processing', '(', 'nlp', ')', 'evolving', 'rapidly', ',', 'enabling', 'machine', 'understand', 'human', 'language', '.']


In [16]:
# ============================
# 2. Using Regular Expressions
# ============================
# Section banner for the regex-based steps that follow, which use the `re`
# module and a hand-written stopword set.

print("\n🔹 Preprocessing Using Regular Expressions")



🔹 Preprocessing Using Regular Expressions


In [17]:
# Normalise case first, then delete every character that is neither a word
# character nor whitespace (brackets, commas, periods and similar punctuation).
lowered_text = text.lower()
text_clean = re.sub(r'[^\w\s]', '', lowered_text)
print(f"\nCleaned Text: {text_clean}")



Cleaned Text: natural language processing nlp is evolving rapidly enabling machines to understand human language


In [18]:
# Tokenization (regex-based): collect every run of word characters that is
# delimited by word boundaries in the cleaned text.
regex_tokens = [match.group() for match in re.finditer(r'\b\w+\b', text_clean)]
print(f"\nTokens (Regex): {regex_tokens}")



Tokens (Regex): ['natural', 'language', 'processing', 'nlp', 'is', 'evolving', 'rapidly', 'enabling', 'machines', 'to', 'understand', 'human', 'language']


In [19]:
# Manual stopword removal: filter the regex tokens against a small
# hand-written set of common function words.
manual_stopwords = {
    'is', 'to', 'the', 'and', 'a', 'an', 'of', 'in', 'on',
}
manual_filtered = [
    token
    for token in regex_tokens
    if token not in manual_stopwords
]
print(f"\nFiltered Tokens (Manual Stopwords): {manual_filtered}")



Filtered Tokens (Manual Stopwords): ['natural', 'language', 'processing', 'nlp', 'evolving', 'rapidly', 'enabling', 'machines', 'understand', 'human', 'language']


In [20]:

# Manual stemming (basic example): strip one common English suffix
# ('ing', 'ed', 'ly' or 's') from the end of each token.
# Two guards keep the rule from mangling words:
#   * (?<=\w{3}) requires at least three word characters before the suffix, so
#     short words such as 'is', 'bed' or 'sing' are not cut down to 'i', 'b', 's'.
#   * (?<!s)s leaves a double-s ending alone, so 'class' does not become 'clas'.
# Only a single suffix is removed per token; this is a demonstration, not a
# replacement for a real stemmer.
manual_stemmed = [
    re.sub(r'(?<=\w{3})(?:ing|ed|ly|(?<!s)s)$', '', word)
    for word in manual_filtered
]
print(f"\nStemmed Tokens (Regex-based): {manual_stemmed}")


Stemmed Tokens (Regex-based): ['natural', 'language', 'process', 'nlp', 'evolv', 'rapid', 'enabl', 'machine', 'understand', 'human', 'language']
