In [None]:
# pip install nltk
# pip install PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import PyPDF2

In [None]:
# Fetch the NLTK data packages the later cells rely on (already-present packages are skipped,
# as the "already up-to-date" messages in the output show).
# 'punkt_tab' is downloaded instead of the older 'punkt' package, which is left disabled here.
#nltk.download('punkt')
nltk.download('punkt_tab')   # tokenizer data; presumably what word_tokenize needs — confirm for your NLTK version
nltk.download('stopwords')   # word lists read through stopwords.words('english')
nltk.download('wordnet')     # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# A corpus is a collection of text documents.
# We take a short tweets/reviews PDF as our dataset.
PDF_PATH = 'data.pdf'

# Hand PdfReader the path instead of an open file object. Given a path, the
# reader opens the file itself, copies its bytes into memory and closes it,
# so no file handle stays open for the lifetime of the kernel, and the pages
# can still be read lazily in later cells.
corpus = PyPDF2.PdfReader(PDF_PATH)

## **Preprocessing Techniques** - Preprocessing improves accuracy, efficiency, and understanding in NLP tasks such as sentiment analysis, text classification, and topic modeling.

## 1. Tokenization

In [None]:
# Tokenization: every PDF page is treated as one document. Its extracted text
# is lower-cased and then split into word and punctuation tokens.
# NOTE(review): the printed output contains fragments such as 't', 'he' and
# 'ovie' — presumably words broken across line wraps in the PDF; confirm.
tokenized_corpus = [
    word_tokenize(page.extract_text().lower())
    for page in corpus.pages
]

print("Tokenized Corpus:")
for i, doc_tokens in enumerate(tokenized_corpus, start=1):
    print(f"D{i}:", doc_tokens)

Tokenized Corpus:
D1: ['i', 'love', 'this', 'phone', '!', 'the', 'camera', 'quality', 'is', 'amazing', 'and', 'captures', 'very', 'clear', 'and', 'vibrant', 'photos', 'even', 'in', 'low', 'light', '.', 'the', 'portrait', 'mode', 'works', 'beautifully', ',', 'and', 'the', 'video', 'stabilization', 'is', 'impressive', '.', 'overall', ',', 'it', 'feels', 'like', 'a', 'premium', 'device', 'and', 'is', 'definitely', 'worth', 't', 'he', 'price', '.', 'worst', 'service', 'ever', '.', 'i', 'am', 'totally', 'disappointed', 'with', 'the', 'delivery', 'experience', '.', 'the', 'package', 'arrived', 'late', ',', 'there', 'were', 'no', 'proper', 'updates', ',', 'and', 'customer', 'support', 'was', 'unhelpful', 'when', 'i', 'tried', 'to', 'track', 'my', 'order', '.', 'this', 'really', 'ruined', 'my', 'overall', 'experience', '.', 'the', 'm', 'ovie', 'was', 'not', 'good', ',', 'but', 'the', 'music', 'was', 'fantastic', '.', 'while', 'the', 'storyline', 'felt', 'weak', 'and', 'predictable', ',', 'the'

## 2. Stopword Removal


In [None]:
# Stopword removal: drop very common English words that carry little meaning.
# A set gives constant-time membership checks inside the filter below.
stop_words = set(stopwords.words('english'))

# Keep a token only if it is purely alphabetic (this also discards punctuation
# and numbers) and it is not an English stopword.
filtered_corpus = [
    [token for token in doc_tokens if token.isalpha() and token not in stop_words]
    for doc_tokens in tokenized_corpus
]

print("\nAfter Stopword Removal:")
for i, doc_tokens in enumerate(filtered_corpus, start=1):
    print(f"D{i}:", doc_tokens)


After Stopword Removal:
D1: ['love', 'phone', 'camera', 'quality', 'amazing', 'captures', 'clear', 'vibrant', 'photos', 'even', 'low', 'light', 'portrait', 'mode', 'works', 'beautifully', 'video', 'stabilization', 'impressive', 'overall', 'feels', 'like', 'premium', 'device', 'definitely', 'worth', 'price', 'worst', 'service', 'ever', 'totally', 'disappointed', 'delivery', 'experience', 'package', 'arrived', 'late', 'proper', 'updates', 'customer', 'support', 'unhelpful', 'tried', 'track', 'order', 'really', 'ruined', 'overall', 'experience', 'ovie', 'good', 'music', 'fantastic', 'storyline', 'felt', 'weak', 'predictable', 'background', 'score', 'songs', 'beautifully', 'composed', 'emotionally', 'engaging', 'would', 'listen', 'soundtrack', 'even', 'though', 'rewatch', 'movi', 'fast', 'delivery', 'great', 'packaging', 'satisfied', 'quickly', 'product', 'arrived', 'securely', 'packed', 'everything', 'neatly', 'organized', 'damage', 'kind', 'service', 'really', 'builds', 'trust', 'brand'

## 3. Stemming / Lemmatization

In [None]:
# Lemmatization: map each remaining token to its dictionary base form.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech argument, so every token is
# treated as a noun: plurals are reduced ('photos' -> 'photo') while verb
# forms such as 'arrived' or 'tried' pass through unchanged.
lemmatized_corpus = [
    list(map(lemmatizer.lemmatize, doc_tokens))
    for doc_tokens in filtered_corpus
]

print("\nAfter Lemmatization (Final Output):")
for i, doc_tokens in enumerate(lemmatized_corpus, start=1):
    print(f"D{i}:", doc_tokens)


After Lemmatization (Final Output):
D1: ['love', 'phone', 'camera', 'quality', 'amazing', 'capture', 'clear', 'vibrant', 'photo', 'even', 'low', 'light', 'portrait', 'mode', 'work', 'beautifully', 'video', 'stabilization', 'impressive', 'overall', 'feel', 'like', 'premium', 'device', 'definitely', 'worth', 'price', 'worst', 'service', 'ever', 'totally', 'disappointed', 'delivery', 'experience', 'package', 'arrived', 'late', 'proper', 'update', 'customer', 'support', 'unhelpful', 'tried', 'track', 'order', 'really', 'ruined', 'overall', 'experience', 'ovie', 'good', 'music', 'fantastic', 'storyline', 'felt', 'weak', 'predictable', 'background', 'score', 'song', 'beautifully', 'composed', 'emotionally', 'engaging', 'would', 'listen', 'soundtrack', 'even', 'though', 'rewatch', 'movi', 'fast', 'delivery', 'great', 'packaging', 'satisfied', 'quickly', 'product', 'arrived', 'securely', 'packed', 'everything', 'neatly', 'organized', 'damage', 'kind', 'service', 'really', 'build', 'trust', 'b