**NLTK**

In [67]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Step 1: Lowercasing & Tokenization


In [68]:
from nltk.tokenize import word_tokenize

In [69]:
text = "While walking through the park and listening to music, she started thinking about everything that had been bothering her lately, including the growing pressure at work, the never-ending meetings, and the feeling that she was losing control — but instead of panicking, she kept breathing deeply and focusing on calming her mind."

In [70]:
tokens=word_tokenize(text.lower())

In [71]:
print(tokens)

['while', 'walking', 'through', 'the', 'park', 'and', 'listening', 'to', 'music', ',', 'she', 'started', 'thinking', 'about', 'everything', 'that', 'had', 'been', 'bothering', 'her', 'lately', ',', 'including', 'the', 'growing', 'pressure', 'at', 'work', ',', 'the', 'never-ending', 'meetings', ',', 'and', 'the', 'feeling', 'that', 'she', 'was', 'losing', 'control', '—', 'but', 'instead', 'of', 'panicking', ',', 'she', 'kept', 'breathing', 'deeply', 'and', 'focusing', 'on', 'calming', 'her', 'mind', '.']


Step 2: Removing Stopwords

In [72]:
from nltk.corpus import stopwords

In [73]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
stop_words = set(stopwords.words('english'))

In [75]:
print(stop_words)

{'my', 'for', 'some', 'hers', 'same', 'theirs', 'am', 'now', 'only', 'who', 'there', 'both', "aren't", 'don', 'been', 'having', 've', 'weren', "don't", 'with', 'because', 'just', 'at', "shan't", "doesn't", 'while', 'aren', "hasn't", "isn't", 'have', "needn't", 'no', 'doing', 'this', 'a', 'where', "you'd", 'she', 'then', 'wouldn', "mustn't", 'from', "she's", "mightn't", "it'd", 'needn', "weren't", 'more', "they're", 'on', 'didn', "it's", 's', "we're", 'nor', 'once', 'ain', "you'll", 'in', "i'm", 'when', "wasn't", 'off', 'couldn', 'an', 'does', 'what', "won't", 'how', 'whom', 'about', 'should', 'were', 'itself', "hadn't", 'him', "we'll", 'ours', 'again', "didn't", 'haven', 'than', 'or', 'our', "you're", 'such', 'd', 'hadn', 'and', 'them', 'too', 'i', 'any', 'of', "i'll", "it'll", 'has', 'not', 'o', 'own', "we've", 'will', 'mightn', 'but', 'is', 'it', "she'll", 'doesn', 'under', 'so', 'isn', 'if', 'her', 'out', 'your', 'into', 're', 'down', 'against', 'y', 'that', 'won', 'he', 'mustn', 'y

In [76]:
filtered = [w for w in tokens if w not in stop_words]

In [77]:
print(filtered)

['walking', 'park', 'listening', 'music', ',', 'started', 'thinking', 'everything', 'bothering', 'lately', ',', 'including', 'growing', 'pressure', 'work', ',', 'never-ending', 'meetings', ',', 'feeling', 'losing', 'control', '—', 'instead', 'panicking', ',', 'kept', 'breathing', 'deeply', 'focusing', 'calming', 'mind', '.']


Step 3: Stemming

In [78]:
from nltk.stem import PorterStemmer

In [79]:
stemmer=PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]
print(stemmed)

['walk', 'park', 'listen', 'music', ',', 'start', 'think', 'everyth', 'bother', 'late', ',', 'includ', 'grow', 'pressur', 'work', ',', 'never-end', 'meet', ',', 'feel', 'lose', 'control', '—', 'instead', 'panick', ',', 'kept', 'breath', 'deepli', 'focus', 'calm', 'mind', '.']


Step 4: POS Tagging

In [80]:
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = nltk.pos_tag(filtered)
print(pos_tags)

[('walking', 'VBG'), ('park', 'NN'), ('listening', 'VBG'), ('music', 'NN'), (',', ','), ('started', 'VBD'), ('thinking', 'VBG'), ('everything', 'NN'), ('bothering', 'VBG'), ('lately', 'RB'), (',', ','), ('including', 'VBG'), ('growing', 'VBG'), ('pressure', 'NN'), ('work', 'NN'), (',', ','), ('never-ending', 'JJ'), ('meetings', 'NNS'), (',', ','), ('feeling', 'VBG'), ('losing', 'VBG'), ('control', 'NN'), ('—', 'NNP'), ('instead', 'RB'), ('panicking', 'NN'), (',', ','), ('kept', 'VBD'), ('breathing', 'VBG'), ('deeply', 'RB'), ('focusing', 'VBG'), ('calming', 'VBG'), ('mind', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


**Spacy**

In [81]:
!pip install spacy
import spacy



Step 1: Loading & Tokenization

In [82]:
nlp=spacy.load("en_core_web_sm")

In [83]:
doc=nlp(text)
tokens=[token.text for token in doc]
print(tokens)

['While', 'walking', 'through', 'the', 'park', 'and', 'listening', 'to', 'music', ',', 'she', 'started', 'thinking', 'about', 'everything', 'that', 'had', 'been', 'bothering', 'her', 'lately', ',', 'including', 'the', 'growing', 'pressure', 'at', 'work', ',', 'the', 'never', '-', 'ending', 'meetings', ',', 'and', 'the', 'feeling', 'that', 'she', 'was', 'losing', 'control', '—', 'but', 'instead', 'of', 'panicking', ',', 'she', 'kept', 'breathing', 'deeply', 'and', 'focusing', 'on', 'calming', 'her', 'mind', '.']


Step 2: Stopword Removal

In [84]:
filtered = [token.text for token in doc if not token.is_stop]
print(filtered)

['walking', 'park', 'listening', 'music', ',', 'started', 'thinking', 'bothering', 'lately', ',', 'including', 'growing', 'pressure', 'work', ',', '-', 'ending', 'meetings', ',', 'feeling', 'losing', 'control', '—', 'instead', 'panicking', ',', 'kept', 'breathing', 'deeply', 'focusing', 'calming', 'mind', '.']


spaCy - Step 3: Lemmatization

In [85]:
lemmas = [token.lemma_ for token in doc if not token.is_stop]
print(lemmas)

['walk', 'park', 'listen', 'music', ',', 'start', 'think', 'bother', 'lately', ',', 'include', 'grow', 'pressure', 'work', ',', '-', 'end', 'meeting', ',', 'feeling', 'lose', 'control', '—', 'instead', 'panic', ',', 'keep', 'breathe', 'deeply', 'focus', 'calm', 'mind', '.']


Step 4: POS Tagging

In [86]:
for token in doc:
  print(f"{token.text} - {token.pos_} - {token.tag_}")

While - SCONJ - IN
walking - VERB - VBG
through - ADP - IN
the - DET - DT
park - NOUN - NN
and - CCONJ - CC
listening - VERB - VBG
to - ADP - IN
music - NOUN - NN
, - PUNCT - ,
she - PRON - PRP
started - VERB - VBD
thinking - VERB - VBG
about - ADP - IN
everything - PRON - NN
that - PRON - WDT
had - AUX - VBD
been - AUX - VBN
bothering - VERB - VBG
her - PRON - PRP
lately - ADV - RB
, - PUNCT - ,
including - VERB - VBG
the - DET - DT
growing - VERB - VBG
pressure - NOUN - NN
at - ADP - IN
work - NOUN - NN
, - PUNCT - ,
the - DET - DT
never - ADV - RB
- - PUNCT - HYPH
ending - VERB - VBG
meetings - NOUN - NNS
, - PUNCT - ,
and - CCONJ - CC
the - DET - DT
feeling - NOUN - NN
that - SCONJ - IN
she - PRON - PRP
was - AUX - VBD
losing - VERB - VBG
control - NOUN - NN
— - PUNCT - :
but - CCONJ - CC
instead - ADV - RB
of - ADP - IN
panicking - VERB - VBG
, - PUNCT - ,
she - PRON - PRP
kept - VERB - VBD
breathing - VERB - VBG
deeply - ADV - RB
and - CCONJ - CC
focusing - VERB - VBG
on - ADP - 

**Text Representation in NLP**

Bag of Words (BoW)

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
corpus=["I love NLP", "NLP is fun"]
vectorizer=CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['fun' 'is' 'love' 'nlp']
[[0 0 1 1]
 [1 1 0 1]]


TF-IDF (Term Frequency – Inverse Document Frequency)

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus=["I love NLP", "NLP is fun"]
vectorizer=TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['fun' 'is' 'love' 'nlp']
[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]
