In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag

In [3]:
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/xantanium/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/xantanium/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/xantanium/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/xantanium/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [5]:
doc = "Tokenization is the first step of text processing. It involves breaking down text into words or sentences"

In [7]:
'''
1. Tokenize
2. Remove stopwords and puncts
3. Stem and lemmatize
4. pos tag
5. tfidf
'''

# Tokenize
words = word_tokenize(doc)
sentences = sent_tokenize(doc)

print(f"Word Tokens: {words}")
print(f"Sentence Tokens: {sentences}")

Word Tokens: ['Tokenization', 'is', 'the', 'first', 'step', 'of', 'text', 'processing', '.', 'It', 'involves', 'breaking', 'down', 'text', 'into', 'words', 'or', 'sentences']
Sentence Tokens: ['Tokenization is the first step of text processing.', 'It involves breaking down text into words or sentences']


In [9]:
# Stopword removal
stops = set(stopwords.words('english'))
puncs = set(string.punctuation)

filtered = [word for word in words if word not in stops and word not in puncs]
print(f"Filtered Words: {filtered}")

Filtered Words: ['Tokenization', 'first', 'step', 'text', 'processing', 'It', 'involves', 'breaking', 'text', 'words', 'sentences']


In [11]:
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

stems = [stemmer.stem(word) for word in filtered]
print(f"Stems: {stems}")

lemms = [lemma.lemmatize(word) for word in filtered]
print(f"Lemmatized Words: {lemms}")

Stems: ['token', 'first', 'step', 'text', 'process', 'it', 'involv', 'break', 'text', 'word', 'sentenc']
Lemmatized Words: ['Tokenization', 'first', 'step', 'text', 'processing', 'It', 'involves', 'breaking', 'text', 'word', 'sentence']


In [13]:
# pos-tagging
pos_tags = pos_tag(filtered)
print(f"POS tags: {pos_tags}")

POS tags: [('Tokenization', 'NN'), ('first', 'RB'), ('step', 'VB'), ('text', 'RB'), ('processing', 'VBG'), ('It', 'PRP'), ('involves', 'VBZ'), ('breaking', 'VBG'), ('text', 'NN'), ('words', 'NNS'), ('sentences', 'NNS')]


In [20]:
# TFIDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc])

feature_names = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

print(f"TFIDF: {tfidf_array}")
print(f"Feature names: {feature_names}")

TFIDF: [[0.22941573 0.22941573 0.22941573 0.22941573 0.22941573 0.22941573
  0.22941573 0.22941573 0.22941573 0.22941573 0.22941573 0.22941573
  0.45883147 0.22941573 0.22941573 0.22941573]]
Feature names: ['breaking' 'down' 'first' 'into' 'involves' 'is' 'it' 'of' 'or'
 'processing' 'sentences' 'step' 'text' 'the' 'tokenization' 'words']
