In [2]:
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\bhush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\a

True

In [6]:
document = "Natural Language Processing (NLP) is a sub-field of Artificial Intelligence that focuses on the interaction between computers and humans using natural language."


In [12]:
tokens = word_tokenize(document)
print("Tokens:", tokens)


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'sub-field', 'of', 'Artificial', 'Intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.']


In [18]:
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('sub-field', 'NN'), ('of', 'IN'), ('Artificial', 'NNP'), ('Intelligence', 'NNP'), ('that', 'WDT'), ('focuses', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('humans', 'NNS'), ('using', 'VBG'), ('natural', 'JJ'), ('language', 'NN'), ('.', '.')]


In [20]:
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [word for word in tokens if word.lower() not in stop_words]
print("After Stopword Removal:", tokens_no_stopwords)


After Stopword Removal: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'sub-field', 'Artificial', 'Intelligence', 'focuses', 'interaction', 'computers', 'humans', 'using', 'natural', 'language', '.']


In [22]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens_no_stopwords]
print("After Stemming:", stemmed_words)


After Stemming: ['natur', 'languag', 'process', '(', 'nlp', ')', 'sub-field', 'artifici', 'intellig', 'focus', 'interact', 'comput', 'human', 'use', 'natur', 'languag', '.']


In [24]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens_no_stopwords]
print("After Lemmatization:", lemmatized_words)


After Lemmatization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'sub-field', 'Artificial', 'Intelligence', 'focus', 'interaction', 'computer', 'human', 'using', 'natural', 'language', '.']


In [26]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


TF-IDF Matrix:
 [[0.19611614 0.19611614 0.19611614 0.19611614 0.19611614 0.19611614
  0.19611614 0.19611614 0.19611614 0.19611614 0.39223227 0.39223227
  0.19611614 0.19611614 0.19611614 0.19611614 0.19611614 0.19611614
  0.19611614 0.19611614]]


In [28]:
print("TF-IDF Feature Names:\n", tfidf_vectorizer.get_feature_names_out())

TF-IDF Feature Names:
 ['and' 'artificial' 'between' 'computers' 'field' 'focuses' 'humans'
 'intelligence' 'interaction' 'is' 'language' 'natural' 'nlp' 'of' 'on'
 'processing' 'sub' 'that' 'the' 'using']
