In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

Download necessary resources 

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\VEDIKA/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VEDIKA/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VEDIKA/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VEDIKA/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
text = "Natural Language Processing (NLP) helps computers understand human language. It is widely used in sentiment analysis, chatbots, and text summarization."

Tokenization

In [6]:
sentences=sent_tokenize(text)
words=word_tokenize(text.lower())
print(sentences)
print(words)

['Natural Language Processing (NLP) helps computers understand human language.', 'It is widely used in sentiment analysis, chatbots, and text summarization.']
['natural', 'language', 'processing', '(', 'nlp', ')', 'helps', 'computers', 'understand', 'human', 'language', '.', 'it', 'is', 'widely', 'used', 'in', 'sentiment', 'analysis', ',', 'chatbots', ',', 'and', 'text', 'summarization', '.']


Remove stop words

In [8]:
stop_words=set(stopwords.words('english'))
print(stop_words)

{'once', 'no', 'there', 'an', 'isn', 'them', "they've", 'through', 'how', 'yourself', 'don', 'mustn', 'very', 'of', 'up', 'd', "i'd", 'needn', 'yourselves', "he'd", 'm', 'more', "weren't", "she's", "that'll", 'some', 'but', 'that', 'been', 'further', 'didn', 'hasn', "won't", 'so', 'few', 'what', 'above', 'whom', "she'd", 'does', 'too', 'it', 'haven', 'just', 'again', 'between', "shan't", 'theirs', 've', 'wouldn', 'the', 'had', "couldn't", 'weren', "haven't", "hasn't", "i'm", 'below', 'not', "they're", 'won', 'and', 'before', 'other', 'because', 'couldn', "he'll", 'hadn', 're', "wasn't", 'your', "i'll", "mustn't", 'from', 'while', 'at', 'those', 'under', 'until', "mightn't", 'or', 'was', "we'd", 'now', 'itself', 'during', 'mightn', 'such', 'where', 'hers', 'he', 'you', 'she', 'on', 'than', 'down', 'ain', 'its', "wouldn't", 'if', 'in', "you'd", 'my', 'our', 'their', "didn't", 'only', 'wasn', 'who', "she'll", "you're", "doesn't", 'myself', 'did', "he's", 'they', "isn't", "it's", 'shan', '

In [10]:
filtered_words=[word for word in words if word not in stop_words and word.isalpha()]
print(filtered_words)

['natural', 'language', 'processing', 'nlp', 'helps', 'computers', 'understand', 'human', 'language', 'widely', 'used', 'sentiment', 'analysis', 'chatbots', 'text', 'summarization']


Stemming and Lemmatization

In [14]:
ps=PorterStemmer()
lemmatizer=WordNetLemmatizer()
stemmed_words=[ps.stem(word) for word in filtered_words]
lemmatize_words=[lemmatizer.lemmatize(word) for word in filtered_words]
print(stemmed_words)
print(lemmatize_words)

['natur', 'languag', 'process', 'nlp', 'help', 'comput', 'understand', 'human', 'languag', 'wide', 'use', 'sentiment', 'analysi', 'chatbot', 'text', 'summar']
['natural', 'language', 'processing', 'nlp', 'help', 'computer', 'understand', 'human', 'language', 'widely', 'used', 'sentiment', 'analysis', 'chatbots', 'text', 'summarization']


Part of Speech tagging

In [15]:
pos_tags=nltk.pos_tag(filtered_words)
print(pos_tags)

[('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('nlp', 'NN'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('widely', 'RB'), ('used', 'VBN'), ('sentiment', 'NN'), ('analysis', 'NN'), ('chatbots', 'NNS'), ('text', 'JJ'), ('summarization', 'NN')]


TF-IDF Representation

In [25]:
corpus=[''.join(lemmatize_words)]
tfidf=TfidfVectorizer()
X=tfidf.fit_transform(corpus).toarray()
print("TF-IDF Matrix:\n",X)

TF-IDF Matrix:
 [[1.]]
