In [24]:
import pandas as pd
import nltk

nltk.download('punkt')
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

text = "Hello Hello Hello everyone. Welcome to DSBDA lab. We are studying text analytics."
# if there is a text file then use this below code:
# text = open("Text.txt").read()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Problem Statement 10

## Tokenization

In [25]:
# Sentence Tokenization
from nltk.tokenize import sent_tokenize
token_sent = sent_tokenize(text)
token_sent

['Hello Hello Hello everyone.',
 'Welcome to DSBDA lab.',
 'We are studying text analytics.']

In [26]:
# Word Tokenization
from nltk.tokenize import word_tokenize
token_word = word_tokenize(text)
token_word

['Hello',
 'Hello',
 'Hello',
 'everyone',
 '.',
 'Welcome',
 'to',
 'DSBDA',
 'lab',
 '.',
 'We',
 'are',
 'studying',
 'text',
 'analytics',
 '.']

# POS Tagging

In [27]:
nltk.pos_tag(token_word)

[('Hello', 'NNP'),
 ('Hello', 'NNP'),
 ('Hello', 'NNP'),
 ('everyone', 'NN'),
 ('.', '.'),
 ('Welcome', 'NNP'),
 ('to', 'TO'),
 ('DSBDA', 'NNP'),
 ('lab', 'NN'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('studying', 'VBG'),
 ('text', 'IN'),
 ('analytics', 'NNS'),
 ('.', '.')]

# Stop words removal

In [28]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [29]:
filtered_sent = []
for i in token_word:
    if i.lower() not in stopwords:
        filtered_sent.append(i)
        
print("Tokenize", token_word)
print("Filtered", filtered_sent)

Tokenize ['Hello', 'Hello', 'Hello', 'everyone', '.', 'Welcome', 'to', 'DSBDA', 'lab', '.', 'We', 'are', 'studying', 'text', 'analytics', '.']
Filtered ['Hello', 'Hello', 'Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']


# Stemming

In [30]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = []
for i in filtered_sent:
    stemmed.append(ps.stem(i))
print("Filtered", filtered_sent)
print("Stemmed", stemmed)

Filtered ['Hello', 'Hello', 'Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']
Stemmed ['hello', 'hello', 'hello', 'everyon', '.', 'welcom', 'dsbda', 'lab', '.', 'studi', 'text', 'analyt', '.']


# Lemmatization

In [31]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('omw-1.4')

lem = WordNetLemmatizer()
lemmatize = []
for i in filtered_sent:
    lemmatize.append(lem.lemmatize(i))
print("Filtered", filtered_sent)
print("Lemmatize", lemmatize)

Filtered ['Hello', 'Hello', 'Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']
Lemmatize ['Hello', 'Hello', 'Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sanke\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Term Frequency and Inverse Document Frequency

In [32]:
fre = dict()
for words in filtered_sent:
    if words in fre:
        fre[words] += 1
    else:
        fre[words] = 1

print(fre)

{'Hello': 3, 'everyone': 1, '.': 3, 'Welcome': 1, 'DSBDA': 1, 'lab': 1, 'studying': 1, 'text': 1, 'analytics': 1}


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(filtered_sent)
# Get the IDF values from the vectorizer
idf_values = tfidf.idf_

# Print the terms and their corresponding IDF values
terms = tfidf.get_feature_names()
for term, idf in zip(terms, idf_values):
    print(term, idf)
# print(result)

analytics 2.9459101490553135
dsbda 2.9459101490553135
everyone 2.9459101490553135
hello 2.252762968495368
lab 2.9459101490553135
studying 2.9459101490553135
text 2.9459101490553135
welcome 2.9459101490553135
