In [18]:
import nltk

nltk.download('punkt') # Punkt is used for sentence and word tokenization
nltk.download('stopwords') # Stopwords corpus contains common words like "is", "the", "and"
nltk.download('wordnet')  
# WordNet is required for lemmatization
nltk.download('averaged_perceptron_tagger')# Averaged perceptron tagger is used for POS tagging


[nltk_data] Downloading package punkt to
[nltk_data]     /home/pict-a1-102/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pict-a1-102/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pict-a1-102/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pict-a1-102/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
text = """Tokenization is the first step in text analytics.
The process of breaking down a text paragraph into smaller chunks
such as words or sentences is called Tokenization."""


In [21]:
 #text into sentences
from nltk.tokenize import sent_tokenize
tokenized_text = sent_tokenize(text)
print(tokenized_text)


['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks\nsuch as words or sentences is called Tokenization.']


In [22]:
#text into words
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(text)
print(tokenized_word)


['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [24]:
#Remove Punctuations and Stop Words
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))
text = "How to remove stop words with NLTK library in Python?"

# Remove punctuation and special characters
text = re.sub('[^a-zA-Z]', ' ', text)

# Convert text to lowercase and tokenize
tokens = word_tokenize(text.lower())
filtered_text = []

# Remove stop words
for w in tokens:
    if w not in stop_words:
        filtered_text.append(w)

print("Tokenized Sentence:", tokens)
print("Filtered Sentence:", filtered_text)


Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filtered Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


In [25]:
#Stemming reduces words to their root form by removing suffixes.
from nltk.stem import PorterStemmer

e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()

for w in e_words:
    rootWord = ps.stem(w)
    print(rootWord)


wait
wait
wait
wait


In [8]:
#Lemmatization converts words into their base or dictionary form (lemma).
from nltk.stem import WordNetLemmatizer
import nltk

wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"

tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))


Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


In [28]:
#POS tagging assigns grammatical labels such as noun, verb, adjective, etc., to each word.
import nltk
from nltk.tokenize import word_tokenize

data = "The pink sweater fit her perfectly"
words = word_tokenize(data) #tokenise

for word in words:
    print(nltk.pos_tag([word]))
    


[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


In [10]:
import pandas as pd
import math


In [11]:
documentA = "Jupiter is the largest Planet"
documentB = "Mars is the fourth planet from the Sun"


In [30]:
#Bag of Words represents text as a list of words without considering grammar or order
bagOfWordsA = documentA.split(" ")
bagOfWordsB = documentB.split(" ")


In [31]:
#Unique words from both documents form the vocabulary of the corpus.
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))


In [32]:
#This step counts how many times each word appears in each document
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1

numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1


In [33]:
#TF measures how frequently a word appears in a document.
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)


In [16]:
#IDF reduces the importance of commonly occurring words across documents.
def computeIDF(documents):
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)

    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idfs = computeIDF([numOfWordsA, numOfWordsB])


In [34]:
# TF-IDF is the product of TF and IDF.
# It highlights important words in a document while reducing common words.
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)                

df = pd.DataFrame([tfidfA, tfidfB])
print(df)


       from       Sun   is   largest   Jupiter    planet    Planet      Mars  \
0  0.000000  0.000000  0.0  0.138629  0.138629  0.000000  0.138629  0.000000   
1  0.086643  0.086643  0.0  0.000000  0.000000  0.086643  0.000000  0.086643   

     fourth  the  
0  0.000000  0.0  
1  0.086643  0.0  
