In [67]:
import math  # Logarithm used by the inverse document frequency
import re    # Regular Expressions for text preprocessing

import nltk  # Natural Language Toolkit for NLP tasks
import pandas as pd


In [68]:
# Fetch the NLTK resources used by this notebook. The recorded log shows each
# package being reported as already up to date when it is present locally.
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stop word list
nltk.download('wordnet')  # For lemmatization
nltk.download('averaged_perceptron_tagger')  # For POS tagging


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [69]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: Sequence of the document's tokens; only its length is
            used, as the denominator of each frequency.

    Returns:
        A dict with the same keys as ``wordDict`` whose values are
        ``count / len(bagOfWords)``. When ``bagOfWords`` is empty every
        frequency is 0.0 instead of raising ``ZeroDivisionError``.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}


In [70]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: List of dicts, one per document, mapping word -> count.
            The dicts do not need to share the same keys.

    Returns:
        A dict mapping each word that appears as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is positive. Words
        whose count is never positive map to 0. An empty ``documents`` list
        yields an empty dict.
    """
    N = len(documents)  # Total number of documents

    # Build the vocabulary from every document, not just the first one, so a
    # word that only occurs in a later document cannot raise KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency[word] = docFrequency.get(word, 0) + (1 if val > 0 else 0)

    return {
        word: math.log(N / seenIn) if seenIn > 0 else 0
        for word, seenIn in docFrequency.items()
    }


In [71]:
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tfBagOfWords``.

    Returns:
        A dict with the keys of ``tfBagOfWords`` and values ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}


In [72]:
# Two-sentence sample paragraph used by the tokenization cells.
text = (
    "Tokenization is the first step in text analytics. "
    "The process of breaking down a text paragraph into smaller chunks "
    "such as words or sentences is called Tokenization."
)
print('The given sentences are: \n', text)


The given sentences are: 
 Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.


🔹 Sentence Tokenization

In [73]:
from nltk.tokenize import sent_tokenize

# Break the sample paragraph into a list of sentences.
tokenized_text = sent_tokenize(text)
print("\n Sentence Tokenization: \n", tokenized_text)



 Sentence Tokenization: 
 ['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


🔹 Word Tokenization

In [74]:
from nltk.tokenize import word_tokenize

# Break the sample paragraph into word tokens; the recorded output shows
# punctuation such as '.' emitted as separate tokens.
tokenized_word = word_tokenize(text)
print('\nWord Tokeniztion: \n', tokenized_word)



Word Tokeniztion: 
 ['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


🔹 Stop Word Removal

In [75]:
from nltk.corpus import stopwords

# English stop words, held in a set for the membership checks done when filtering.
stop_words = set(stopwords.words("english"))


In [76]:
text = "How to remove stop words with NLTK library in Python?"

# Replace every non-letter character with a space, then tokenize in lower case.
text = re.sub('[^a-zA-Z]', ' ', text)
tokens = word_tokenize(text.lower())

# Keep only the tokens that are not in the stop-word set.
filtered_text = [w for w in tokens if w not in stop_words]


In [77]:
# Show the tokens before and after stop-word removal.
print("Tokenized Sentence:", tokens)
print("Filterd Sentence:", filtered_text)


Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filterd Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


🔹 Stemming

In [78]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
e_words = ["wait", "waiting", "waited", "waits"]

# Print the Porter stem of each inflected form.
for w in e_words:
    rootWord = ps.stem(w)
    print('Stemming for ', w, ': ', rootWord)


Stemming for  wait :  wait
Stemming for  waiting :  wait
Stemming for  waited :  wait
Stemming for  waits :  wait


🔹 Lemmatization

In [79]:
from nltk.stem import WordNetLemmatizer

text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech argument; in the recorded
# output 'studying' comes back unchanged while 'studies' becomes 'study'.
for w in tokenization:
    lemma = wordnet_lemmatizer.lemmatize(w)
    print("Lemma for {} is {}".format(w, lemma))


Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


🔹 TF-IDF from Scratch

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

documentA = 'Jupiter is the largest planet'
documentB = 'Mars is the fourth planet from the Sun'

# Tokenize on single spaces; matching is case-sensitive.
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

# Shared vocabulary: every distinct token from either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# Per-document counts over the full vocabulary (0 for absent words), so both
# dicts have identical keys.
numOfWordsA = {word: bagOfWordsA.count(word) for word in uniqueWords}
numOfWordsB = {word: bagOfWordsB.count(word) for word in uniqueWords}


🔹 TF Calculation

In [81]:
# Term frequency of every vocabulary word, computed separately per document.
tfA, tfB = (
    computeTF(numOfWordsA, bagOfWordsA),
    computeTF(numOfWordsB, bagOfWordsB),
)


🔹 Display Term Frequencies

In [82]:
# One row per document, one column per vocabulary word.
df = pd.DataFrame([tfA, tfB])
print('----------------Term Frequency----------------------')
print(df)


----------------Term Frequency----------------------
    Mars   the    Sun  Jupiter  fourth  planet     is  largest   from
0  0.000  0.20  0.000      0.2   0.000   0.200  0.200      0.2  0.000
1  0.125  0.25  0.125      0.0   0.125   0.125  0.125      0.0  0.125


🔹 IDF & TF-IDF Calculation

In [83]:
# IDF is computed once over both documents and shared by the two TF-IDF vectors.
idfs = computeIDF([numOfWordsA, numOfWordsB])
print('----------------Inverse Document Frequency----------------------')
print(idfs)

# Weight each document's term frequencies by the shared IDF values.
tfidfA, tfidfB = computeTFIDF(tfA, idfs), computeTFIDF(tfB, idfs)

# `df` is rebound here to the TF-IDF table (one row per document).
df = pd.DataFrame([tfidfA, tfidfB])
print('------------------- TF-IDF--------------------------------------')
print(df)



----------------Inverse Document Frequency----------------------
{'Mars': 0.6931471805599453, 'the': 0.0, 'Sun': 0.6931471805599453, 'Jupiter': 0.6931471805599453, 'fourth': 0.6931471805599453, 'planet': 0.0, 'is': 0.0, 'largest': 0.6931471805599453, 'from': 0.6931471805599453}
------------------- TF-IDF--------------------------------------
       Mars  the       Sun   Jupiter    fourth  planet   is   largest  \
0  0.000000  0.0  0.000000  0.138629  0.000000     0.0  0.0  0.138629   
1  0.086643  0.0  0.086643  0.000000  0.086643     0.0  0.0  0.000000   

       from  
0  0.000000  
1  0.086643  
