In [1]:
import pandas as pd
import nltk
import re

# Download the NLTK data used by the tokenizer, stop-word, lemmatizer and
# POS-tagger examples below.
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' and the
# English POS tagger from 'averaged_perceptron_tagger_eng'; the legacy
# resource names are still requested so older NLTK releases keep working.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sentence tokenization: split the sample paragraph into a list of sentences.
text = "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."
tokenized_text = nltk.sent_tokenize(text)
print("\n Sentence Tokenization: \n", tokenized_text)

# Word tokenization: split the same paragraph into word and punctuation tokens.
tokenized_word = nltk.word_tokenize(text)
print('\nWord Tokeniztion: \n', tokenized_word)

# POS tagging: pair every token from the word tokenization above with a
# part-of-speech tag, producing a list of (token, tag) tuples.
tagged_text = nltk.pos_tag(tokenized_word)
print("\nPOS Tagging: \n", tagged_text)

# Stop-word removal: replace every non-letter with a space, lowercase and
# tokenize the sentence, then keep only the tokens that are not in NLTK's
# English stop-word list.
stop_words = set(nltk.corpus.stopwords.words("english"))
text = re.sub('[^a-zA-Z]', ' ', "How to remove stop words with NLTK library in Python?")
tokens = nltk.word_tokenize(text.lower())
filtered_text = list(filter(lambda token: token not in stop_words, tokens))
print("Tokenized Sentence:", tokens)
print("Filtered Sentence:", filtered_text)

# Stemming: run the Porter stemmer over several inflected forms of "wait"
# and print the stem produced for each one.
from nltk.stem import PorterStemmer
e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for w in e_words:
    rootWord = ps.stem(w)  # stem of the current word
    print('Stemming for ', w, ': ', rootWord)

# Lemmatization: tokenize a short sample string and print the WordNet lemma
# of each token.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    # lemmatize() is called with the word only (no part-of-speech argument).
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))

# Algorithm to create a document representation by calculating TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Import the necessary libraries (done by the import statements above).

# Step 2: Initialize the documents.
documentA = 'Jupiter is the largest planet'
documentB = 'Mars is the fourth planet from the Sun'

# Step 3: Create a Bag of Words (BoW) for documents A and B by splitting
# each document on single spaces (word tokenization).
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

# Step 4: Create the collection of unique words from documents A and B.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# Count how many times each unique word occurs in each document; words that
# are absent from a document get a count of 0.
numOfWordsA = {term: bagOfWordsA.count(term) for term in uniqueWords}
numOfWordsB = {term: bagOfWordsB.count(term) for term in uniqueWords}

# Step 5: Compute the term frequency for each of our documents.
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; its length is the denominator.

    Returns:
        A dict mapping each key of ``wordDict`` to its count divided by the
        number of tokens in ``bagOfWords``. For an empty ``bagOfWords``
        every frequency is 0.0 instead of raising ``ZeroDivisionError``.
    """
    bagOfWordsCount = float(len(bagOfWords))
    if not bagOfWordsCount:
        # An empty document contains no terms, so every frequency is zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

# Term-frequency tables for documents A and B.
tfA, tfB = (
    computeTF(numOfWordsA, bagOfWordsA),
    computeTF(numOfWordsB, bagOfWordsB),
)

# Step 6: Compute the term Inverse Document Frequency.
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: List of dicts, one per document, mapping
            word -> occurrence count in that document.

    Returns:
        A dict mapping each word seen in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        Words that occur in no document map to 0. An empty ``documents``
        list yields an empty dict.
    """
    import math

    N = len(documents)
    # Build the vocabulary from every document, not just the first one, so a
    # word missing from documents[0] no longer raises KeyError and an empty
    # corpus no longer raises IndexError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    return {
        word: math.log(N / float(count)) if count > 0 else 0
        for word, count in docFrequency.items()
    }

# Inverse document frequencies over the word-count tables of both documents.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])

# Step 7: Compute the term TF/IDF for all words.
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must hold an
            entry for every word in ``tfBagOfWords``.

    Returns:
        A dict mapping each word of ``tfBagOfWords`` to the product of its
        term frequency and its inverse document frequency.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}

# TF-IDF weights for documents A and B.
tfidfA, tfidfB = computeTFIDF(tfA, idfs), computeTFIDF(tfB, idfs)

# Report the term frequencies as a table with one row per document.
print('----------------Term Frequency----------------------')
df = pd.DataFrame(data=[tfA, tfB])
print(df)

# Report the inverse document frequency of every word.
print('----------------Inverse Document Frequency----------------------')
print(idfs)

# Report the TF-IDF weights as a table with one row per document.
print('------------------- TF-IDF--------------------------------------')
df = pd.DataFrame(data=[tfidfA, tfidfB])
print(df)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tanuja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tanuja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tanuja\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tanuja\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.



 Sentence Tokenization: 
 ['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']

Word Tokeniztion: 
 ['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']

POS Tagging: 
 [('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('first', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('text', 'JJ'), ('analytics', 'NNS'), ('.', '.'), ('The', 'DT'), ('process', 'NN'), ('of', 'IN'), ('breaking', 'VBG'), ('down', 'RP'), ('a', 'DT'), ('text', 'NN'), ('paragraph', 'NN'), ('into', 'IN'), ('smaller', 'JJR'), ('chunks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('words', 'NNS'), ('or', 'CC'), ('sentences', 'NNS'), ('is', 'VBZ'), ('called', 'VBN'), ('Tokenization', 'NN'), ('.', '.')]
Tokenized