# 7. Text Analytics
1. Extract Sample document and apply following document preprocessing
methods: Tokenization, POS Tagging, stop words removal, Stemming and
Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse
Document Frequency.

In [14]:
import nltk
import re
import math

In [15]:
# Read the docs
# Remove all non-alphanumeric characters
# with a RegEx
with open( "doc_01" , "r" ) as file:
    doc = file.read()
doc = re.sub('[\W_]+', ' ', doc )

with open( "doc_02" , "r" ) as file:
    doc_2 = file.read()
doc_2 = re.sub('[\W_]+', ' ', doc_2 )

In [16]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 7.1. Document Preprocessing

### 7.1.1. Tokenization

In [17]:
"""
word tokenization
default tokenizer: Penn Treebank Tokenizer
Ref: https://docs.ropensci.org/tokenizers/reference/ptb-tokenizer.html
This tokenizer uses regular expressions to tokenize text similar to the tokenization used in the Penn Treebank. It assumes that text has already been split into sentences. 
The tokenizer does the following:
- splits common English contractions, e.g. don't is tokenized into do n't and they'll is tokenized into -> they 'll,
- handles punctuation characters as separate tokens,
- splits commas and single quotes off from words, when they are followed by whitespace,
- splits off periods that occur at the end of the sentence.
"""
word_tokens = nltk.word_tokenize( doc )
print( word_tokens )

['Between', '2016', 'and', '2019', 'the', 'state', 'forest', 'department', 'under', 'theÂ', 'BJPÂ', 'government', 'had', 'launched', 'â', 'Green', 'Maharashtraâ', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'trees', 'across', 'the', 'state', 'in', 'the', 'four', 'year', 'period', 'In', 'October', '2019', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'trees', 'in', 'July', 'September', '2019', 'Â', 'The', 'Indian', 'ExpressÂ', 'had', 'found', 'that', 'non', 'forest', 'agencies', 'â', 'such', 'as', 'gram', 'panchayats', 'â', 'which', 'were', 'tasked', 'with', 'planting', 'trees', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio', 'visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drives', 'on', 'the', 'specially', 'created', 'portal', 'In', 'Pune', 'Revenue', 'Division', 'it', 'was', 'claimed', 'the', 'gram', 'panchayats', 'planted', '1', '7', 'crore', 'saplings', 'however', 'no', 'evidence', 'wa

In [18]:
"""
sentence tokenization
default tokenizer: Punkt tokenizer
Ref: Unsupervised Multilingual Sentence Boundary Detection (Kiss and Strunk (2005)
"""
sent_tokens = nltk.sent_tokenize( doc )
print( sent_tokens )

['Between 2016 and 2019 the state forest department under theÂ BJPÂ government had launched â Green Maharashtraâ drive with an aim to plant 50 crore trees across the state in the four year period In October 2019 the government had claimed it had surpassed the target by planting 33 crore trees in July September 2019 Â The Indian ExpressÂ had found that non forest agencies â such as gram panchayats â which were tasked with planting trees had not uploaded the mandatory audio visual proof of the tree plantation drives on the specially created portal In Pune Revenue Division it was claimed the gram panchayats planted 1 7 crore saplings however no evidence was uploaded for 87 per cent 1 49 crore saplings Also out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings This year the targets set by the forest department were comparatively modest For example Pune Circle â which comprises three divisions in Pune and Solapur district

### 7.1.2. Stop word removal


In [19]:
stop_words = set(nltk.corpus.stopwords.words('english'))
word_tokens = [ token for token in word_tokens if token not in stop_words ]
print( word_tokens )

['Between', '2016', '2019', 'state', 'forest', 'department', 'theÂ', 'BJPÂ', 'government', 'launched', 'â', 'Green', 'Maharashtraâ', 'drive', 'aim', 'plant', '50', 'crore', 'trees', 'across', 'state', 'four', 'year', 'period', 'In', 'October', '2019', 'government', 'claimed', 'surpassed', 'target', 'planting', '33', 'crore', 'trees', 'July', 'September', '2019', 'Â', 'The', 'Indian', 'ExpressÂ', 'found', 'non', 'forest', 'agencies', 'â', 'gram', 'panchayats', 'â', 'tasked', 'planting', 'trees', 'uploaded', 'mandatory', 'audio', 'visual', 'proof', 'tree', 'plantation', 'drives', 'specially', 'created', 'portal', 'In', 'Pune', 'Revenue', 'Division', 'claimed', 'gram', 'panchayats', 'planted', '1', '7', 'crore', 'saplings', 'however', 'evidence', 'uploaded', '87', 'per', 'cent', '1', '49', 'crore', 'saplings', 'Also', '59', 'government', 'agencies', 'involved', 'drive', 'many', '38', 'submitted', 'survival', 'reports', 'saplings', 'This', 'year', 'targets', 'set', 'forest', 'department', 

### 7.1.3. Parts of Speech Tagging

In [20]:
# default pos tagger: Perceptron Tagger
# Ref: https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
tags = nltk.pos_tag( word_tokens )
print( tags )

[('Between', 'IN'), ('2016', 'CD'), ('2019', 'CD'), ('state', 'NN'), ('forest', 'JJS'), ('department', 'NN'), ('theÂ', 'NN'), ('BJPÂ', 'NNP'), ('government', 'NN'), ('launched', 'VBD'), ('â', 'NNP'), ('Green', 'NNP'), ('Maharashtraâ', 'NNP'), ('drive', 'NN'), ('aim', 'NN'), ('plant', 'NN'), ('50', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('across', 'IN'), ('state', 'NN'), ('four', 'CD'), ('year', 'NN'), ('period', 'NN'), ('In', 'IN'), ('October', 'NNP'), ('2019', 'CD'), ('government', 'NN'), ('claimed', 'VBD'), ('surpassed', 'JJ'), ('target', 'NN'), ('planting', 'VBG'), ('33', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('July', 'NNP'), ('September', 'NNP'), ('2019', 'CD'), ('Â', 'VBD'), ('The', 'DT'), ('Indian', 'JJ'), ('ExpressÂ', 'NNP'), ('found', 'VBD'), ('non', 'RB'), ('forest', 'JJ'), ('agencies', 'NNS'), ('â', 'VBP'), ('gram', 'JJ'), ('panchayats', 'NNS'), ('â', 'VBP'), ('tasked', 'VBN'), ('planting', 'NN'), ('trees', 'NNS'), ('uploaded', 'VBD'), ('mandatory', 'JJ'), ('audio', 'J

### 7.1.4. Lemmatization

In [21]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [ lemmatizer.lemmatize( token ) for token in word_tokens ]
print( lemmatized_tokens )

['Between', '2016', '2019', 'state', 'forest', 'department', 'theÂ', 'BJPÂ', 'government', 'launched', 'â', 'Green', 'Maharashtraâ', 'drive', 'aim', 'plant', '50', 'crore', 'tree', 'across', 'state', 'four', 'year', 'period', 'In', 'October', '2019', 'government', 'claimed', 'surpassed', 'target', 'planting', '33', 'crore', 'tree', 'July', 'September', '2019', 'Â', 'The', 'Indian', 'ExpressÂ', 'found', 'non', 'forest', 'agency', 'â', 'gram', 'panchayat', 'â', 'tasked', 'planting', 'tree', 'uploaded', 'mandatory', 'audio', 'visual', 'proof', 'tree', 'plantation', 'drive', 'specially', 'created', 'portal', 'In', 'Pune', 'Revenue', 'Division', 'claimed', 'gram', 'panchayat', 'planted', '1', '7', 'crore', 'sapling', 'however', 'evidence', 'uploaded', '87', 'per', 'cent', '1', '49', 'crore', 'sapling', 'Also', '59', 'government', 'agency', 'involved', 'drive', 'many', '38', 'submitted', 'survival', 'report', 'sapling', 'This', 'year', 'target', 'set', 'forest', 'department', 'comparatively'

### 7.1.5. Stemming

In [22]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_tokens = [ stemmer.stem(token) for token in word_tokens ]
print( stemmed_tokens )

['between', '2016', '2019', 'state', 'forest', 'depart', 'theâ', 'bjpâ', 'govern', 'launch', 'â', 'green', 'maharashtraâ', 'drive', 'aim', 'plant', '50', 'crore', 'tree', 'across', 'state', 'four', 'year', 'period', 'in', 'octob', '2019', 'govern', 'claim', 'surpass', 'target', 'plant', '33', 'crore', 'tree', 'juli', 'septemb', '2019', 'â', 'the', 'indian', 'expressâ', 'found', 'non', 'forest', 'agenc', 'â', 'gram', 'panchayat', 'â', 'task', 'plant', 'tree', 'upload', 'mandatori', 'audio', 'visual', 'proof', 'tree', 'plantat', 'drive', 'special', 'creat', 'portal', 'in', 'pune', 'revenu', 'divis', 'claim', 'gram', 'panchayat', 'plant', '1', '7', 'crore', 'sapl', 'howev', 'evid', 'upload', '87', 'per', 'cent', '1', '49', 'crore', 'sapl', 'also', '59', 'govern', 'agenc', 'involv', 'drive', 'mani', '38', 'submit', 'surviv', 'report', 'sapl', 'thi', 'year', 'target', 'set', 'forest', 'depart', 'compar', 'modest', 'for', 'exampl', 'pune', 'circl', 'â', 'compris', 'three', 'divis', 'pune', '

## 7.2. TF and IDF

In [23]:
import numpy as np

# Returns a map containing term-frequencies of each token
# present in `doc`
# tf( token ) = freq( token ) / num_tokens_in_doc
def calc_term_freq(doc):
    word_tokens = nltk.word_tokenize( doc )
    num_tokens = len( word_tokens )
    unique_tokens , freqs = np.unique( word_tokens , return_counts=True )
    # print(unique_tokens)
    # print(len(unique_tokens))
    # print(freqs)
    # print(sum(freqs))
    tf = {}
    for token , freq in zip( unique_tokens , freqs ):
        tf[ token ] = freq / num_tokens
    return tf

tf = calc_term_freq( doc )
tf_2 = calc_term_freq( doc_2 )

In [24]:
import math

doc_1_tokens = nltk.word_tokenize( doc )
doc_2_tokens = nltk.word_tokenize( doc_2 )

# Calculate inverse-document frequency
# IDF( token ) = log( N / (num_docs_where_token_occurs) )
def calc_idf():
    N = 2
    all_tokens = doc_1_tokens + doc_2_tokens
    idf = {}
    for token in all_tokens:
        f = 1  # Theoretical equals 0, but to avoid log(infinity), we add 1
        if token in doc_1_tokens:
            f += 1
        if token in doc_2_tokens:
            f += 1
        idf[ token ] = math.log( N / f )
    return idf

idf = calc_idf()

In [25]:
# TFIDF( token ) = TF( token ) * IDF( token )
doc_1_repr = []
for token in doc_1_tokens:
    doc_1_repr.append( tf[ token ] * idf[token] )
doc_2_repr = []
for token in doc_2_tokens:
    doc_2_repr.append( tf_2[ token ] * idf[token] )

In [26]:
print( doc_1_repr )

[0.0, 0.0, -0.011603135049200256, -0.0025784744553778343, -0.025140125939933883, -0.0019338558415333755, -0.004512330296911209, 0.0, -0.0019338558415333755, 0.0, 0.0, -0.0025784744553778343, 0.0, 0.0, -0.007735423366133502, 0.0, 0.0, -0.0019338558415333755, 0.0, -0.0006446186138444586, 0.0, -0.008380041979977961, 0.0, 0.0, 0.0, -0.0032230930692222926, -0.0006446186138444586, -0.025140125939933883, -0.0019338558415333755, -0.007735423366133502, -0.025140125939933883, -0.0006446186138444586, 0.0, 0.0, 0.0, 0.0, -0.0025784744553778343, -0.025140125939933883, -0.0025784744553778343, 0.0, 0.0, 0.0, 0.0, 0.0, -0.025140125939933883, 0.0, -0.0012892372276889171, -0.0012892372276889171, 0.0, 0.0, -0.0032230930692222926, -0.007735423366133502, 0.0, 0.0, -0.0025784744553778343, -0.005801567524600128, -0.0025784744553778343, -0.0006446186138444586, 0.0, 0.0, 0.0, -0.0006446186138444586, 0.0, -0.004512330296911209, 0.0, -0.007735423366133502, 0.0, -0.0032230930692222926, 0.0, 0.0, -0.00773542336613

In [27]:
print( doc_2_repr )

[0.0, -0.01482117888646032, 0.0, -0.010586556347471656, 0.0, 0.0, -0.0010586556347471656, -0.010586556347471656, -0.0031759669042414965, 0.0, -0.005293278173735828, -0.0031759669042414965, -0.002117311269494331, 0.0, 0.0, 0.0, 0.0, 0.0, -0.008469245077977324, -0.0031759669042414965, -0.004234622538988662, -0.010586556347471656, -0.00741058944323016, -0.0010586556347471656, -0.0010586556347471656, -0.002117311269494331, -0.025407735233931972, 0.0, -0.002117311269494331, 0.0, 0.0, -0.005293278173735828, -0.00741058944323016, 0.0, -0.002117311269494331, 0.0, -0.0031759669042414965, 0.0, 0.0, -0.005293278173735828, -0.004234622538988662, 0.0, 0.0, -0.008469245077977324, 0.0, -0.010586556347471656, -0.00741058944323016, 0.0, -0.010586556347471656, 0.0, 0.0, 0.0, -0.008469245077977324, 0.0, 0.0, 0.0, 0.0, -0.0031759669042414965, -0.025407735233931972, -0.002117311269494331, -0.004234622538988662, -0.00741058944323016, -0.008469245077977324, -0.0031759669042414965, -0.004234622538988662, -0.0