# Text Analytics
  1. Extract Sample document and apply following document preprocessing methods: Tokenization, POS Tagging, 
     stop words removal, Stemming and Lemmatization.
  2. Create representation of document by calculating Term Frequency and Inverse Document
     Frequency.


In [63]:
import nltk
import pandas as pd
import math

In [9]:
from nltk.corpus import inaugural
corpus = "Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"
print(corpus)

Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard


# Task1:

## Tokenization 

In [11]:
from nltk.tokenize import word_tokenize,sent_tokenize

## Sentence Tokenization

In [19]:
tokenized_text = sent_tokenize(corpus)
print(tokenized_text)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


## Word Tokenization

In [20]:
tokenized_word = word_tokenize(corpus)
print(tokenized_word)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


## Stopwords Removal

In [22]:
from nltk.corpus import stopwords

In [25]:
stop_words=set(stopwords.words("english"))
print(stop_words)

{"needn't", 'o', 'them', 'before', 'too', "couldn't", 'both', 'those', 'didn', 'once', 'few', 'again', 'over', 'whom', 'he', 'further', 'myself', 'himself', 'out', 'if', 'between', 'no', 'some', "hasn't", 'itself', "weren't", 'themselves', 'shouldn', 'll', 'd', "should've", 'it', 'this', 'then', 'doing', 'just', 's', 'under', 'the', "shouldn't", 'hers', 'have', 'm', 'here', "don't", 'being', "hadn't", 'against', 'needn', 'does', 'by', 'couldn', "wasn't", 'isn', 'are', 'below', 'about', 'don', "won't", 'only', 'her', 'my', 'do', 'will', 'in', 'off', 'very', 'i', 'has', 'aren', 'haven', 'any', 'ourselves', 'at', 'theirs', "you're", 'nor', 'is', 'each', 'him', 'been', 'most', 're', 'how', 't', 'were', 'so', 'was', 'now', 'these', 'during', 'on', 'am', 'doesn', 'ma', 'me', "didn't", 'a', 'not', 'mustn', "shan't", 'that', 'you', 'weren', 'hasn', 'yours', 'of', 'did', 'than', "it's", 'there', 'to', 'wouldn', 'its', 'when', 'yourselves', 'or', 'as', 'for', 'but', 'above', 'from', "mustn't", '

In [33]:
filtered_sent = []
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized words:",tokenized_word)
print("Filterd Sentence:",filtered_sent)

Tokenized words: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']
Filterd Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


## Stemming

In [34]:
from nltk.stem import PorterStemmer

In [37]:
ps = PorterStemmer()
stemmed_words = []

for w in filtered_sent:
    stemmed_words.append(ps.stem(w))

print("Filtered Sentence:",filtered_sent)
print("Stemmed Sentence:",stemmed_words)

Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']
Stemmed Sentence: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']


## Lemmatization

In [38]:
from nltk.stem.wordnet import WordNetLemmatizer

In [43]:
lem = WordNetLemmatizer()
lemmatized_words = []

for w in filtered_sent:
    lemmatized_words.append(lem.lemmatize(w))

print("Filtered Sentence:",filtered_sent)
print("Stemmed Sentence:",lemmatized_words)

Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']
Stemmed Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [47]:
word = "flying"
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",ps.stem(word))

Lemmatized Word: fly
Stemmed Word: fli


## POS Tagging

In [50]:
nltk.pos_tag(tokenized_word)

[('Hello', 'NNP'),
 ('Mr.', 'NNP'),
 ('Smith', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('today', 'NN'),
 ('?', '.'),
 ('The', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('great', 'JJ'),
 (',', ','),
 ('and', 'CC'),
 ('city', 'NN'),
 ('is', 'VBZ'),
 ('awesome', 'JJ'),
 ('.', '.'),
 ('The', 'DT'),
 ('sky', 'NN'),
 ('is', 'VBZ'),
 ('pinkish-blue', 'JJ'),
 ('.', '.'),
 ('You', 'PRP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('eat', 'VB'),
 ('cardboard', 'NN')]

# Task2:

In [52]:
first_sentence = "Data Science is the hardest job of the 21st century"
second_sentence = "machine learning is the key for data science"

first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")

total= set(first_sentence).union(set(second_sentence))
print(total)

{'Data', 'century', '21st', 'science', 'key', 'of', 'learning', 'hardest', 'job', 'machine', 'the', 'Science', 'data', 'is', 'for'}


In [54]:
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)
for word in first_sentence:
    wordDictA[word]+=1
    
for word in second_sentence:
    wordDictB[word]+=1
    
print(wordDictA)
print(wordDictB)

{'Data': 1, 'century': 1, '21st': 1, 'science': 0, 'key': 0, 'of': 1, 'learning': 0, 'hardest': 1, 'job': 1, 'machine': 0, 'the': 2, 'Science': 1, 'data': 0, 'is': 1, 'for': 0}
{'Data': 0, 'century': 0, '21st': 0, 'science': 1, 'key': 1, 'of': 0, 'learning': 1, 'hardest': 0, 'job': 0, 'machine': 1, 'the': 1, 'Science': 0, 'data': 1, 'is': 1, 'for': 1}


In [57]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,Data,century,21st,science,key,of,learning,hardest,job,machine,the,Science,data,is,for
0,1,1,1,0,0,1,0,1,1,0,2,1,0,1,0
1,0,0,0,1,1,0,1,0,0,1,1,0,1,1,1


# TF (Term Frequency)

In [58]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

In [60]:
#running our sentences through the tf function:
tfFirst = computeTF(wordDictA, first_sentence)
tfSecond = computeTF(wordDictB, second_sentence)

#Converting to dataframe for visualization
tf = pd.DataFrame([tfFirst, tfSecond])
print(tf)

   Data  century  21st  science    key   of  learning  hardest  job  machine  \
0   0.1      0.1   0.1    0.000  0.000  0.1     0.000      0.1  0.1    0.000   
1   0.0      0.0   0.0    0.125  0.125  0.0     0.125      0.0  0.0    0.125   

     the  Science   data     is    for  
0  0.200      0.1  0.000  0.100  0.000  
1  0.125      0.0  0.125  0.125  0.125  


# IDF (Inverse Document Frequency)

In [78]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    
    for word, val in idfDict.items():
        cnt = 0
        for doc in docList:
            if(doc[word] != 0):
                cnt += 1
        idfDict[word] = cnt
#     print(idfDict)
        
        
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) ))
        
    return(idfDict)

In [79]:
idfs = computeIDF([wordDictA, wordDictB])
print(idfs)

{'Data': 1, 'century': 1, '21st': 1, 'science': 1, 'key': 1, 'of': 1, 'learning': 1, 'hardest': 1, 'job': 1, 'machine': 1, 'the': 2, 'Science': 1, 'data': 1, 'is': 2, 'for': 1}
{'Data': 0.3010299956639812, 'century': 0.3010299956639812, '21st': 0.3010299956639812, 'science': 0.3010299956639812, 'key': 0.3010299956639812, 'of': 0.3010299956639812, 'learning': 0.3010299956639812, 'hardest': 0.3010299956639812, 'job': 0.3010299956639812, 'machine': 0.3010299956639812, 'the': 0.0, 'Science': 0.3010299956639812, 'data': 0.3010299956639812, 'is': 0.0, 'for': 0.3010299956639812}
