## Text Analysis

In [30]:
# import the necessary libraries
import nltk

text1 = "Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!"
text2 = "They wandered into a strange Tiki bar on the edge of the small beach town. Every manager should be able to recite at least ten nursery rhymes backward. Find bar near beach"
print("Text1: ", text1)
print("Text2: ", text2)

Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
Text2:  They wandered into a strange Tiki bar on the edge of the small beach town. Every manager should be able to recite at least ten nursery rhymes backward. Find bar near beach


In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/TE/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
# Tokenization

# 1. Word Tokenization
from nltk.tokenize import word_tokenize
print("Text1: ", text1)
print("Word Tokenization of Text1: ", word_tokenize(text1))
print("Text2: ", text2)
print("Word Tokenization of Text2: ", word_tokenize(text2))

Word Tokenization of Text1:  ['Hey', ',', 'did', 'you', 'know', 'that', 'the', 'summer', 'break', 'is', 'coming', '?', 'Amazing', 'right', '!', '!', 'It', '’', 's', 'only', '5', 'more', 'days', '!', '!']
Word Tokenization of Text2:  ['They', 'wandered', 'into', 'a', 'strange', 'Tiki', 'bar', 'on', 'the', 'edge', 'of', 'the', 'small', 'beach', 'town', '.', 'Every', 'manager', 'should', 'be', 'able', 'to', 'recite', 'at', 'least', 'ten', 'nursery', 'rhymes', 'backward', '.', 'Find', 'bar', 'near', 'beach']


In [33]:
# 2. Sentence Tokenization
from nltk.tokenize import sent_tokenize
print("Text1: ", text1)
print("Word Tokenization of Text1: ", sent_tokenize(text1))
print("Text2: ", text2)
print("Word Tokenization of Text2: ", sent_tokenize(text2))

Word Tokenization of Text1:  ['Hey, did you know that the summer break is coming?', 'Amazing right!!', 'It’s only 5 more days!', '!']
Word Tokenization of Text2:  ['They wandered into a strange Tiki bar on the edge of the small beach town.', 'Every manager should be able to recite at least ten nursery rhymes backward.', 'Find bar near beach']


In [12]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/TE/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/TE/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
# POS Tagging
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def pos_tagging(text):
    tokenized = sent_tokenize(text)
    for i in tokenized:
        # Word tokenizers is used to find the words
        # and punctuation in a string
        wordsList = nltk.word_tokenize(i)
    
        # removing stop words from wordList
        wordsList = [w for w in wordsList if not w in stop_words]
    
        #  Using a Tagger. Which is part-of-speech
        # tagger or POS-tagger.
        tagged = nltk.pos_tag(wordsList)
    
        return tagged
    
print("Text1: ", text1)
print("POS Tagging of Text1: ", pos_tagging(text1))
print("Text2: ", text2)
print("POS Tagging of Text2: ", pos_tagging(text2))

POS Tagging of Text1:  [('Hey', 'NNP'), (',', ','), ('know', 'VBP'), ('summer', 'NN'), ('break', 'NN'), ('coming', 'VBG'), ('?', '.')]
POS Tagging of Text2:  [('They', 'PRP'), ('wandered', 'VBD'), ('strange', 'JJ'), ('Tiki', 'NNP'), ('bar', 'NN'), ('edge', 'NN'), ('small', 'JJ'), ('beach', 'NN'), ('town', 'NN'), ('.', '.')]


In [46]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
  
word_tokens = word_tokenize(text1)

# With LowerCase Conversion
# converts the words in word_tokens to lower case and then checks whether they are present in stop_words or not
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
print("Text1: ", text1)
stop_words = set(stopwords.words('english'))
  
word_tokens = word_tokenize(text1)
print(word_tokens)
print(filtered_sentence)
  
# With no lower case conversion
  
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print("Text1: ", text1)
print(word_tokens)
print(filtered_sentence)

Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
['Hey', ',', 'did', 'you', 'know', 'that', 'the', 'summer', 'break', 'is', 'coming', '?', 'Amazing', 'right', '!', '!', 'It', '’', 's', 'only', '5', 'more', 'days', '!', '!']
['Hey', ',', 'know', 'summer', 'break', 'coming', '?', 'Amazing', 'right', '!', '!', '’', '5', 'days', '!', '!']
Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
['Hey', ',', 'did', 'you', 'know', 'that', 'the', 'summer', 'break', 'is', 'coming', '?', 'Amazing', 'right', '!', '!', 'It', '’', 's', 'only', '5', 'more', 'days', '!', '!']
['Hey', ',', 'know', 'summer', 'break', 'coming', '?', 'Amazing', 'right', '!', '!', 'It', '’', '5', 'days', '!', '!']


In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/TE/nltk_data...


True

In [37]:
# Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
ps = PorterStemmer()

def stemming(text):
    words = word_tokenize(text)
    
    for w in words:
        print(w, " : ", ps.stem(w))

print("Text1: ", text1)
print("Stemming of Text1: ", stemming(text1))
print("Text2: ", text2)
print("Stemming of Text2: ", stemming(text2))

Hey  :  hey
,  :  ,
did  :  did
you  :  you
know  :  know
that  :  that
the  :  the
summer  :  summer
break  :  break
is  :  is
coming  :  come
?  :  ?
Amazing  :  amaz
right  :  right
!  :  !
!  :  !
It  :  it
’  :  ’
s  :  s
only  :  onli
5  :  5
more  :  more
days  :  day
!  :  !
!  :  !
Stemming of Text1:  None
They  :  they
wandered  :  wander
into  :  into
a  :  a
strange  :  strang
Tiki  :  tiki
bar  :  bar
on  :  on
the  :  the
edge  :  edg
of  :  of
the  :  the
small  :  small
beach  :  beach
town  :  town
.  :  .
Every  :  everi
manager  :  manag
should  :  should
be  :  be
able  :  abl
to  :  to
recite  :  recit
at  :  at
least  :  least
ten  :  ten
nursery  :  nurseri
rhymes  :  rhyme
backward  :  backward
.  :  .
Find  :  find
bar  :  bar
near  :  near
beach  :  beach
Stemming of Text2:  None


In [47]:
# Lemmatization.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatization(text):
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

print("Text1: ", text1)
print("Lemmatization of Text1: ", lemmatization(text1))
print("Text2: ", text2)
print("Lemmatization of Text2: ", lemmatization(text2))

Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
Lemmatization of Text1:  ['Hey', ',', 'do', 'you', 'know', 'that', 'the', 'summer', 'break', 'be', 'come', '?', 'Amazing', 'right', '!', '!', 'It', '’', 's', 'only', '5', 'more', 'days', '!', '!']
Text2:  They wandered into a strange Tiki bar on the edge of the small beach town. Every manager should be able to recite at least ten nursery rhymes backward. Find bar near beach
Lemmatization of Text2:  ['They', 'wander', 'into', 'a', 'strange', 'Tiki', 'bar', 'on', 'the', 'edge', 'of', 'the', 'small', 'beach', 'town', '.', 'Every', 'manager', 'should', 'be', 'able', 'to', 'recite', 'at', 'least', 'ten', 'nursery', 'rhyme', 'backward', '.', 'Find', 'bar', 'near', 'beach']


In [43]:
# Term Frequency
stop_words = set(stopwords.words('english'))
word_tokens1 = word_tokenize(text1)
filtered1 = []
for w in word_tokens1:
    if w not in stop_words:
        filtered1.append(w)
word_tokens2 = word_tokenize(text2)
filtered2 = []
for w in word_tokens2:
    if w not in stop_words:
        filtered2.append(w)
term_frequency1 = {}
term_frequency2 = {}
for i in filtered1:
    term_frequency1.update({i:(filtered1.count(i))/len(filtered1)})
for i in filtered2:
    term_frequency2.update({i:(filtered2.count(i))/len(filtered2)})

print("Text1: ", text1)
print("Term Frequency for text 1")
print(term_frequency1)
print("\nText2: ", text2)
print("Term Frequency for text 2")
print(term_frequency2)

Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
Term Frequency for text 1
{'Hey': 0.058823529411764705, ',': 0.058823529411764705, 'know': 0.058823529411764705, 'summer': 0.058823529411764705, 'break': 0.058823529411764705, 'coming': 0.058823529411764705, '?': 0.058823529411764705, 'Amazing': 0.058823529411764705, 'right': 0.058823529411764705, '!': 0.23529411764705882, 'It': 0.058823529411764705, '’': 0.058823529411764705, '5': 0.058823529411764705, 'days': 0.058823529411764705}

Text2:  They wandered into a strange Tiki bar on the edge of the small beach town. Every manager should be able to recite at least ten nursery rhymes backward. Find bar near beach
Term Frequency for text 2
{'They': 0.041666666666666664, 'wandered': 0.041666666666666664, 'strange': 0.041666666666666664, 'Tiki': 0.041666666666666664, 'bar': 0.08333333333333333, 'edge': 0.041666666666666664, 'small': 0.041666666666666664, 'beach': 0.08333333333333333, 'town': 0.

In [45]:
# Inverse Document Frequency
import math
ifd1 = {}
ifd2 = {}
for i in filtered1:
    cnt = 0
    if filtered1.count(i) > 0:
        cnt += 1
    if filtered2.count(i) > 0:
        cnt += 1
    f = math.log(2/cnt)
    ifd1.update({i:f})
    
for i in filtered2:
    cnt = 0
    if filtered1.count(i) > 0:
        cnt += 1
    if filtered2.count(i) > 0:
        cnt += 1
    f = math.log(2/cnt)
    ifd2.update({i:f})

print("Text1: ", text1)
print("Inverse Document frequency of Text1")
print(ifd1)
print("\nText2: ", text2)
print("\nInverse Document frequency of Text2")
print(ifd2)

Text1:  Hey, did you know that the summer break is coming? Amazing right!! It’s only 5 more days!!
Inverse Document frequency of Document1
{'Hey': 0.6931471805599453, ',': 0.6931471805599453, 'know': 0.6931471805599453, 'summer': 0.6931471805599453, 'break': 0.6931471805599453, 'coming': 0.6931471805599453, '?': 0.6931471805599453, 'Amazing': 0.6931471805599453, 'right': 0.6931471805599453, '!': 0.6931471805599453, 'It': 0.6931471805599453, '’': 0.6931471805599453, '5': 0.6931471805599453, 'days': 0.6931471805599453}

Text2:  They wandered into a strange Tiki bar on the edge of the small beach town. Every manager should be able to recite at least ten nursery rhymes backward. Find bar near beach

Inverse Document frequency of Document2
{'They': 0.6931471805599453, 'wandered': 0.6931471805599453, 'strange': 0.6931471805599453, 'Tiki': 0.6931471805599453, 'bar': 0.6931471805599453, 'edge': 0.6931471805599453, 'small': 0.6931471805599453, 'beach': 0.6931471805599453, 'town': 0.693147180559