# Problem Statement

1. Extract a sample document and apply the following document preprocessing methods:
    
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.

2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

# Download the required packages

In [2]:
import nltk
# Fetch the NLTK resources used by the cells below (a no-op if already present).
nltk.download('punkt')                        # tokenizer models for sent_tokenize / word_tokenize
nltk.download('stopwords')                    # stop-word corpus
nltk.download('wordnet')                      # WordNet data, used by the lemmatization cell
nltk.download('averaged_perceptron_tagger')   # tagger model for nltk.pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OM\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\OM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

# Initialize the text

In [4]:
# Sample document used for the tokenization examples.
text="Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

# Sentence tokenization
# Split the document into a list of sentence strings.
from nltk.tokenize import sent_tokenize
tokenized_text=sent_tokenize(text)
tokenized_text

['Tokenization is the first step in text analytics.',
 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']

# Perform Tokenization

In [5]:
# word tokenization
# Split the document into word and punctuation tokens (punctuation such as '.'
# is kept as its own token, as the output below shows).

from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
tokenized_word

['Tokenization',
 'is',
 'the',
 'first',
 'step',
 'in',
 'text',
 'analytics',
 '.',
 'The',
 'process',
 'of',
 'breaking',
 'down',
 'a',
 'text',
 'paragraph',
 'into',
 'smaller',
 'chunks',
 'such',
 'as',
 'words',
 'or',
 'sentences',
 'is',
 'called',
 'Tokenization',
 '.']

# Removing Punctuation and Stop Words

In [6]:
from nltk.corpus import stopwords
# Load the English stop-word list into a set for membership tests in the filter cell.
stop_words=set(stopwords.words("english"))

In [7]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
# New sample sentence for the stop-word removal example (rebinds `text`).
text= "How to remove stopwords with nltk library in Python?"
import re
# Replace every character that is not an ASCII letter (digits and punctuation
# included) with a single space.
text=re.sub('[^a-zA-Z]',' ',text)

In [11]:
text

'How to remove stopwords with nltk library in Python '

In [12]:
# Lowercase before tokenizing so tokens compare equal to the lowercase
# entries of the stop-word set.
tokens=word_tokenize(text.lower())
tokens

['how', 'to', 'remove', 'stopwords', 'with', 'nltk', 'library', 'in', 'python']

In [15]:
# Keep, in their original order, only the tokens that are not stop words.
filtered_text = [token for token in tokens if token not in stop_words]

In [16]:
filtered_text

['remove', 'stopwords', 'nltk', 'library', 'python']

# Stemming

In [18]:
from nltk.stem import PorterStemmer
# Inflected forms of the same verb, used to demonstrate stemming.
e_words=['wait','waiting','waited','waits']
ps=PorterStemmer()
# Print the Porter stem of each word, one per line.
for w in e_words:
    rootwords=ps.stem(w)
    print(rootwords)

wait
wait
wait
wait


# Lemmatization

In [20]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OM\AppData\Roaming\nltk_data...


True

In [21]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Sample words (rebinds `text`), tokenized and lemmatized one by one.
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
# NOTE(review): lemmatize() is called without a part-of-speech argument;
# presumably it then treats each word as a noun, which would explain why
# 'studying' is returned unchanged below -- confirm against the NLTK docs.
for w in tokenization:
    print("Lemma for {} is {}".format(w,wordnet_lemmatizer.lemmatize(w)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


# Part of Speech (POS)

In [23]:
import nltk
from nltk.tokenize import word_tokenize
# Sample sentence for part-of-speech tagging.
data="The pink sweater fit her perfectly"
words=word_tokenize(data)
# NOTE(review): each word is passed to pos_tag in its own one-element list, so
# the tagger sees no sentence context; that is presumably why 'pink' and 'fit'
# are tagged NN in the output below. Tagging the whole token list in one call,
# nltk.pos_tag(words), would likely give context-aware tags -- confirm before
# changing, since the recorded output would need to be regenerated.
for word in words:
    print(nltk.pos_tag([word]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


# Part 2: 

Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

# Import the necessary libraries.

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

  from pandas.core import (


# Initialize the Documents.

In [25]:
# The two-document corpus used for the TF-IDF walkthrough.
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

# Create BagofWords (BoW) for Document A and B.

In [26]:
# Split each document on single spaces into a list of words.
# NOTE(review): no case folding is applied, so 'Planet' (document A) and
# 'planet' (document B) are counted as different words in everything that
# follows -- confirm this is intended.
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

# Create Collection of Unique words from Document A and B.


In [27]:
# Vocabulary: the set of distinct words appearing in either document.
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [28]:
uniqueWords

{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

# Create a dictionary of words and their occurrence for each document in the corpus


In [32]:
# Start every vocabulary word at zero, then tally the words of document A.
numOfWordsA = {vocabWord: 0 for vocabWord in uniqueWords}
for word in bagOfWordsA:
    numOfWordsA[word] = numOfWordsA[word] + 1
numOfWordsA

{'Sun': 0,
 'Jupiter': 1,
 'planet': 0,
 'largest': 1,
 'Planet': 1,
 'fourth': 0,
 'is': 1,
 'Mars': 0,
 'the': 1,
 'from': 0}

In [33]:
# Start every vocabulary word at zero, then tally the words of document B.
numOfWordsB = {vocabWord: 0 for vocabWord in uniqueWords}
for word in bagOfWordsB:
    numOfWordsB[word] = numOfWordsB[word] + 1
numOfWordsB

{'Sun': 1,
 'Jupiter': 0,
 'planet': 1,
 'largest': 0,
 'Planet': 0,
 'fourth': 1,
 'is': 1,
 'Mars': 1,
 'the': 2,
 'from': 1}

# Compute the term frequency for each of our documents.

In [34]:
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Term frequency is the word's occurrence count divided by the total
    number of words in the document.

    Args:
        wordDict: Mapping of word -> occurrence count in the document.
        bagOfWords: The document's words as a list; only its length is used.

    Returns:
        A new dict mapping each word of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty document every frequency is ``0.0``.
    """
    bagOfWordsCount = float(len(bagOfWords))
    if bagOfWordsCount == 0:
        # An empty document contains no terms; report zero frequencies
        # instead of failing with ZeroDivisionError.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

In [35]:
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [36]:
tfA

{'Sun': 0.0,
 'Jupiter': 0.2,
 'planet': 0.0,
 'largest': 0.2,
 'Planet': 0.2,
 'fourth': 0.0,
 'is': 0.2,
 'Mars': 0.0,
 'the': 0.2,
 'from': 0.0}

In [37]:
tfB

{'Sun': 0.125,
 'Jupiter': 0.0,
 'planet': 0.125,
 'largest': 0.0,
 'Planet': 0.0,
 'fourth': 0.125,
 'is': 0.125,
 'Mars': 0.125,
 'the': 0.25,
 'from': 0.125}

# Compute the term Inverse Document Frequency.


In [38]:
def computeIDF(documents):
    """Return the inverse document frequency of every word in the corpus.

    The IDF of a word is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents in which the word's
    count is greater than zero.

    Args:
        documents: List of dicts, one per document, mapping word -> count.

    Returns:
        A dict mapping every word seen in any document to its IDF. A word
        that occurs in no document (all counts zero) gets ``0.0``. An empty
        ``documents`` list yields an empty dict.
    """
    import math

    N = len(documents)
    if N == 0:
        # No corpus, hence no vocabulary (previously an IndexError).
        return {}

    # Document frequency per word. Keys are collected from every document,
    # not just the first, so a word missing from documents[0] no longer
    # raises KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, freq in docFreq.items():
        # A word present in no document would divide by zero; its TF is zero
        # everywhere, so a zero IDF leaves every TF-IDF product unchanged.
        idfDict[word] = math.log(N / float(freq)) if freq > 0 else 0.0
    return idfDict

In [39]:
# IDF of every vocabulary word across the two-document corpus; words present
# in both documents ('is', 'the') come out as 0.0 in the output below.
idfs = computeIDF([numOfWordsA, numOfWordsB])
idfs

{'Sun': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'is': 0.0,
 'Mars': 0.6931471805599453,
 'the': 0.0,
 'from': 0.6931471805599453}

# Compute the term TF/IDF for all words.


In [40]:
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every word of ``tfBagOfWords``.

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.

    Raises:
        KeyError: If a word of ``tfBagOfWords`` is missing from ``idfs``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [41]:
# TF-IDF scores of each document, using the corpus-wide IDF weights.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

In [42]:
tfidfA

{'Sun': 0.0,
 'Jupiter': 0.13862943611198905,
 'planet': 0.0,
 'largest': 0.13862943611198905,
 'Planet': 0.13862943611198905,
 'fourth': 0.0,
 'is': 0.0,
 'Mars': 0.0,
 'the': 0.0,
 'from': 0.0}

In [43]:
tfidfB

{'Sun': 0.08664339756999316,
 'Jupiter': 0.0,
 'planet': 0.08664339756999316,
 'largest': 0.0,
 'Planet': 0.0,
 'fourth': 0.08664339756999316,
 'is': 0.0,
 'Mars': 0.08664339756999316,
 'the': 0.0,
 'from': 0.08664339756999316}

# Converting into the dataframe

In [44]:
# Tabulate the scores: one row per document (0 = A, 1 = B), one column per word.
df = pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,Sun,Jupiter,planet,largest,Planet,fourth,is,Mars,the,from
0,0.0,0.138629,0.0,0.138629,0.138629,0.0,0.0,0.0,0.0,0.0
1,0.086643,0.0,0.086643,0.0,0.0,0.086643,0.0,0.086643,0.0,0.086643
