Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of documents by calculating Term Frequency and Inverse
DocumentFrequency.

In [14]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Urvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Urvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Urvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Urvi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Urvi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
text = "I love to program in python. Python is great for data science. I enjoy learning new things about data."

In [17]:
sentence = sent_tokenize(text)
print(sentence)

['I love to program in python.', 'Python is great for data science.', 'I enjoy learning new things about data.']


In [18]:
word = word_tokenize(text)
print(word)

['I', 'love', 'to', 'program', 'in', 'python', '.', 'Python', 'is', 'great', 'for', 'data', 'science', '.', 'I', 'enjoy', 'learning', 'new', 'things', 'about', 'data', '.']


In [19]:
stopword = set(stopwords.words('english'))
print(stopword)

{"he'll", 'nor', "we'd", 'its', 'just', 'hers', 'does', 'hadn', 'until', 'm', 'you', 'd', 'my', 'who', 'below', 'how', 'off', 'it', "it'd", 'more', "i'm", 'ourselves', 'before', "aren't", "i'd", 'such', 'is', 'being', 'once', "needn't", 'did', 'as', 'his', "won't", 'has', 'himself', 'above', 'all', 'myself', 'the', 'having', 'under', 'what', "didn't", 'why', "they'd", 'weren', "shan't", 'and', 'so', 'that', 'their', 'than', 'out', 'her', 'those', "mightn't", "don't", "he'd", 't', "you'll", 'for', 'now', 'will', "wouldn't", 'can', 'him', 'yourself', 'doing', "you're", 'very', 'doesn', "hadn't", 'after', 'our', 'needn', 'which', 'o', 'not', 'this', 'am', 'up', 'from', "they're", 'itself', 'down', 'couldn', 'against', 'hasn', 'no', 'y', 'he', "isn't", 'about', 'are', "it'll", 'only', 'an', 'isn', "they've", "we're", 'most', 've', 'some', 'over', 'there', 'few', 'wouldn', 'she', 'yours', 'themselves', 'i', 'when', 'at', 'haven', 'again', 'during', "doesn't", 'in', "haven't", 'these', "hasn

In [20]:
text=re.sub('[^a-zA-Z]',' ',text)
text = word_tokenize(text.lower())
filtered_text = []
for w in text:
    if w not in stopword:
        filtered_text.append(w)
print("Original Text: ",text)
print("Filtered Text: ",filtered_text)

Original Text:  ['i', 'love', 'to', 'program', 'in', 'python', 'python', 'is', 'great', 'for', 'data', 'science', 'i', 'enjoy', 'learning', 'new', 'things', 'about', 'data']
Filtered Text:  ['love', 'program', 'python', 'python', 'great', 'data', 'science', 'enjoy', 'learning', 'new', 'things', 'data']


In [22]:
ps = PorterStemmer()
stem_word = ['wait','waited','waits','waiting']
for w in stem_word:
    print(w, ":", ps.stem(w))
print("Original Words: ",stem_word)


wait : wait
waited : wait
waits : wait
waiting : wait
Original Words:  ['wait', 'waited', 'waits', 'waiting']


In [34]:
lematizer = WordNetLemmatizer()
texts = "studing studies crying cries"
tokenizer = word_tokenize(texts)
for w in tokenizer:
    print("Lemma for {} is {}".format(w, lemmatizer.lemmatize(w)))

Lemma for studing is studing
Lemma for studies is study
Lemma for crying is cry
Lemma for cries is cry


In [40]:
data = "The pink sweater look beautiful on her"
token = word_tokenize(data)
for w in token:
    print(pos_tag([w]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('look', 'NN')]
[('beautiful', 'NN')]
[('on', 'IN')]
[('her', 'PRP$')]


In [48]:
textx = "Python is a coding language. It is used in data science and machine learning. I love coding in python language. I am gaining deep knowledge about this language."

In [56]:
wn = WordNetLemmatizer()
texts = sent_tokenize(textx)
corpus = []
for i in range(len(texts)):
    review = re.sub('[^a-zA-Z]', ' ', texts[i])
    review = review.lower()
    review = review.split()
    review = (wn.lemmatize(word, pos = 'v') for word in review if word not in set(stopwords.words('english')))
    review = ' '.join(review)
    corpus.append(review)
print(corpus)


['python cod language', 'use data science machine learn', 'love cod python language', 'gain deep knowledge language']


In [57]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
print(X)

[[0.61366674 0.         0.         0.         0.         0.49681612
  0.         0.         0.         0.61366674 0.         0.        ]
 [0.         0.4472136  0.         0.         0.         0.
  0.4472136  0.         0.4472136  0.         0.4472136  0.4472136 ]
 [0.4842629  0.         0.         0.         0.         0.39205255
  0.         0.61422608 0.         0.4842629  0.         0.        ]
 [0.         0.         0.5417361  0.5417361  0.5417361  0.34578314
  0.         0.         0.         0.         0.         0.        ]]
