In [88]:
import pandas as pd
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

text = "Hello everyone. Welcome to DSBDA lab. We are studying text analytics."

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Tokenization

In [89]:
# Sentence Tokenization
from nltk.tokenize import sent_tokenize
token_sent = sent_tokenize(text)
token_sent

['Hello everyone.', 'Welcome to DSBDA lab.', 'We are studying text analytics.']

In [90]:
# Word Tokenization
from nltk.tokenize import word_tokenize
token_word = word_tokenize(text)
token_word

['Hello',
 'everyone',
 '.',
 'Welcome',
 'to',
 'DSBDA',
 'lab',
 '.',
 'We',
 'are',
 'studying',
 'text',
 'analytics',
 '.']

# POS Tagging

In [91]:
nltk.pos_tag(token_word)

[('Hello', 'NNP'),
 ('everyone', 'NN'),
 ('.', '.'),
 ('Welcome', 'NNP'),
 ('to', 'TO'),
 ('DSBDA', 'NNP'),
 ('lab', 'NN'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('studying', 'VBG'),
 ('text', 'IN'),
 ('analytics', 'NNS'),
 ('.', '.')]

# Stop words removal

In [92]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [93]:
filtered_sent = []
for i in token_word:
    if i.lower() not in stopwords:
        filtered_sent.append(i)
        
print("Tokenize", token_word)
print("Filtered", filtered_sent)

Tokenize ['Hello', 'everyone', '.', 'Welcome', 'to', 'DSBDA', 'lab', '.', 'We', 'are', 'studying', 'text', 'analytics', '.']
Filtered ['Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']


# Stemming

In [94]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = []
for i in filtered_sent:
    stemmed.append(ps.stem(i))
print("Filtered", filtered_sent)
print("Stemmed", stemmed)

Filtered ['Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']
Stemmed ['hello', 'everyon', '.', 'welcom', 'dsbda', 'lab', '.', 'studi', 'text', 'analyt', '.']


# Lemmatization

In [95]:
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
lemmatize = []
for i in filtered_sent:
    lemmatize.append(lem.lemmatize(i))
print("Filtered", filtered_sent)
print("Lemmatize", lemmatize)

Filtered ['Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']
Lemmatize ['Hello', 'everyone', '.', 'Welcome', 'DSBDA', 'lab', '.', 'studying', 'text', 'analytics', '.']


# Term Frequency and Inverse Document Frequency

In [96]:
text1 = "Data Science is the hardest job of the 21st century"
text2 = "machine learning is the key for data science"

token_word1 = text1.split(" ")
token_word2 = text2.split(" ")

total= set(token_word1).union(set(token_word2))
print(total)

{'data', 'job', '21st', 'Data', 'key', 'of', 'learning', 'hardest', 'Science', 'the', 'machine', 'for', 'century', 'is', 'science'}


In [97]:
dict1 = dict.fromkeys(total, 0)
dict2 = dict.fromkeys(total, 0)

for i in token_word1:
    dict1[i]+=1

for i in token_word2:
    dict2[i]+=1

print(dict1)
print(dict2)

{'data': 0, 'job': 1, '21st': 1, 'Data': 1, 'key': 0, 'of': 1, 'learning': 0, 'hardest': 1, 'Science': 1, 'the': 2, 'machine': 0, 'for': 0, 'century': 1, 'is': 1, 'science': 0}
{'data': 1, 'job': 0, '21st': 0, 'Data': 0, 'key': 1, 'of': 0, 'learning': 1, 'hardest': 0, 'Science': 0, 'the': 1, 'machine': 1, 'for': 1, 'century': 0, 'is': 1, 'science': 1}


In [98]:
df = pd.DataFrame([dict1, dict2])
df

Unnamed: 0,data,job,21st,Data,key,of,learning,hardest,Science,the,machine,for,century,is,science
0,0,1,1,1,0,1,0,1,1,2,0,0,1,1,0
1,1,0,0,0,1,0,1,0,0,1,1,1,0,1,1


In [99]:
def tf(tokenFreq, tokens):
    tfDict = {}
    size = len(tokens)
    for word, count in tokenFreq.items():
        tfDict[word] = count/float(size)
    return tfDict

tf1 = tf(dict1, token_word1)
tf2 = tf(dict2, token_word2)

print(tf1)
print(tf2)

{'data': 0.0, 'job': 0.1, '21st': 0.1, 'Data': 0.1, 'key': 0.0, 'of': 0.1, 'learning': 0.0, 'hardest': 0.1, 'Science': 0.1, 'the': 0.2, 'machine': 0.0, 'for': 0.0, 'century': 0.1, 'is': 0.1, 'science': 0.0}
{'data': 0.125, 'job': 0.0, '21st': 0.0, 'Data': 0.0, 'key': 0.125, 'of': 0.0, 'learning': 0.125, 'hardest': 0.0, 'Science': 0.0, 'the': 0.125, 'machine': 0.125, 'for': 0.125, 'century': 0.0, 'is': 0.125, 'science': 0.125}


In [100]:
tfdf = pd.DataFrame([tf1, tf2])
tfdf

Unnamed: 0,data,job,21st,Data,key,of,learning,hardest,Science,the,machine,for,century,is,science
0,0.0,0.1,0.1,0.1,0.0,0.1,0.0,0.1,0.1,0.2,0.0,0.0,0.1,0.1,0.0
1,0.125,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.125,0.125,0.125,0.0,0.125,0.125
