In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
documentA = 'the man went out for a walk with another man'
documentB = 'the dog sat around the fire grill to see the dogs next house'

In [42]:
bagOfWordsA = documentA.split(' ')
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk', 'with', 'another', 'man']

In [43]:
bagOfWordsB = documentB.split(' ')
bagOfWordsB

['the',
 'dog',
 'sat',
 'around',
 'the',
 'fire',
 'grill',
 'to',
 'see',
 'the',
 'dogs',
 'next',
 'house']

In [44]:
set(bagOfWordsA)

{'a', 'another', 'for', 'man', 'out', 'the', 'walk', 'went', 'with'}

In [45]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'another',
 'around',
 'dog',
 'dogs',
 'fire',
 'for',
 'grill',
 'house',
 'man',
 'next',
 'out',
 'sat',
 'see',
 'the',
 'to',
 'walk',
 'went',
 'with'}

In [46]:
dict.fromkeys(uniqueWords, 0)  # Q. why the sequence order changes???

{'walk': 0,
 'a': 0,
 'another': 0,
 'sat': 0,
 'to': 0,
 'dogs': 0,
 'next': 0,
 'around': 0,
 'the': 0,
 'for': 0,
 'grill': 0,
 'man': 0,
 'dog': 0,
 'out': 0,
 'with': 0,
 'fire': 0,
 'went': 0,
 'see': 0,
 'house': 0}

In [47]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsA:
    numOfWordsA[word] += 1

numOfWordsA

{'walk': 1,
 'a': 1,
 'another': 1,
 'sat': 0,
 'to': 0,
 'dogs': 0,
 'next': 0,
 'around': 0,
 'the': 1,
 'for': 1,
 'grill': 0,
 'man': 2,
 'dog': 0,
 'out': 1,
 'with': 1,
 'fire': 0,
 'went': 1,
 'see': 0,
 'house': 0}

In [19]:
numOfWordsB = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsB:
    numOfWordsB[word] += 1

numOfWordsB

{'man': 0,
 'walk': 0,
 'grill': 1,
 'out': 0,
 'a': 0,
 'sat': 1,
 'fire': 1,
 'went': 0,
 'dogs': 1,
 'around': 1,
 'the': 2,
 'for': 0}

In [23]:
from nltk.corpus import stopwords

In [37]:
# stopwords.words('english')

In [48]:
numOfWordsA.items()

dict_items([('walk', 1), ('a', 1), ('another', 1), ('sat', 0), ('to', 0), ('dogs', 0), ('next', 0), ('around', 0), ('the', 1), ('for', 1), ('grill', 0), ('man', 2), ('dog', 0), ('out', 1), ('with', 1), ('fire', 0), ('went', 1), ('see', 0), ('house', 0)])

In [49]:
for word, count in numOfWordsA.items():
    print('word:', word, end=", ")
    print('count:', count)

word: walk, count: 1
word: a, count: 1
word: another, count: 1
word: sat, count: 0
word: to, count: 0
word: dogs, count: 0
word: next, count: 0
word: around, count: 0
word: the, count: 1
word: for, count: 1
word: grill, count: 0
word: man, count: 2
word: dog, count: 0
word: out, count: 1
word: with, count: 1
word: fire, count: 0
word: went, count: 1
word: see, count: 0
word: house, count: 0


In [50]:
# Term Frequency (TF): 
# The number of times a word appears in a document divded by the total number of words in the document.

def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [51]:
computeTF(numOfWordsA, bagOfWordsA)

{'walk': 0.1,
 'a': 0.1,
 'another': 0.1,
 'sat': 0.0,
 'to': 0.0,
 'dogs': 0.0,
 'next': 0.0,
 'around': 0.0,
 'the': 0.1,
 'for': 0.1,
 'grill': 0.0,
 'man': 0.2,
 'dog': 0.0,
 'out': 0.1,
 'with': 0.1,
 'fire': 0.0,
 'went': 0.1,
 'see': 0.0,
 'house': 0.0}

In [55]:
dict.fromkeys(numOfWordsA.keys(), 0)

{'walk': 0,
 'a': 0,
 'another': 0,
 'sat': 0,
 'to': 0,
 'dogs': 0,
 'next': 0,
 'around': 0,
 'the': 0,
 'for': 0,
 'grill': 0,
 'man': 0,
 'dog': 0,
 'out': 0,
 'with': 0,
 'fire': 0,
 'went': 0,
 'see': 0,
 'house': 0}

In [54]:
dict.fromkeys([numOfWordsA, numOfWordsB][0].keys(), 0)

{'walk': 0,
 'a': 0,
 'another': 0,
 'sat': 0,
 'to': 0,
 'dogs': 0,
 'next': 0,
 'around': 0,
 'the': 0,
 'for': 0,
 'grill': 0,
 'man': 0,
 'dog': 0,
 'out': 0,
 'with': 0,
 'fire': 0,
 'went': 0,
 'see': 0,
 'house': 0}

In [52]:
# Inverse Data Frequency (IDF) :
# The log of the number of documents divided by the number of documents that contain the word w

def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [56]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

ZeroDivisionError: float division by zero