In [1]:
import math

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Define the two example documents that make up the corpus.

documentA, documentB = (
    "Jupiter is the largest Planet",
    "Mars is the fourth planet from the Sun",
)

In [3]:
# Tokenize each document into its bag of words by splitting on single spaces.
# NOTE: tokens keep their original case, so 'Planet' and 'planet' are treated
# as two different words further down.

bagOfWordsA, bagOfWordsB = (text.split(' ') for text in (documentA, documentB))

In [4]:
# Build the vocabulary: the set of distinct words that occur in either document.
# A set has no defined iteration order, so anything built by iterating it
# (such as the count dictionaries below) may list the words in a different order
# from one interpreter session to the next.

uniqueWords = set(bagOfWordsA).union(bagOfWordsB)

In [5]:
# Create a dictionary of words and their occurrence counts for each document.
# Every vocabulary word gets an entry; a word absent from a document counts as 0.

numOfWordsA = {vocabWord: bagOfWordsA.count(vocabWord) for vocabWord in uniqueWords}
numOfWordsB = {vocabWord: bagOfWordsB.count(vocabWord) for vocabWord in uniqueWords}

In [6]:
# compute the term frequency for each document 

def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word in ``wordDict``.

    The term frequency of a word is its occurrence count divided by the
    total number of tokens in the document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: Sequence of the document's tokens; only its length is used.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``
        as a float. When ``bagOfWords`` is empty every frequency is ``0.0``.
    """
    # Convert once, outside the loop; float() keeps true division explicit.
    bagOfWordsCount = float(len(bagOfWords))

    # An empty document contains no terms: report 0.0 for every word instead
    # of failing with ZeroDivisionError.
    if bagOfWordsCount == 0:
        return {word: 0.0 for word in wordDict}

    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

# Term frequencies of each document, computed from its counts and its token list.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)

In [7]:
# Display document A's term frequencies: each of its 5 tokens occurs once
# (1/5 = 0.2); vocabulary words that are not in document A stay at 0.0.
tfA

{'from': 0.0,
 'Jupiter': 0.2,
 'the': 0.2,
 'Sun': 0.0,
 'Mars': 0.0,
 'planet': 0.0,
 'is': 0.2,
 'largest': 0.2,
 'fourth': 0.0,
 'Planet': 0.2}

In [8]:
# Display document B's term frequencies: 'the' occurs twice among its 8 tokens
# (2/8 = 0.25), the other words of document B once each (1/8 = 0.125).
tfB

{'from': 0.125,
 'Jupiter': 0.0,
 'the': 0.25,
 'Sun': 0.125,
 'Mars': 0.125,
 'planet': 0.125,
 'is': 0.125,
 'largest': 0.0,
 'fourth': 0.125,
 'Planet': 0.0}

In [9]:
# Tabulate the term frequencies: row 0 is document A, row 1 is document B,
# and there is one column per vocabulary word.
tf_df = pd.DataFrame(data=[tfA, tfB])
tf_df

Unnamed: 0,from,Jupiter,the,Sun,Mars,planet,is,largest,fourth,Planet
0,0.0,0.2,0.2,0.0,0.0,0.0,0.2,0.2,0.0,0.2
1,0.125,0.0,0.25,0.125,0.125,0.125,0.125,0.0,0.125,0.0


In [10]:
# Compute the inverse document frequency (IDF) of each term

def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every word.

    For a word contained in ``df`` of the ``N`` documents the IDF is
    ``log(N / df)`` (natural logarithm).

    Args:
        documents: Sequence of dicts, one per document, each mapping
            word -> occurrence count in that document. A word counts as
            contained in a document when its count is greater than 0.

    Returns:
        dict mapping every word that appears as a key in any document to its
        IDF. Words are listed in first-seen order across ``documents``.
        A word whose count is 0 in every document gets ``0.0`` (its TF-IDF is
        zero regardless). An empty ``documents`` sequence yields ``{}``.
    """
    N = len(documents)

    # Document frequency per word. The vocabulary is gathered from all
    # documents rather than only the first one, so a word that first shows up
    # in a later document does not raise KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, containing in docFrequency.items():
        if containing > 0:
            idfDict[word] = math.log(N / float(containing))
        else:
            # The word occurs in no document: avoid dividing by zero.
            idfDict[word] = 0.0
    return idfDict

# IDF of every vocabulary word, derived from the raw count dictionaries of both documents.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
idfs

{'from': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'the': 0.0,
 'Sun': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'is': 0.0,
 'largest': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'Planet': 0.6931471805599453}

In [11]:
# Tabulate the IDF values as a single-row frame with one column per word.
idfs_df = pd.DataFrame(data=[idfs])
idfs_df

Unnamed: 0,from,Jupiter,the,Sun,Mars,planet,is,largest,fourth,Planet
0,0.693147,0.693147,0.0,0.693147,0.693147,0.693147,0.0,0.693147,0.693147,0.693147


In [12]:
# Compute the TF-IDF score of every word in each document

def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the word's inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word of ``tfBagOfWords`` (a missing word raises KeyError).

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

# TF-IDF scores of each document: its term frequencies weighted by the shared IDF values.
tfidfA = computeTFIDF(tfBagOfWords=tfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagOfWords=tfB, idfs=idfs)


In [13]:
# Display document A's TF-IDF scores. Words found in both documents ('is', 'the')
# have an IDF of 0 and therefore score 0; only words unique to A are non-zero.
tfidfA

{'from': 0.0,
 'Jupiter': 0.13862943611198905,
 'the': 0.0,
 'Sun': 0.0,
 'Mars': 0.0,
 'planet': 0.0,
 'is': 0.0,
 'largest': 0.13862943611198905,
 'fourth': 0.0,
 'Planet': 0.13862943611198905}

In [14]:
# Display document B's TF-IDF scores. Words found in both documents ('is', 'the')
# have an IDF of 0 and therefore score 0; only words unique to B are non-zero.
tfidfB

{'from': 0.08664339756999316,
 'Jupiter': 0.0,
 'the': 0.0,
 'Sun': 0.08664339756999316,
 'Mars': 0.08664339756999316,
 'planet': 0.08664339756999316,
 'is': 0.0,
 'largest': 0.0,
 'fourth': 0.08664339756999316,
 'Planet': 0.0}

In [15]:
# Tabulate the TF-IDF scores: row 0 is document A, row 1 is document B.
tfidf_df = pd.DataFrame(data=[tfidfA, tfidfB])
tfidf_df

Unnamed: 0,from,Jupiter,the,Sun,Mars,planet,is,largest,fourth,Planet
0,0.0,0.138629,0.0,0.0,0.0,0.0,0.0,0.138629,0.0,0.138629
1,0.086643,0.0,0.0,0.086643,0.086643,0.086643,0.0,0.0,0.086643,0.0
