In [45]:
#import libraries here
import pandas as pd
import math
import sklearn
import re

In [1]:
# Toy corpus, document A: a short passage describing term frequency.
# The trailing backslashes continue the string literal onto the next physical
# line, so the leading indentation of each continuation line is part of the
# string. That extra whitespace is harmless here because the text is tokenised
# with str.split() in the next cell.
docA = "Term frequency, tf(t,d), is the relative frequency of term t within document d,\
    where ft,d is the raw count of a term in a document, i.e., the number of times that \
    term t occurs in document d. Note the denominator is simply the total number of terms\
    in document d (counting each occurrence of the same term separately). There are various\
    other ways to define term frequency, such as the binary, logarithmic, and augmented"

# Toy corpus, document B: a short passage describing inverse document frequency.
# Same backslash-continuation caveat as docA.
docB = "Inverse document frequency, idf(t,D), is a measure of how much information the word\
        provides, i.e., if it's common or rare across all documents. It is the logarithmically\
        scaled fraction of the documents that contain the word, obtained by dividing the total\
        number of documents by the number of documents containing the term, and then taking the\
        logarithm of that quotient."

In [46]:
# Drop every character that is neither a word character nor whitespace, then
# split on whitespace to get each document's token list (case is preserved).
termListA, termListB = (
    re.sub(r"[^\w\s]", "", doc).split() for doc in (docA, docB)
)
# Vocabulary: each distinct token seen in either document (set order is arbitrary).
total = list(set(termListA) | set(termListB))

In [47]:
# Show both token lists and the combined vocabulary, one per line.
print(termListA, termListB, total, sep="\n")

['Term', 'frequency', 'tftd', 'is', 'the', 'relative', 'frequency', 'of', 'term', 't', 'within', 'document', 'd', 'where', 'ftd', 'is', 'the', 'raw', 'count', 'of', 'a', 'term', 'in', 'a', 'document', 'ie', 'the', 'number', 'of', 'times', 'that', 'term', 't', 'occurs', 'in', 'document', 'd', 'Note', 'the', 'denominator', 'is', 'simply', 'the', 'total', 'number', 'of', 'terms', 'in', 'document', 'd', 'counting', 'each', 'occurrence', 'of', 'the', 'same', 'term', 'separately', 'There', 'are', 'various', 'other', 'ways', 'to', 'define', 'term', 'frequency', 'such', 'as', 'the', 'binary', 'logarithmic', 'and', 'augmented']
['Inverse', 'document', 'frequency', 'idftD', 'is', 'a', 'measure', 'of', 'how', 'much', 'information', 'the', 'word', 'provides', 'ie', 'if', 'its', 'common', 'or', 'rare', 'across', 'all', 'documents', 'It', 'is', 'the', 'logarithmically', 'scaled', 'fraction', 'of', 'the', 'documents', 'that', 'contain', 'the', 'word', 'obtained', 'by', 'dividing', 'the', 'total', 'nu

<h2>Bag of Words</h2>

In [48]:
def listtoDic(li):
    """Build a bag-of-words style count table from an iterable of items.

    Parameters
    ----------
    li : iterable of hashable
        The items (e.g. tokens) to count.

    Returns
    -------
    dict
        Maps each distinct item to the number of times it occurs in ``li``.
        Keys appear in order of first occurrence.
    """
    counts = {}
    for item in li:
        # Unseen items start from 0, so the first sighting stores 1.
        counts[item] = counts.get(item, 0) + 1
    return counts

In [49]:
# Per-document term counts (bag of words) for document A and document B.
dicA, dicB = (listtoDic(terms) for terms in (termListA, termListB))

In [50]:
# Show the count dictionary of each document, one per line.
print(dicA, dicB, sep="\n")

{'Term': 1, 'frequency': 3, 'tftd': 1, 'is': 3, 'the': 7, 'relative': 1, 'of': 5, 'term': 5, 't': 2, 'within': 1, 'document': 4, 'd': 3, 'where': 1, 'ftd': 1, 'raw': 1, 'count': 1, 'a': 2, 'in': 3, 'ie': 1, 'number': 2, 'times': 1, 'that': 1, 'occurs': 1, 'Note': 1, 'denominator': 1, 'simply': 1, 'total': 1, 'terms': 1, 'counting': 1, 'each': 1, 'occurrence': 1, 'same': 1, 'separately': 1, 'There': 1, 'are': 1, 'various': 1, 'other': 1, 'ways': 1, 'to': 1, 'define': 1, 'such': 1, 'as': 1, 'binary': 1, 'logarithmic': 1, 'and': 1, 'augmented': 1}
{'Inverse': 1, 'document': 1, 'frequency': 1, 'idftD': 1, 'is': 2, 'a': 1, 'measure': 1, 'of': 5, 'how': 1, 'much': 1, 'information': 1, 'the': 8, 'word': 2, 'provides': 1, 'ie': 1, 'if': 1, 'its': 1, 'common': 1, 'or': 1, 'rare': 1, 'across': 1, 'all': 1, 'documents': 4, 'It': 1, 'logarithmically': 1, 'scaled': 1, 'fraction': 1, 'that': 2, 'contain': 1, 'obtained': 1, 'by': 2, 'dividing': 1, 'total': 1, 'number': 2, 'containing': 1, 'term': 1, 

In [51]:
# Document-term matrix: one row per document, one column per distinct term.
# A term missing from a document has no key in its dict and shows up as NaN.
df = pd.DataFrame(data=[dicA, dicB])
df

Unnamed: 0,Term,frequency,tftd,is,the,relative,of,term,t,within,...,fraction,contain,obtained,by,dividing,containing,then,taking,logarithm,quotient
0,1.0,3,1.0,3,7,1.0,5,5,2.0,1.0,...,,,,,,,,,,
1,,1,,2,8,,5,1,,,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0


In [59]:
# A term that never occurs in a document has a count of zero, not a missing value.
df = df.fillna(value=0)
df

Unnamed: 0,Term,frequency,tftd,is,the,relative,of,term,t,within,...,fraction,contain,obtained,by,dividing,containing,then,taking,logarithm,quotient
0,1.0,3,1.0,3,7,1.0,5,5,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,0.0,2,8,0.0,5,1,0.0,0.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
# Inspect the column labels of the document-term matrix (one label per distinct term).
df.columns

Index(['Term', 'frequency', 'tftd', 'is', 'the', 'relative', 'of', 'term', 't',
       'within', 'document', 'd', 'where', 'ftd', 'raw', 'count', 'a', 'in',
       'ie', 'number', 'times', 'that', 'occurs', 'Note', 'denominator',
       'simply', 'total', 'terms', 'counting', 'each', 'occurrence', 'same',
       'separately', 'There', 'are', 'various', 'other', 'ways', 'to',
       'define', 'such', 'as', 'binary', 'logarithmic', 'and', 'augmented',
       'Inverse', 'idftD', 'measure', 'how', 'much', 'information', 'word',
       'provides', 'if', 'its', 'common', 'or', 'rare', 'across', 'all',
       'documents', 'It', 'logarithmically', 'scaled', 'fraction', 'contain',
       'obtained', 'by', 'dividing', 'containing', 'then', 'taking',
       'logarithm', 'quotient'],
      dtype='object')

<h2>TF</h2>(Term Frequency)

In [66]:
def calculateTF(df, k=0.5):
    """Compute augmented (max-normalised) term frequency for every document.

    Each row of ``df`` is treated as one document and each column as one term;
    the cells hold raw term counts. Every count is rescaled against the largest
    count found in its own row::

        tf(t, d) = k + (1 - k) * count(t, d) / max_count(d)

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric count matrix. Any row index is accepted; the frame is not
        modified.
    k : float, default 0.5
        Baseline weight given to a term with a count of zero. The default
        reproduces the ``0.5 + 0.5 * count / max`` weighting.

    Returns
    -------
    pandas.DataFrame
        A new float-valued frame with the same index and columns as ``df``.
        A row with no positive count (e.g. an empty document) yields ``k`` in
        every column instead of dividing by zero.
    """
    # Largest raw count per document (row-wise maximum).
    row_max = df.max(axis=1)
    # Guard the denominator: rows without a positive count divide by 1, so
    # their all-zero counts map to the baseline k rather than NaN/inf.
    safe_max = row_max.where(row_max > 0, 1)
    # Vectorised over the whole matrix; div(..., axis=0) aligns the per-row
    # maxima on the index, so no positional/label assumptions are needed.
    return k + (1 - k) * df.div(safe_max, axis=0)
# Show the max-normalised term-frequency table computed from the count matrix.
print(calculateTF(df))

       Term  frequency      tftd        is  the  relative        of      term  \
0  0.785714   0.857143  0.785714  0.857143    1  0.785714  0.928571  0.928571   
1  0.750000   0.781250  0.750000  0.812500    1  0.750000  0.906250  0.781250   

          t    within  ...  fraction  contain  obtained      by  dividing  \
0  0.821429  0.785714  ...   0.75000  0.75000   0.75000  0.7500   0.75000   
1  0.750000  0.750000  ...   0.78125  0.78125   0.78125  0.8125   0.78125   

   containing     then   taking  logarithm  quotient  
0     0.75000  0.75000  0.75000    0.75000   0.75000  
1     0.78125  0.78125  0.78125    0.78125   0.78125  

[2 rows x 75 columns]


Similarly, IDF and TF-IDF can also be calculated.