In [2]:
import pandas as pd 
import numpy as np
import re

In [3]:
# Toy corpus: each string is treated as one document by the TF / IDF
# functions defined in the cells below.
corpus = [
    "It was the best of times.",
    "It was the worst of times.",
    "It was the age of wisdom and the age of foolishness."
]

In [4]:
def tokenize(sentence):
    """Split a sentence into lowercase word tokens.

    The text is lowercased, every character that is neither a word
    character (``\\w``) nor whitespace is removed, and the remainder is
    split on whitespace.

    Args:
        sentence: The string to tokenize.

    Returns:
        A list of token strings (empty if nothing remains after cleaning).
    """
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())
    return cleaned.split()

Calculating TF (term frequency)


In [5]:
def get_term_frequency(corpus):
    """Compute the term frequency (TF) of every vocabulary word in each document.

    The vocabulary is the set of all tokens produced by ``tokenize`` across
    the whole corpus. For a document, the TF of a word is the number of times
    the word occurs in it divided by the document's total token count.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list with one dict per document, in corpus order. Each dict maps
        every vocabulary word to its term frequency in that document (0.0 for
        absent words). Keys are in sorted order so that a DataFrame built from
        the result has the same column order on every run.
    """
    # Tokenize each document exactly once and reuse the result below.
    tokenized_docs = [tokenize(document) for document in corpus]

    # Iterating a set of strings has no stable order between interpreter
    # sessions, so sort the vocabulary for reproducible output.
    vocabulary = sorted({word for tokens in tokenized_docs for word in tokens})

    term_freq = []
    for tokens in tokenized_docs:
        counts = dict.fromkeys(vocabulary, 0)
        for word in tokens:
            counts[word] += 1

        # A document with no tokens gets all-zero frequencies rather than
        # raising ZeroDivisionError.
        total = len(tokens)
        term_freq.append(
            {word: (count / total if total else 0.0)
             for word, count in counts.items()}
        )

    return term_freq

In [6]:
# Term-frequency table: one row per document (labelled with its text),
# one column per vocabulary word.
term_freq = get_term_frequency(corpus)
df = pd.DataFrame(term_freq, index=corpus)

display(df)

Unnamed: 0,age,of,wisdom,worst,and,times,best,it,the,was,foolishness
It was the best of times.,0.0,0.166667,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.166667,0.166667,0.0
It was the worst of times.,0.0,0.166667,0.0,0.166667,0.0,0.166667,0.0,0.166667,0.166667,0.166667,0.0
It was the age of wisdom and the age of foolishness.,0.181818,0.181818,0.090909,0.0,0.090909,0.0,0.0,0.090909,0.181818,0.090909,0.090909


Calculating IDF (inverse document frequency)

In [7]:
def get_inverse_document_frequency(corpus):
    """Compute the inverse document frequency (IDF) of every vocabulary word.

    For a word ``t`` the value is ``ln(N / df(t))``, where ``N`` is the number
    of documents in the corpus and ``df(t)`` is the number of documents whose
    tokens (as produced by ``tokenize``) include ``t``. A word that occurs in
    every document therefore has an IDF of 0.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A dict mapping each vocabulary word to its IDF (a NumPy float), with
        keys in sorted order. Empty if the corpus yields no tokens.
    """
    # Tokenize each document once; a set gives O(1) membership and ensures a
    # word is counted at most once per document.
    doc_token_sets = [set(tokenize(document)) for document in corpus]
    n = len(doc_token_sets)

    # Document frequency: in how many documents does each word appear?
    doc_counts = {}
    for token_set in doc_token_sets:
        for word in token_set:
            doc_counts[word] = doc_counts.get(word, 0) + 1

    # Every key came from at least one document, so the divisor is never 0.
    # Sorted keys keep the output order reproducible across sessions.
    return {word: np.log(n / doc_counts[word]) for word in sorted(doc_counts)}

In [8]:
# Compute the IDF of every word in the corpus vocabulary.
inverse_document_frequency = get_inverse_document_frequency(corpus)
# Bare trailing expression so the notebook shows the dict as the cell output.
inverse_document_frequency

{'age': np.float64(1.0986122886681098),
 'of': np.float64(0.0),
 'wisdom': np.float64(1.0986122886681098),
 'worst': np.float64(1.0986122886681098),
 'and': np.float64(1.0986122886681098),
 'times': np.float64(0.4054651081081644),
 'best': np.float64(1.0986122886681098),
 'it': np.float64(0.0),
 'the': np.float64(0.0),
 'was': np.float64(0.0),
 'foolishness': np.float64(1.0986122886681098)}

Calculating TF-IDF (term frequency × inverse document frequency)

In [9]:
def get_tf_idf(corpus):
    """Compute TF-IDF scores for every vocabulary word in each document.

    Each score is the word's term frequency in the document (from
    ``get_term_frequency``) multiplied by the word's inverse document
    frequency over the corpus (from ``get_inverse_document_frequency``).

    Args:
        corpus: The documents, passed unchanged to both helper functions.

    Returns:
        A list with one dict per document, in the order returned by
        ``get_term_frequency``; each dict maps a word to its TF-IDF score.
    """
    per_doc_tf = get_term_frequency(corpus)
    idf_by_term = get_inverse_document_frequency(corpus)

    return [
        {term: frequency * idf_by_term[term] for term, frequency in doc_tf.items()}
        for doc_tf in per_doc_tf
    ]

In [10]:
# TF-IDF table: one row per document (labelled with its text),
# one column per vocabulary word.
tf_idf = get_tf_idf(corpus)
df = pd.DataFrame(tf_idf, index=corpus)
display(df)

Unnamed: 0,age,of,wisdom,worst,and,times,best,it,the,was,foolishness
It was the best of times.,0.0,0.0,0.0,0.0,0.0,0.067578,0.183102,0.0,0.0,0.0,0.0
It was the worst of times.,0.0,0.0,0.0,0.183102,0.0,0.067578,0.0,0.0,0.0,0.0,0.0
It was the age of wisdom and the age of foolishness.,0.199748,0.0,0.099874,0.0,0.099874,0.0,0.0,0.0,0.0,0.0,0.099874
