In [36]:
import pandas as pd 
import numpy as np

# Toy corpus: each string is one document. Tokens are separated by single
# spaces and are already lower-case, so a plain whitespace split is enough.
docs = [
    "machine learning is fun and machine learning is powerful", 
    "deep learning models are powerful",
    "machine learning and deep learning are part of artificial intelligence",
]

In [None]:
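# TF(t, d): number of times token t appears in document d / total number of tokens in d
# IDF(t):   log((N + 1) / (df(t) + 1)) + 1, where N is the total number of documents
#           and df(t) is the number of documents that contain token t (smoothed IDF)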

In [65]:
class TF_IDF:
    """Minimal TF-IDF vectorizer over whitespace-tokenized documents.

    Term frequency is ``count(t, d) / len(tokens(d))`` and inverse document
    frequency is the smoothed form ``log((N + 1) / (df(t) + 1)) + 1``, where
    ``N`` is the number of fitted documents and ``df(t)`` the number of fitted
    documents containing ``t``.

    Attributes:
        vocab: Sorted list of the distinct tokens seen during ``fit``.
        idfs: Mapping from each vocabulary token to its IDF weight.
        fitted: ``True`` once ``fit`` has completed.
    """

    def __init__(self):
        self.vocab = []
        self.idfs = {}
        self.fitted = False

    def tokenize(self, sentence):
        """Split ``sentence`` on runs of whitespace and return the token list."""
        return sentence.split()

    def fit(self, docs):
        """Learn the vocabulary and IDF weights from ``docs``.

        Any state from a previous ``fit`` call is discarded, so the instance
        can be safely refitted on a different corpus.

        Args:
            docs: Iterable of document strings.

        Returns:
            The fitted instance, to allow call chaining.
        """
        # Tokenize every document exactly once; sets make membership O(1)
        # and collapse repeated tokens, which is what document frequency needs.
        token_sets = [set(self.tokenize(d)) for d in docs]
        n_docs = len(token_sets)

        self.vocab = sorted(set().union(*token_sets))

        doc_freq = dict.fromkeys(self.vocab, 0)
        for tokens in token_sets:
            for t in tokens:
                doc_freq[t] += 1

        # Rebuild the mapping from scratch so no stale tokens survive a refit.
        self.idfs = {
            t: np.log((n_docs + 1) / (doc_freq[t] + 1)) + 1 for t in self.vocab
        }

        self.fitted = True
        return self

    def transform(self, docs):
        """Convert ``docs`` into TF-IDF vectors aligned with ``self.vocab``.

        Tokens not present in the fitted vocabulary contribute no column but
        still count towards the document length used as the TF denominator.
        A document with no tokens yields an all-zero vector.

        Args:
            docs: List of document strings.

        Returns:
            A list with one vector (list of floats, ``len(self.vocab)`` long)
            per input document.

        Raises:
            TypeError: If called before ``fit`` or if ``docs`` is not a list.
        """
        if not self.fitted:
            raise TypeError('Please use .fit first')

        if not isinstance(docs, list):
            raise TypeError('Please pass a list of docs')

        tf_idfs = []
        for d in docs:
            tokens = self.tokenize(d)
            total_tokens = len(tokens)

            counts = dict.fromkeys(self.vocab, 0)
            for t in tokens:
                if t in counts:
                    counts[t] += 1

            vector = []
            for token in self.vocab:
                # Guard the division: an empty document has TF 0 for every
                # token instead of raising ZeroDivisionError.
                tf = counts[token] / total_tokens if total_tokens else 0.0
                vector.append(tf * self.idfs[token])

            tf_idfs.append(vector)

        return tf_idfs

    def fit_transform(self, docs):
        """Fit on ``docs`` and return their TF-IDF vectors in one call."""
        self.fit(docs)
        return self.transform(docs)

In [69]:
# Fit the vectorizer on the sample corpus, then print one TF-IDF row per
# document (columns follow the alphabetically sorted vocabulary).
tf = TF_IDF()
tfidf_rows = tf.fit_transform(docs)
print(tfidf_rows)

[[0.14307578582797564, 0.0, 0.0, 0.0, 0.1881274645066606, 0.0, 0.3762549290133212, 0.2222222222222222, 0.2861515716559513, 0.0, 0.0, 0.0, 0.14307578582797564], [0.0, 0.2575364144903562, 0.0, 0.2575364144903562, 0.0, 0.0, 0.0, 0.2, 0.0, 0.3386294361119891, 0.0, 0.0, 0.2575364144903562], [0.1287682072451781, 0.1287682072451781, 0.16931471805599455, 0.1287682072451781, 0.0, 0.16931471805599455, 0.0, 0.2, 0.1287682072451781, 0.0, 0.16931471805599455, 0.16931471805599455, 0.0]]
