<a href="https://colab.research.google.com/github/SayedShaun/Machine-Learning/blob/main/TF_IDF_Vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
from typing import List


class TFIDFVectorizer:
  """Compute a dense TF-IDF matrix for a fixed collection of documents.

  Tokens are lower-cased, whitespace-separated words; punctuation is not
  stripped, so "here." and "here" are different terms.

  Attributes:
    documents: the documents passed to the constructor.
    vocabs: set of distinct tokens found across all documents.
    token2index: maps each token to its column in ``tf`` / ``idf``. Columns
      follow the iteration order of ``vocabs``, so ``list(vocabs)`` lines up
      with the matrix columns.
    tf: array of shape (n_documents, n_terms) with term frequencies.
    idf: array of shape (n_terms,) with inverse document frequencies.
  """

  def __init__(self, documents: List[str]):
    """Build the vocabulary and allocate the TF / IDF arrays.

    Args:
      documents: sequence of raw text documents (must support ``len``).
    """
    self.documents = documents
    self.vocabs = self._build_vocabs(self.documents)
    self.tf = np.zeros((len(documents), len(self.vocabs)))
    self.idf = np.zeros((len(self.vocabs)))
    self.token2index = {token: idx for idx, token in enumerate(self.vocabs)}

  def _tokenize(self, text: str) -> List[str]:
    """Lower-case ``text`` and split it on whitespace."""
    return text.lower().split()

  def _build_vocabs(self, documents: List[str]) -> set:
    """Return the set of distinct tokens appearing in ``documents``."""
    vocabs = set()
    for doc in documents:
      vocabs.update(self._tokenize(doc))
    return vocabs

  def _calculate_tf(self) -> None:
    """Fill ``self.tf`` with per-document relative term frequencies.

    Each row holds count(term in document) / number of tokens in document.
    """
    # Start from zero every time: without this, a second call would add raw
    # counts on top of the already-normalised values from the first call.
    self.tf.fill(0.0)
    for doc_idx, doc in enumerate(self.documents):
      tokens = self._tokenize(doc)
      if not tokens:
        # A document without tokens keeps an all-zero row instead of
        # triggering a 0/0 division (NaN row plus a RuntimeWarning).
        continue
      for token in tokens:
        self.tf[doc_idx, self.token2index[token]] += 1
      self.tf[doc_idx] /= len(tokens)

  def _calculate_idf(self) -> None:
    """Fill ``self.idf`` with log(n_documents / (1 + document_frequency)).

    The +1 in the denominator is a smoothing term. As a consequence, a term
    that occurs in every document gets a negative weight, and a term that
    occurs in all but one document gets a weight of zero.
    """
    n_documents = len(self.documents)
    doc_counts = np.zeros(len(self.vocabs))
    # One pass over the documents; set() ensures a term is counted at most
    # once per document no matter how often it repeats there.
    for doc in self.documents:
      for token in set(self._tokenize(doc)):
        doc_counts[self.token2index[token]] += 1
    self.idf[:] = np.log(n_documents / (1 + doc_counts))

  def tfidf(self):
    """Recompute TF and IDF and return their product.

    Returns:
      A new array of shape (n_documents, n_terms) where entry (d, t) is
      tf[d, t] * idf[t]. Calling this method repeatedly yields the same
      result.
    """
    self._calculate_tf()
    self._calculate_idf()
    return self.tf * self.idf


# Demo: vectorize a one-document corpus, then show the vocabulary followed by
# the TF-IDF matrix (list(obj.vocabs) is in the same order as the columns).
s = [
  "I have been playing football since I came here. I will go home very soon",
]
obj = TFIDFVectorizer(documents=s)
print([*obj.vocabs])
print(obj.tfidf())

['playing', 'have', 'very', 'been', 'will', 'here.', 'i', 'go', 'football', 'since', 'came', 'soon', 'home']
[[-0.04620981 -0.04620981 -0.04620981 -0.04620981 -0.04620981 -0.04620981
  -0.13862944 -0.04620981 -0.04620981 -0.04620981 -0.04620981 -0.04620981
  -0.04620981]]
