In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
import numpy as np
import string

The test dataset is used to compare our implementations with those from sklearn. The intent is to achieve equality after vectorization. To mimic the results from sklearn, the vectorizers also do very basic cleaning: they replace all punctuation with spaces and convert all characters to lowercase.

In [None]:
# Test dataset: nine short document titles, collected in `documents`.
# Each title is also bound to its own name (c1..c5, m1..m4) so it can be
# referenced individually.
documents = [
    'Human machine interface for Lab ABC computer applications',
    'A survey of user opinion of computer system response time',
    'The EPS user interface management system',
    'System and human system engineering testing of EPS',
    'Relation of user-perceived response time to error measurement',
    'The generation of random, binary, unordered trees',
    'The intersection graph of paths in trees',
    'Graph minors IV: Widths of trees and well-quasi-ordering',
    'Graph minors: A survey',
]
c1, c2, c3, c4, c5, m1, m2, m3, m4 = documents

In [None]:
class CustomCountVectorizer:
  """Bag-of-words vectorizer that turns documents into a term-count matrix.

  Documents are lowercased and every character that is not a word character
  is replaced by a space before splitting. The vocabulary is the sorted set
  of all resulting tokens.
  """

  @staticmethod
  def _tokenize(doc):
    """Return the lowercase word tokens of `doc` as a list of strings."""
    # A word character is an alphanumeric or '_', i.e. the characters that
    # `\w` matches in the token_pattern used for the sklearn control. Using
    # string.punctuation here would split "snake_case" and would not split on
    # non-ASCII punctuation such as typographic quotes or dashes.
    spaced = ''.join(ch if ch.isalnum() or ch == '_' else ' ' for ch in doc.lower())
    return spaced.split()

  def create_vectors(self, documents):
    """Count how often each vocabulary word occurs in each document.

    Args:
      documents: iterable of strings, one per document.

    Returns:
      np.ndarray of dtype int64 and shape (n_documents, n_vocabulary).
      Columns follow the sorted vocabulary; a document without any token
      yields an all-zero row.
    """
    clean_docs = [self._tokenize(doc) for doc in documents]

    # Sorted vocabulary fixes the column order; the dict gives O(1) lookup.
    vocab = sorted(set().union(*clean_docs))
    column_of = {word: col for col, word in enumerate(vocab)}

    # One pass over the tokens instead of scanning the whole vocabulary
    # with list.count() for every document.
    vectors = np.zeros((len(clean_docs), len(vocab)), dtype=np.int64)
    for doc_idx, tokens in enumerate(clean_docs):
      for token in tokens:
        vectors[doc_idx, column_of[token]] += 1

    return vectors

In [None]:
# sklearn's default token_pattern only keeps tokens of two or more characters,
# so the control overrides it to also keep single-character words such as "a".
control_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
control_vecs = control_vectorizer.fit_transform(documents).toarray()

custom_vectorizer = CustomCountVectorizer()
custom_vecs = custom_vectorizer.create_vectors(documents)
# array_equal compares shapes as well as values, so a vocabulary-size mismatch
# prints False rather than depending on how `==` treats differently shaped arrays.
print(np.array_equal(control_vecs, custom_vecs))

True


In [None]:
class CustomTFIDFVectorizer:
  """TF-IDF vectorizer producing L2-normalised rows.

  Documents are lowercased and every character that is not a word character
  is replaced by a space before splitting. The vocabulary is the sorted set
  of all resulting tokens.
  """

  @staticmethod
  def _tokenize(doc):
    """Return the lowercase word tokens of `doc` as a list of strings."""
    # A word character is an alphanumeric or '_', i.e. the characters that
    # `\w` matches in the token_pattern used for the sklearn control. Using
    # string.punctuation here would split "snake_case" and would not split on
    # non-ASCII punctuation such as typographic quotes or dashes.
    spaced = ''.join(ch if ch.isalnum() or ch == '_' else ' ' for ch in doc.lower())
    return spaced.split()

  def create_vectors(self, documents):
    """Compute the TF-IDF matrix of `documents`.

    The weight of a word in a document is count * idf, with
    idf = 1 + ln((n_documents + 1) / (document_frequency + 1)).
    Each row is then scaled to unit Euclidean length.

    Args:
      documents: iterable of strings, one per document.

    Returns:
      np.ndarray of floats with shape (n_documents, n_vocabulary). Columns
      follow the sorted vocabulary; a document without any token yields an
      all-zero row.
    """
    clean_docs = [self._tokenize(doc) for doc in documents]

    # Sorted vocabulary fixes the column order; the dict gives O(1) lookup.
    vocab = sorted(set().union(*clean_docs))
    column_of = {word: col for col, word in enumerate(vocab)}

    # Raw term counts. Dividing by the document length is unnecessary because
    # a per-row constant cancels in the L2 normalisation below, and it would
    # divide by zero for a document that has no tokens.
    counts = np.zeros((len(clean_docs), len(vocab)))
    for doc_idx, tokens in enumerate(clean_docs):
      for token in tokens:
        counts[doc_idx, column_of[token]] += 1

    # Document frequency: number of documents in which each word occurs.
    doc_freq = np.count_nonzero(counts, axis=0)
    idf = 1 + np.log((len(clean_docs) + 1) / (doc_freq + 1))

    vectors = counts * idf

    # L2-normalise each row with NumPy so the result does not rely on the
    # library it is compared against. Zero rows keep a divisor of 1 and
    # therefore stay zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    return vectors / norms


In [None]:
# The control overrides token_pattern so single-character words are kept.
control_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
control_vecs = control_vectorizer.fit_transform(documents).toarray()

custom_vectorizer = CustomTFIDFVectorizer()
custom_vecs = custom_vectorizer.create_vectors(documents)
# Due to rounding, we compare for near equality instead of exact equality.
# rtol=0 makes atol the only tolerance; np.allclose's default rtol of 1e-05
# would otherwise dominate it. The shape check lets a vocabulary-size
# mismatch print False.
print(
    control_vecs.shape == custom_vecs.shape
    and np.allclose(control_vecs, custom_vecs, rtol=0, atol=1e-14)
)

True
