In [1]:
# Toy corpus of five short documents used by every example below.
# Note the mixed-case 'Product' in the second document.
data_corpus = [
    'this product is very cheap and affordable',
    'the Product is good',
    'the product is bad',
    'this product is very helpfull',
    'hope you are doing well',
]

In [27]:
#importing essential libraries
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy


## 1. Bag Of Words Implementation

In [17]:
#python Implementation
def fit(dataset):
    """Build a bag-of-words vocabulary from a list of documents.

    Every document is split on whitespace, tokens shorter than two
    characters are dropped, and the remaining unique tokens are sorted and
    numbered from 0. Tokens are compared case-sensitively.

    Parameters
    ----------
    dataset : list of str
        The documents to learn the vocabulary from.

    Returns
    -------
    dict or None
        Mapping ``token -> column index``. When ``dataset`` is not a list a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    # split() with no argument splits on any run of whitespace, which is the
    # same tokenization transform() applies; split(" ") would leave tabs and
    # newlines inside tokens that transform() could then never look up.
    unique_words = {
        word
        for row in dataset
        for word in row.split()
        if len(word) >= 2
    }
    return {word: index for index, word in enumerate(sorted(unique_words))}

In [21]:
def transform(dataset,vocab):
    """Turn a list of documents into a sparse document-term count matrix.

    Parameters
    ----------
    dataset : list of str
        Documents to vectorize; each one is split on whitespace.
    vocab : dict
        Mapping ``token -> column index``.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix of shape ``(len(dataset), len(vocab))`` holding per-document
        token counts. Tokens shorter than two characters or absent from
        ``vocab`` are skipped. When ``dataset`` is not a list a message is
        printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    doc_ids, term_ids, counts = [], [], []
    for doc_id, document in enumerate(dataset):
        # Counter gives one (token, occurrences) pair per distinct token
        for token, occurrences in Counter(document.split()).items():
            if len(token) < 2:
                continue
            term_id = vocab.get(token, -1)
            if term_id == -1:
                # token was not seen when the vocabulary was built
                continue
            doc_ids.append(doc_id)
            term_ids.append(term_id)
            counts.append(occurrences)

    matrix_shape = (len(dataset), len(vocab))
    return csr_matrix((counts, (doc_ids, term_ids)), shape=matrix_shape)

In [23]:

# Learn the vocabulary, then show the feature names and the count matrix.
vocab = fit(data_corpus)
print(list(vocab))  # iterating a dict yields its keys in insertion order
print(transform(data_corpus, vocab).toarray())

['Product', 'affordable', 'and', 'are', 'bad', 'cheap', 'doing', 'good', 'helpfull', 'hope', 'is', 'product', 'the', 'this', 'very', 'well', 'you']
[[0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0]
 [1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1]]


In [24]:
#sklearn implementation
from sklearn.feature_extraction.text import CountVectorizer

# Fit scikit-learn's CountVectorizer on the same corpus for comparison.
vec = CountVectorizer(analyzer='word').fit(data_corpus)  # fit() returns the vectorizer

feature_matrix = vec.transform(data_corpus)
print(feature_matrix.toarray())

[[1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0]
 [0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0]
 [0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1]]


## 2. TF-IDF Implementation

In [29]:
def fit(dataset):
    """Learn the vocabulary and the IDF weight of every vocabulary token.

    Parameters
    ----------
    dataset : list of str
        Documents to learn from; each one is split on whitespace and tokens
        shorter than two characters are ignored.

    Returns
    -------
    tuple or None
        ``(vocab, idf_vector)`` where ``vocab`` maps each token to its index
        in the sorted token list and ``idf_vector`` is whatever ``IDF``
        returns for that token list. When ``dataset`` is not a list a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("Please pass list of sentences")
        return None

    tokens = sorted({
        token
        for document in dataset
        for token in document.split()
        if len(token) >= 2
    })
    vocab = {token: position for position, token in enumerate(tokens)}
    return vocab, IDF(dataset, tokens)
        



def IDF(dataset,words):
    """Compute the smoothed inverse document frequency of each word.

    For a word appearing in ``df`` of the ``N`` documents the weight is
    ``1 + ln((1 + N) / (1 + df))``.

    Parameters
    ----------
    dataset : list of str
        Documents; each one is split on whitespace.
    words : iterable of str
        Words to compute a weight for.

    Returns
    -------
    dict
        Mapping ``word -> idf weight``, in the iteration order of ``words``.
    """
    total_doc_len = len(dataset)

    # Tokenize every document exactly once and count, per token, the number
    # of documents it occurs in (the set() collapses repeats inside a
    # document), instead of re-splitting the whole corpus for every word.
    doc_freq = Counter()
    for document in dataset:
        doc_freq.update(set(document.split()))

    # Counter returns 0 for words that occur in no document.
    return {
        word: 1 + math.log((1 + total_doc_len) / (1 + doc_freq[word]))
        for word in words
    }



def transform(dataset,vocab,idf_vector):
    """Turn a list of documents into an L2-normalized sparse TF-IDF matrix.

    Parameters
    ----------
    dataset : list of str
        Documents to vectorize; each one is split on whitespace.
    vocab : dict
        Mapping ``token -> column index``.
    idf_vector : dict
        Mapping ``token -> idf weight`` for the tokens in ``vocab``.

    Returns
    -------
    sparse matrix or None
        Result of ``normalize(..., norm='l2')`` applied to a matrix of shape
        ``(len(dataset), len(vocab))``. When ``dataset`` is not a list a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        print("pass a list of sentences as dataset")
        return None

    rows, columns, values = [], [], []
    for doc_index, document in enumerate(dataset):
        tokens = document.split()
        # Term frequency is relative to the number of tokens in the document
        # (not the number of characters in the string). The divisor is
        # constant per row, so it cancels under the L2 normalization below.
        n_tokens = len(tokens)
        for word, count in Counter(tokens).items():
            # Check the vocabulary first: tokens that were never fitted
            # (unseen words, one-character tokens) have no idf entry, and
            # looking them up would raise KeyError.
            col_index = vocab.get(word, -1)
            if col_index == -1:
                continue
            rows.append(doc_index)
            columns.append(col_index)
            values.append((count / n_tokens) * idf_vector[word])

    sparse_matrix = csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab)))
    return normalize(sparse_matrix, norm='l2')
                    

        
# Learn the vocabulary and IDF weights from the corpus, then vectorize that same corpus.
vocab,idf_vector = fit(data_corpus)
output_matrix = transform(data_corpus,vocab,idf_vector)


In [30]:
# Display the repr of the matrix returned by transform (shape, dtype, stored elements).
output_matrix

<5x17 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [33]:
        
# Fit and vectorize the corpus, then print the feature names and the dense matrix.
vocab, idf_vector = fit(data_corpus)
output_matrix = transform(data_corpus, vocab, idf_vector)

# iterating a dict yields its keys, so list(vocab) is the list of feature names
print("custom implementation feature names :\n {}".format(list(vocab)))
print("custom implemented tfidf matrix : \n {}".format(output_matrix.toarray()))

custom implementation feature names :
 ['Product', 'affordable', 'and', 'are', 'bad', 'cheap', 'doing', 'good', 'helpfull', 'hope', 'is', 'product', 'the', 'this', 'very', 'well', 'you']
custom implemented tfidf matrix : 
 [[0.         0.44421436 0.44421436 0.         0.         0.44421436
  0.         0.         0.         0.         0.25026262 0.29749553
  0.         0.35838935 0.35838935 0.         0.        ]
 [0.58042343 0.         0.         0.         0.         0.
  0.         0.58042343 0.         0.         0.32700044 0.
  0.46828197 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.64324583 0.
  0.         0.         0.         0.         0.36239348 0.43078923
  0.51896668 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.5709398  0.         0.32165752 0.38236504
  0.         0.46063063 0.46063063 0.         0.        ]
 [0.         0.         0.       

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit scikit-learn's TfidfVectorizer on the same corpus for comparison.
vectorizer = TfidfVectorizer().fit(data_corpus)  # fit() returns the vectorizer
skl_output = vectorizer.transform(data_corpus)

In [16]:
# Print the stored (row, column) value entries of row 0 of the sklearn result.
print(skl_output[0])

  (0, 13)	0.363117453789113
  (0, 12)	0.363117453789113
  (0, 10)	0.25356424898691843
  (0, 9)	0.25356424898691843
  (0, 4)	0.45007472445466307
  (0, 1)	0.45007472445466307
  (0, 0)	0.45007472445466307
