## TFIDF from scratch

A few improvements can be made to the code below:
1. Add a helper function that splits a raw corpus into a list of documents.
2. Remove punctuation from the corpus.
3. Lowercase the documents.
4. ..............

In [1]:
# Sample corpus used throughout this notebook; cf. the example in the
# scikit-learn TfidfVectorizer documentation:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [2]:
import numpy as np
import sklearn
from sklearn import preprocessing

In [3]:
def tokenize(corpus, *, lowercase=False, strip_punctuation=False):
    """Split every document into word tokens and collect the vocabulary.

    Documents are split on any run of whitespace (spaces, tabs, newlines).
    Tokens shorter than two characters are discarded.

    Args:
        corpus: Iterable of document strings.
        lowercase: If True, lowercase each document before splitting.
        strip_punctuation: If True, replace every ASCII punctuation
            character with a space before splitting, so "first,second"
            yields two tokens instead of one.

    Returns:
        A ``(corpus_, dic)`` tuple: ``corpus_`` is a list with one token
        list per document (input order preserved), ``dic`` is the sorted
        list of distinct tokens seen across all documents.
    """
    import string  # local import keeps this cell self-contained

    # Map punctuation to spaces (not to nothing) so words joined only by a
    # punctuation mark are still separated.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )

    corpus_ = []
    vocabulary = set()
    for sent in corpus:
        if lowercase:
            sent = sent.lower()
        if strip_punctuation:
            sent = sent.translate(punctuation_to_space)
        # split() without an argument splits on any whitespace run, so tabs
        # and newlines separate words and no whitespace-only token survives.
        tokens = [word for word in sent.split() if len(word) > 1]
        corpus_.append(tokens)
        vocabulary.update(tokens)
    return corpus_, sorted(vocabulary)

In [4]:
# Tokenize the sample corpus: corpus_ receives one token list per document,
# dic the sorted vocabulary. Both are reused by the cells below.
corpus_,dic = tokenize(corpus)
print(corpus_)
print(dic)

[['this', 'is', 'the', 'first', 'document'], ['this', 'document', 'is', 'the', 'second', 'document'], ['and', 'this', 'is', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'document']]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [5]:
def BOW(corpus_,dic):
    """Build the bag-of-words count matrix for a tokenized corpus.

    Args:
        corpus_: List of token lists, one per document.
        dic: Sequence of vocabulary terms; column ``j`` of the result
            corresponds to ``dic[j]``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(corpus_), len(dic))``
        whose entry ``[i][j]`` is the number of times ``dic[j]`` occurs in
        ``corpus_[i]``.
    """
    from collections import Counter  # local import keeps this cell self-contained

    counts_matrix = np.zeros((len(corpus_), len(dic)))
    for doc_idx, tokens in enumerate(corpus_):
        # Count each document once instead of rescanning it for every
        # vocabulary term; a Counter returns 0 for terms it has not seen.
        term_counts = Counter(tokens)
        counts_matrix[doc_idx, :] = [term_counts[term] for term in dic]
    return counts_matrix

In [6]:
# Build the count matrix for the tokenized corpus.
# NOTE(review): this assignment rebinds the name BOW from the function to the
# resulting array. The cells below rely on BOW being the matrix, but running
# this cell a second time (without re-running the cell that defines the
# function) fails because an ndarray is not callable.
BOW = BOW(corpus_,dic)
print(BOW)

[[0. 1. 1. 1. 0. 0. 1. 0. 1.]
 [0. 2. 0. 1. 0. 1. 1. 0. 1.]
 [1. 0. 0. 1. 1. 0. 1. 1. 1.]
 [0. 1. 1. 1. 0. 0. 1. 0. 1.]]


In [7]:
def compute_TF(BOW,corpus_,dic):
    """Compute the term-frequency matrix.

    Each non-zero count in ``BOW`` is divided by the number of tokens in the
    corresponding document of ``corpus_``; zero counts stay zero.

    Args:
        BOW: Count matrix, indexed as ``BOW[document][term]``.
        corpus_: List of token lists, one per document.
        dic: Vocabulary; only its length is used, to size the result.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(corpus_), len(dic))``.
    """
    term_freq = np.zeros((len(corpus_), len(dic)))
    for doc_idx, doc_counts in enumerate(BOW):
        for term_idx, count in enumerate(doc_counts):
            if count == 0:
                # Absent terms keep their initial frequency of 0.
                continue
            term_freq[doc_idx][term_idx] = count / len(corpus_[doc_idx])
    return term_freq

In [8]:
# Term frequencies for the sample corpus. At this point BOW refers to the
# count matrix produced above, not to the function of the same name.
TF = compute_TF(BOW,corpus_,dic)
print(TF)

[[0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.33333333 0.         0.16666667 0.         0.16666667
  0.16666667 0.         0.16666667]
 [0.16666667 0.         0.         0.16666667 0.16666667 0.
  0.16666667 0.16666667 0.16666667]
 [0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]]


In [9]:
def compute_IDF(BOW,corpus_,dic):
    """Compute the smoothed inverse document frequency of every term.

    For term ``n`` the value is ``1 + ln((N + 1) / (df + 1))``, where ``N``
    is the number of rows in ``BOW`` and ``df`` is the number of rows whose
    count for that term is positive.

    Args:
        BOW: Count matrix, indexed as ``BOW[document][term]``.
        corpus_: Unused; kept so the signature parallels the other helpers.
        dic: Vocabulary; only its length is used, to size the result.

    Returns:
        A 1-D float ``numpy.ndarray`` of length ``len(dic)``.
    """
    n_docs = len(BOW)
    idf = np.zeros(len(dic))
    for term_idx in range(len(dic)):
        # Document frequency: how many documents contain this term at all.
        doc_freq = sum(1 for doc_counts in BOW if doc_counts[term_idx] > 0)
        idf[term_idx] = 1 + np.log((n_docs + 1) / (doc_freq + 1))
    return idf

In [10]:
# Inverse document frequency of every vocabulary term, in the order of dic.
IDF = compute_IDF(BOW,corpus_,dic)
print(IDF)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [11]:
def compute_TFIDF(TF,IDF):
    """Compute TF-IDF scores as the product TF * IDF.

    The result is sized from the arguments alone, so the function does not
    depend on any notebook-level variable.

    Args:
        TF: 2-D array-like of term frequencies, ``TF[document][term]``.
        IDF: 1-D array-like with one inverse document frequency per term
            (one entry per column of ``TF``).

    Returns:
        A new float ``numpy.ndarray`` with the same shape as ``TF`` where
        entry ``[i][j]`` equals ``TF[i][j] * IDF[j]``.

    Raises:
        ValueError: If ``TF`` is not 2-D or ``IDF`` does not have exactly
            one entry per column of ``TF``.
    """
    tf = np.asarray(TF, dtype=float)
    idf = np.asarray(IDF, dtype=float)
    if tf.ndim != 2 or idf.shape != (tf.shape[1],):
        raise ValueError(
            "TF must be 2-D and IDF must have one entry per TF column; "
            "got shapes {} and {}".format(tf.shape, idf.shape)
        )
    # Broadcasting multiplies every row of tf element-wise by idf.
    return tf * idf

In [12]:
# Raw TF-IDF scores, before any scaling.
print("\n Before Normalization \n" + "*" * 60)
TFIDF = compute_TFIDF(TF, IDF)
print(TFIDF)

# L2-normalize the score matrix with scikit-learn's normalize helper.
print("\n After Normalization \n" + "*" * 60)
TFIDF = preprocessing.normalize(TFIDF, norm="l2")
print(TFIDF)


 Before Normalization 
************************************************************
[[0.         0.24462871 0.30216512 0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.40771452 0.         0.16666667 0.         0.31938179
  0.16666667 0.         0.16666667]
 [0.31938179 0.         0.         0.16666667 0.31938179 0.
  0.16666667 0.31938179 0.16666667]
 [0.         0.24462871 0.30216512 0.2        0.         0.
  0.2        0.         0.2       ]]

 After Normalization 
************************************************************
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [13]:
# scikit-learn reference implementation, run on the same corpus so its
# output can be compared with the from-scratch result above.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and IDF weights and returns the
# document-term matrix in a single call.
skl_TFIDF = vectorizer.fit_transform(corpus)
print(skl_TFIDF.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
