# TF-IDF Explanation:



    TF: Term Frequency, which measures how frequently a term occurs in a document. Since documents differ in length, a term is likely to appear many more times in a long document than in a short one. Thus, the term frequency is often divided by the document length (i.e., the total number of terms in the document) as a way of normalization:

    TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

    IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, certain terms, such as "is", "of", and "that", may appear many times but carry little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:

    IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd

In [3]:
# Toy corpus: four short sentences, one per line. Each line is treated as a
# separate document in the cells that follow.
data = '''Time flies like an arrow. 
Fruit flies like a banana, 
Sam sat on the cat 
The cat is white.'''

print(data)

Time flies like an arrow. 
Fruit flies like a banana, 
Sam sat on the cat 
The cat is white.


### Consider each sentence as a document. Split the data into a list of documents, one per line.

In [4]:
# One document per line; split('\n') keeps each line's trailing whitespace.
dataset = data.split('\n')
dataset

['Time flies like an arrow. ',
 'Fruit flies like a banana, ',
 'Sam sat on the cat ',
 'The cat is white.']

In [35]:
len(dataset)

4

### Get the TF matrix

In [5]:
# CountVectorizer converts the collection of documents into a sparse
# document-term matrix of raw token counts (not length-normalised TF).
# ngram_range=(1, 1) restricts the features to single words.
tf_vectorizer = CountVectorizer(ngram_range=(1, 1))

# fit_transform() learns the vocabulary and counts the tokens in one call.
tf = tf_vectorizer.fit_transform(dataset)
print(tf)


  (0, 1)	1
  (0, 0)	1
  (0, 7)	1
  (0, 4)	1
  (0, 12)	1
  (1, 2)	1
  (1, 5)	1
  (1, 7)	1
  (1, 4)	1
  (2, 3)	1
  (2, 11)	1
  (2, 8)	1
  (2, 10)	1
  (2, 9)	1
  (3, 13)	1
  (3, 6)	1
  (3, 3)	1
  (3, 11)	1


In [6]:
tf.shape

(4, 14)

In [7]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out(), returns an ndarray, so convert it to a plain list.
tf_vectorizer.get_feature_names_out().tolist()

['an',
 'arrow',
 'banana',
 'cat',
 'flies',
 'fruit',
 'is',
 'like',
 'on',
 'sam',
 'sat',
 'the',
 'time',
 'white']

In [8]:
# Dense view of the count matrix: one row per document, one column per
# vocabulary term. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
pd.DataFrame(tf.toarray(), columns=tf_vectorizer.get_feature_names_out())


Unnamed: 0,an,arrow,banana,cat,flies,fruit,is,like,on,sam,sat,the,time,white
0,1,1,0,0,1,0,0,1,0,0,0,0,1,0
1,0,0,1,0,1,1,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,1,1,1,0,0
3,0,0,0,1,0,0,1,0,0,0,0,1,0,1


### Get TF-IDFs.

In [9]:
# TfidfVectorizer converts a collection of raw documents into a matrix of
# TF-IDF features. fit_transform() learns the vocabulary and the IDF weights
# (the fit step) and returns the weighted document-term matrix (the
# transform step).
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=False, ngram_range=(1, 1))  # stop_words='english'
tfidf = tfidf_vectorizer.fit_transform(dataset)
print(tfidf)

  (0, 12)	0.4854606118156975
  (0, 4)	0.3827427224171519
  (0, 7)	0.3827427224171519
  (0, 0)	0.4854606118156975
  (0, 1)	0.4854606118156975
  (1, 4)	0.43779123108611473
  (1, 7)	0.43779123108611473
  (1, 5)	0.5552826649411127
  (1, 2)	0.5552826649411127
  (2, 9)	0.4854606118156975
  (2, 10)	0.4854606118156975
  (2, 8)	0.4854606118156975
  (2, 11)	0.3827427224171519
  (2, 3)	0.3827427224171519
  (3, 11)	0.43779123108611473
  (3, 3)	0.43779123108611473
  (3, 6)	0.5552826649411127
  (3, 13)	0.5552826649411127


In [49]:
# Same pipeline with sublinear_tf=True, which replaces each raw count tf with
# 1 + log(tf). Every term occurs exactly once per document in this corpus, so
# 1 + log(1) = 1 and the result is identical to the sublinear_tf=False run.
# Separate names keep this experiment from overwriting `tfidf_vectorizer` and
# `tfidf`, which the following cells inspect.
tfidf_sublinear_vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 1))  # stop_words='english'
tfidf_sublinear = tfidf_sublinear_vectorizer.fit_transform(dataset)
print(tfidf_sublinear)

  (0, 12)	0.4854606118156975
  (0, 4)	0.3827427224171519
  (0, 7)	0.3827427224171519
  (0, 0)	0.4854606118156975
  (0, 1)	0.4854606118156975
  (1, 4)	0.43779123108611473
  (1, 7)	0.43779123108611473
  (1, 5)	0.5552826649411127
  (1, 2)	0.5552826649411127
  (2, 9)	0.4854606118156975
  (2, 10)	0.4854606118156975
  (2, 8)	0.4854606118156975
  (2, 11)	0.3827427224171519
  (2, 3)	0.3827427224171519
  (3, 11)	0.43779123108611473
  (3, 3)	0.43779123108611473
  (3, 6)	0.5552826649411127
  (3, 13)	0.5552826649411127


In [53]:
# fit() re-learns the vocabulary and IDF weights and returns the fitted
# vectorizer itself, so `idf` is bound to the TfidfVectorizer object, not to
# the IDF values.
# NOTE(review): the per-term IDF weights are presumably what was wanted here;
# scikit-learn exposes them as `tfidf_vectorizer.idf_` -- confirm.
idf=tfidf_vectorizer.fit(dataset)
idf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [54]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out(), returns an ndarray, so convert it to a plain list.
tfidf_vectorizer.get_feature_names_out().tolist()

['an',
 'arrow',
 'banana',
 'cat',
 'flies',
 'fruit',
 'is',
 'like',
 'on',
 'sam',
 'sat',
 'the',
 'time',
 'white']

In [12]:
tfidf_vectorizer.vocabulary_

{'time': 12,
 'flies': 4,
 'like': 7,
 'an': 0,
 'arrow': 1,
 'fruit': 5,
 'banana': 2,
 'sam': 9,
 'sat': 10,
 'on': 8,
 'the': 11,
 'cat': 3,
 'is': 6,
 'white': 13}

In [13]:
print(tfidf.toarray())

[[0.48546061 0.48546061 0.         0.         0.38274272 0.
  0.         0.38274272 0.         0.         0.         0.
  0.48546061 0.        ]
 [0.         0.         0.55528266 0.         0.43779123 0.55528266
  0.         0.43779123 0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.38274272 0.         0.
  0.         0.         0.48546061 0.48546061 0.48546061 0.38274272
  0.         0.        ]
 [0.         0.         0.         0.43779123 0.         0.
  0.55528266 0.         0.         0.         0.         0.43779123
  0.         0.55528266]]


In [14]:
# Dense view of the TF-IDF matrix: one row per document, one column per
# vocabulary term. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



Unnamed: 0,an,arrow,banana,cat,flies,fruit,is,like,on,sam,sat,the,time,white
0,0.485461,0.485461,0.0,0.0,0.382743,0.0,0.0,0.382743,0.0,0.0,0.0,0.0,0.485461,0.0
1,0.0,0.0,0.555283,0.0,0.437791,0.555283,0.0,0.437791,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.382743,0.0,0.0,0.0,0.0,0.485461,0.485461,0.485461,0.382743,0.0,0.0
3,0.0,0.0,0.0,0.437791,0.0,0.0,0.555283,0.0,0.0,0.0,0.0,0.437791,0.0,0.555283


# Manual TF and TF-IDF functions 

In [55]:
docA = "the cat sat on my sofa"
docB = "the dog sat on my bed" 

### Split each document into a list of words

In [56]:
# Tokenise on any run of whitespace. Unlike split(" "), split() with no
# argument never yields empty-string tokens for repeated, leading or trailing
# spaces.
bowA = docA.split()
print(bowA)
bowB = docB.split()
print(bowB)
type(bowA)

['the', 'cat', 'sat', 'on', 'my', 'sofa']
['the', 'dog', 'sat', 'on', 'my', 'bed']


list

### Vocabulary in the corpus

In [57]:
# Corpus vocabulary: every distinct word that occurs in either document.
wordSet = set(bowA) | set(bowB)
wordSet

{'bed', 'cat', 'dog', 'my', 'on', 'sat', 'sofa', 'the'}

### Dictionaries to keep the word count in each bag of words

In [58]:
# One counter per document, keyed by every vocabulary word and starting at 0.
wordDictA = {word: 0 for word in wordSet}
print(wordDictA)
wordDictB = {word: 0 for word in wordSet}
print(wordDictB)


{'the': 0, 'sofa': 0, 'my': 0, 'dog': 0, 'cat': 0, 'bed': 0, 'on': 0, 'sat': 0}
{'the': 0, 'sofa': 0, 'my': 0, 'dog': 0, 'cat': 0, 'bed': 0, 'on': 0, 'sat': 0}


In [59]:
# Count how often each vocabulary word occurs in each document. The dicts are
# rebuilt from scratch rather than incremented in place, so re-running this
# cell cannot double the counts.
wordDictA = {word: bowA.count(word) for word in wordSet}
wordDictB = {word: bowB.count(word) for word in wordSet}

In [60]:
print(bowA)
print(wordDictA)
print(bowB)
print(wordDictB)

['the', 'cat', 'sat', 'on', 'my', 'sofa']
{'the': 1, 'sofa': 1, 'my': 1, 'dog': 0, 'cat': 1, 'bed': 0, 'on': 1, 'sat': 1}
['the', 'dog', 'sat', 'on', 'my', 'bed']
{'the': 1, 'sofa': 0, 'my': 1, 'dog': 1, 'cat': 0, 'bed': 1, 'on': 1, 'sat': 1}


In [61]:
# Put them into a matrix
pd.DataFrame([wordDictA,wordDictB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0,1,0,1,1,1,1,1
1,1,0,1,1,1,1,0,1


### Python function to compute term frequency

In [62]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word in ``wordDict``.

    TF(word) = (occurrences of the word in the document)
               / (total number of words in the document)

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document as a list of words. Only its length is used, as
            the normalising document length.

    Returns:
        A new dict mapping each key of ``wordDict`` to its term frequency.
        For an empty ``bow`` every frequency is 0.0 rather than a
        ZeroDivisionError being raised.
    """
    bowCount = len(bow)
    print(bowCount)  # document length, echoed for inspection in the notebook
    if bowCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bowCount for word, count in wordDict.items()}

In [23]:
# Call the function on 1st document
tfbowA = computeTF(wordDictA,bowA)
tfbowA

6


{'the': 0.16666666666666666,
 'sofa': 0.16666666666666666,
 'my': 0.16666666666666666,
 'dog': 0.0,
 'cat': 0.16666666666666666,
 'bed': 0.0,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [24]:
#call the function on 2nd document
tfbowB = computeTF(wordDictB,bowB)
tfbowB

6


{'the': 0.16666666666666666,
 'sofa': 0.0,
 'my': 0.16666666666666666,
 'dog': 0.16666666666666666,
 'cat': 0.0,
 'bed': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

##### Compute IDF = log(no of documents / count(documents in which term T occurs))

In [25]:
import math
def computeIDF(docList):
    """Compute the inverse document frequency of every word in the corpus.

    IDF(word) = log_e(N / number of documents containing the word),
    where N is ``len(docList)``.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        A dict mapping every word seen in any document to its IDF. A word
        whose count is 0 in every document gets 0.0 (its TF, and therefore
        its TF-IDF, is 0 everywhere) instead of raising ZeroDivisionError.
        An empty ``docList`` yields an empty dict.
    """
    N = len(docList)
    print(N)

    # Start every word at a document frequency of zero. Keys are collected
    # from all documents, not just the first, so an empty list or documents
    # with differing vocabularies do not raise.
    docFreq = {}
    for doc in docList:
        for word in doc:
            docFreq.setdefault(word, 0)
    print(docFreq)

    # Count the number of documents that contain each word.
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                docFreq[word] += 1

    # Divide N by the document frequency and take the natural log.
    return {
        word: (math.log(N / freq) if freq > 0 else 0.0)
        for word, freq in docFreq.items()
    }

In [26]:
print(wordDictA)
print(wordDictB)

{'the': 1, 'sofa': 1, 'my': 1, 'dog': 0, 'cat': 1, 'bed': 0, 'on': 1, 'sat': 1}
{'the': 1, 'sofa': 0, 'my': 1, 'dog': 1, 'cat': 0, 'bed': 1, 'on': 1, 'sat': 1}


In [27]:
### Call computeIDF() function, send the documents as a list as input parameters
idfs = computeIDF([wordDictA,wordDictB])
idfs

2
{'the': 0, 'sofa': 0, 'my': 0, 'dog': 0, 'cat': 0, 'bed': 0, 'on': 0, 'sat': 0}


{'the': 0.0,
 'sofa': 0.6931471805599453,
 'my': 0.0,
 'dog': 0.6931471805599453,
 'cat': 0.6931471805599453,
 'bed': 0.6931471805599453,
 'on': 0.0,
 'sat': 0.0}

### Compute TF * IDF

In [28]:
def computeTFIDF(tfBow,idfs):
    """Weight each term frequency by the term's inverse document frequency.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> IDF; it must contain every key of ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq*idfs[term] for term, freq in tfBow.items()}

### TFIDF for terms in document A

In [29]:
tfIDFA = computeTFIDF(tfbowA,idfs)
tfIDFA

{'the': 0.0,
 'sofa': 0.11552453009332421,
 'my': 0.0,
 'dog': 0.0,
 'cat': 0.11552453009332421,
 'bed': 0.0,
 'on': 0.0,
 'sat': 0.0}

### TFIDF for terms in document B

In [30]:
tfIDFB = computeTFIDF(tfbowB, idfs)
tfIDFB

{'the': 0.0,
 'sofa': 0.0,
 'my': 0.0,
 'dog': 0.11552453009332421,
 'cat': 0.0,
 'bed': 0.11552453009332421,
 'on': 0.0,
 'sat': 0.0}

In [31]:
pd.DataFrame([tfIDFA,tfIDFB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0.0,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0
1,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0,0.0
