### Simple TF-IDF

In [152]:
import pandas as pd
import math

In [153]:
# Toy corpus: three short documents (d1-d3) and one query (q).
d1 = "Shipment of gold damaged in a fire."
d2 = "Delivery of silver arrived in a silver truck"
d3 = "Shipment of gold arrived in a truck"
q = "gold silver truck"

In [154]:
# Whitespace tokenization only: case and punctuation are kept as-is,
# so "Shipment" and "fire." are tokens in their own right.
bagOfWordsd1, bagOfWordsd2, bagOfWordsd3, bagOfWordsq = (
    text.split(' ') for text in (d1, d2, d3, q)
)

In [155]:
# Vocabulary: every distinct token seen in the three documents or the query.
dw = set(bagOfWordsd1) | set(bagOfWordsd2) | set(bagOfWordsd3) | set(bagOfWordsq)

In [156]:
def countWords(bagOfWords, vocabulary):
    """Count how often each vocabulary word occurs in a token list.

    Args:
        bagOfWords: list of tokens from one text.
        vocabulary: iterable of all words that should appear as keys.

    Returns:
        dict mapping every word in ``vocabulary`` to its number of
        occurrences in ``bagOfWords`` (0 when absent).

    Raises:
        KeyError: if ``bagOfWords`` contains a token missing from ``vocabulary``.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for w in bagOfWords:
        counts[w] += 1
    return counts


# One count vector per text, all sharing the same vocabulary keys.
numOfWordsd1 = countWords(bagOfWordsd1, dw)
numOfWordsd2 = countWords(bagOfWordsd2, dw)
numOfWordsd3 = countWords(bagOfWordsd3, dw)
numOfWordsq = countWords(bagOfWordsq, dw)

In [157]:
numOfWordsq

{'silver': 1,
 'Shipment': 0,
 'damaged': 0,
 'Delivery': 0,
 'truck': 1,
 'fire.': 0,
 'gold': 1,
 'of': 0,
 'a': 0,
 'arrived': 0,
 'in': 0}

In [158]:
def TF(wordDict, bagOfWords, normalize=False):
    """Compute the term frequency of every word in ``wordDict``.

    By default the raw occurrence count is used as the term frequency.

    Args:
        wordDict: dict mapping word -> occurrence count in one text.
        bagOfWords: list of tokens of that text; only consulted when
            ``normalize`` is True.
        normalize: when True, divide each count by ``len(bagOfWords)``.
            Defaults to False (raw counts).

    Returns:
        A new dict mapping word -> term frequency.
    """
    if not normalize:
        return dict(wordDict)

    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty text has no terms; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

In [159]:
# Term-frequency vector for each document and for the query.
tfd1, tfd2, tfd3, tfq = (
    TF(counts, bag)
    for counts, bag in (
        (numOfWordsd1, bagOfWordsd1),
        (numOfWordsd2, bagOfWordsd2),
        (numOfWordsd3, bagOfWordsd3),
        (numOfWordsq, bagOfWordsq),
    )
)

In [160]:
tfd1

{'silver': 0,
 'Shipment': 1,
 'damaged': 1,
 'Delivery': 0,
 'truck': 0,
 'fire.': 1,
 'gold': 1,
 'of': 1,
 'a': 1,
 'arrived': 0,
 'in': 1}

In [161]:
# Raw document strings; the IDF computation below is fed the per-document count dicts instead.
documents = [d1,d2,d3]

In [162]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    IDF is ``log10(N / df)`` where ``N`` is the number of documents and
    ``df`` is the number of documents in which the word's count is > 0.

    Args:
        documents: list of dicts, each mapping word -> occurrence count
            for one document.

    Returns:
        dict mapping every word seen as a key in any document to its IDF.
        A word that occurs in no document (df == 0) gets 0.0 instead of
        raising ZeroDivisionError. An empty ``documents`` list yields ``{}``.
    """
    N = len(documents)

    # Document frequency per word. Keys are collected from every document,
    # so dicts with differing key sets are handled too.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    return {
        word: math.log10(N / float(freq)) if freq > 0 else 0.0
        for word, freq in docFreq.items()
    }

In [163]:
# IDF is computed over the three documents only; the query's counts are not part of the corpus.
idfs = computeIDF([numOfWordsd1, numOfWordsd2, numOfWordsd3])

In [164]:
idfs

{'silver': 0.47712125471966244,
 'Shipment': 0.17609125905568124,
 'damaged': 0.47712125471966244,
 'Delivery': 0.47712125471966244,
 'truck': 0.17609125905568124,
 'fire.': 0.47712125471966244,
 'gold': 0.17609125905568124,
 'of': 0.0,
 'a': 0.0,
 'arrived': 0.17609125905568124,
 'in': 0.0}

In [165]:
def TFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the term's inverse document frequency.

    Args:
        tfBagOfWords: dict mapping word -> term frequency for one text.
        idfs: dict mapping word -> IDF; must contain every word in
            ``tfBagOfWords``.

    Returns:
        dict mapping word -> tf * idf, with the same keys as ``tfBagOfWords``.
    """
    return {term: tfBagOfWords[term] * idfs[term] for term in tfBagOfWords}

In [166]:
# TF-IDF vector for each document and for the query, all weighted with the same IDFs.
tfidfd1, tfidfd2, tfidfd3, tfidfq = (
    TFIDF(tf, idfs) for tf in (tfd1, tfd2, tfd3, tfq)
)

# One row per text: rows 0-2 are d1-d3, row 3 is the query.
tfidfRows = [tfidfd1, tfidfd2, tfidfd3, tfidfq]
df = pd.DataFrame(tfidfRows)

In [167]:
df

Unnamed: 0,silver,Shipment,damaged,Delivery,truck,fire.,gold,of,a,arrived,in
0,0.0,0.176091,0.477121,0.0,0.0,0.477121,0.176091,0.0,0.0,0.0,0.0
1,0.954243,0.0,0.0,0.477121,0.176091,0.0,0.0,0.0,0.0,0.176091,0.0
2,0.0,0.176091,0.0,0.0,0.176091,0.0,0.176091,0.0,0.0,0.176091,0.0
3,0.477121,0.0,0.0,0.0,0.176091,0.0,0.176091,0.0,0.0,0.0,0.0


In [168]:
import numpy as np
# Rows 0-2 of the TF-IDF table are the documents; row 3 is the query.
d1_vec, d2_vec, d3_vec, q_vec = (df.values[row] for row in range(4))

docs = [d1_vec, d2_vec, d3_vec]

In [169]:
q_vec

array([0.47712125, 0.        , 0.        , 0.        , 0.17609126,
       0.        , 0.17609126, 0.        , 0.        , 0.        ,
       0.        ])

In [170]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(d, q):
    """Return the cosine similarity between vectors ``d`` and ``q``.

    Args:
        d: 1-D array-like (document vector).
        q: 1-D array-like of the same length (query vector).

    Returns:
        ``dot(d, q) / (norm(d) * norm(q))``, or 0.0 when either vector has
        zero norm (the similarity is undefined there; 0.0 avoids a
        division by zero that would otherwise produce ``nan``).
    """
    denominator = norm(d) * norm(q)
    if denominator == 0:
        return 0.0
    return dot(d, q) / denominator

In [171]:
cos_sim(d1_vec, q_vec)

0.08010451753994625

In [172]:
# Cosine similarity of the query against each document, in d1, d2, d3 order.
l = [cos_sim(doc_vec, q_vec) for doc_vec in docs]

In [173]:
l

[0.08010451753994625, 0.8247514231034946, 0.32718457421366]

---

### Scoring of TF-IDF

Cosine similarity between the query and $d_1$ for several variants of $d_1$:

- $d_1$ unchanged (normal): $0.08$
- $d_1$ with one extra "Fire", lowercase: $0.05$
- $d_1$ with one extra "Fire", normal: $0.07$
- $d_1$ with two extra "Fire", lowercase: $0.04$
- $d_1$ with two extra "Fire", normal: $0.05$
- $d_1$ with one extra "Gold", lowercase: $0.07$
- $d_1$ with one extra "Gold", normal: $0.07$
- $d_1$ with two extra "Gold", lowercase: $0.05$
- $d_1$ with two extra "Gold", normal: $0.05$