### Simple TF-IDF

In [1]:
import math

import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm

In [2]:
# Toy corpus: three short documents plus one free-text query.
d1, d2, d3 = (
    "Shipment of gold damaged in a fire.",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
)
q = "gold silver truck"

documents = [d1, d2, d3]

In [3]:
# Tokenise on single spaces only; punctuation stays attached to its word
# (d1 ends in "fire.", so that token keeps its period).
b1, b2, b3, bq = (text.split(' ') for text in (d1, d2, d3, q))

In [4]:
# Lower-case every token so that term matching is case-insensitive.
# One comprehension per token list replaces four copy-pasted append loops
# and keeps the loop variable out of the notebook namespace.
bowd1 = [w.lower() for w in b1]
bowd2 = [w.lower() for w in b2]
bowd3 = [w.lower() for w in b3]
bowq = [w.lower() for w in bq]

In [5]:
# Shared vocabulary: every distinct token seen in any document or in the query.
dw = set(bowd1) | set(bowd2) | set(bowd3) | set(bowq)

In [6]:
# Inspect the vocabulary. Tokens were split on spaces only, so "fire." keeps
# its trailing period.
dw

{'a',
 'arrived',
 'damaged',
 'delivery',
 'fire.',
 'gold',
 'in',
 'of',
 'shipment',
 'silver',
 'truck'}

In [7]:
def count_terms(vocabulary, tokens):
    """Count how often each vocabulary term occurs in ``tokens``.

    Parameters
    ----------
    vocabulary : iterable of str
        Every term that should appear as a key in the result.
    tokens : iterable of str
        The tokens of one text; each token must be in ``vocabulary``.

    Returns
    -------
    dict
        ``{term: count}`` with an entry (possibly 0) for every vocabulary term.

    Raises
    ------
    KeyError
        If a token is not part of ``vocabulary``.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for token in tokens:
        counts[token] += 1
    return counts


# Term counts over the shared vocabulary for each document and for the query.
nowd1 = count_terms(dw, bowd1)
nowd2 = count_terms(dw, bowd2)
nowd3 = count_terms(dw, bowd3)
nowq = count_terms(dw, bowq)

In [8]:
# Inspect the query's term counts over the shared vocabulary.
nowq

{'delivery': 0,
 'fire.': 0,
 'of': 0,
 'arrived': 0,
 'in': 0,
 'a': 0,
 'damaged': 0,
 'shipment': 0,
 'gold': 1,
 'silver': 1,
 'truck': 1}

In [9]:
def TF(wd, bow):
    """Return the raw term frequency of every word in ``wd``.

    The "simple" scheme used here takes the plain occurrence count as the
    term frequency; nothing is divided by the document length.

    Parameters
    ----------
    wd : dict
        ``{word: count}`` mapping for one text.
    bow : iterable of str
        The tokens of the same text. Not used by this raw-count scheme; the
        parameter is kept so existing calls keep working.

    Returns
    -------
    dict
        A new ``{word: count}`` dict; ``wd`` itself is not modified.
    """
    return dict(wd)

In [10]:
# Term-frequency dictionaries for the three documents and for the query.
tfd1, tfd2, tfd3, tfq = (
    TF(counts, tokens)
    for counts, tokens in (
        (nowd1, bowd1),
        (nowd2, bowd2),
        (nowd3, bowd3),
        (nowq, bowq),
    )
)

In [11]:
# Inspect d1's term frequencies (raw counts).
tfd1

{'delivery': 0,
 'fire.': 1,
 'of': 1,
 'arrived': 0,
 'in': 1,
 'a': 1,
 'damaged': 1,
 'shipment': 1,
 'gold': 1,
 'silver': 0,
 'truck': 0}

In [12]:
# NOTE(review): this repeats the `documents` assignment made right after the
# corpus strings were defined; the IDF call below is given the per-document
# count dictionaries, not this list of raw strings.
documents = [d1,d2,d3]

In [13]:
def IDF(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Parameters
    ----------
    documents : list of dict
        One ``{term: count}`` mapping per document.

    Returns
    -------
    dict
        ``{term: log10(N / df)}`` where ``N`` is the number of documents and
        ``df`` is the number of documents in which the term's count is > 0.
        A term that occurs in no document is given ``0.0`` (so it contributes
        no TF-IDF weight) instead of triggering a division by zero. An empty
        corpus yields ``{}``.
    """
    N = len(documents)

    # Document frequency per term. Keys are collected from every document,
    # so a term missing from the first document no longer raises KeyError.
    doc_freq = {}
    for document in documents:
        for word, val in document.items():
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    return {
        word: math.log10(N / freq) if freq else 0.0
        for word, freq in doc_freq.items()
    }

In [14]:
# IDF is computed from the three documents' counts only; the query counts
# (nowq) are not part of the corpus passed in.
idf_dict = IDF([nowd1, nowd2, nowd3])

In [15]:
# Inspect the IDF weights; a term found in all three documents gets log10(3/3) = 0.
idf_dict

{'delivery': 0.47712125471966244,
 'fire.': 0.47712125471966244,
 'of': 0.0,
 'arrived': 0.17609125905568124,
 'in': 0.0,
 'a': 0.0,
 'damaged': 0.47712125471966244,
 'shipment': 0.17609125905568124,
 'gold': 0.17609125905568124,
 'silver': 0.47712125471966244,
 'truck': 0.17609125905568124}

In [16]:
def TFIDF(tfbow, idf_dict):
    """Weight each term frequency by the term's inverse document frequency.

    Parameters
    ----------
    tfbow : dict
        ``{term: term_frequency}`` for one text.
    idf_dict : dict
        ``{term: idf}``; must contain every term of ``tfbow``.

    Returns
    -------
    dict
        ``{term: term_frequency * idf}`` in the key order of ``tfbow``.
    """
    return {term: freq * idf_dict[term] for term, freq in tfbow.items()}

In [17]:
# TF-IDF weights for each document and for the query, all using the corpus IDF.
tfidfd1, tfidfd2, tfidfd3, tfidfq = (
    TFIDF(tf, idf_dict) for tf in (tfd1, tfd2, tfd3, tfq)
)

# One row per text: rows 0-2 are d1-d3, row 3 is the query.
df = pd.DataFrame([tfidfd1, tfidfd2, tfidfd3, tfidfq])

In [18]:
# Show the TF-IDF table: rows 0-2 are d1-d3, row 3 is the query.
df

Unnamed: 0,delivery,fire.,of,arrived,in,a,damaged,shipment,gold,silver,truck
0,0.0,0.477121,0.0,0.0,0.0,0.0,0.477121,0.176091,0.176091,0.0,0.0
1,0.477121,0.0,0.0,0.176091,0.0,0.0,0.0,0.0,0.0,0.954243,0.176091
2,0.0,0.0,0.0,0.176091,0.0,0.0,0.0,0.176091,0.176091,0.0,0.176091
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176091,0.477121,0.176091


In [19]:
# Split the TF-IDF table into one row vector per text. The DataFrame has
# exactly four rows (d1, d2, d3, query), so its array is materialised once
# and unpacked row by row.
d1_vec, d2_vec, d3_vec, q_vec = df.values

docs = [d1_vec, d2_vec, d3_vec]

In [20]:
# Inspect the query's TF-IDF vector (last row of the table).
q_vec

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.17609126, 0.47712125,
       0.17609126])

In [21]:
def cos_sim(d, q):
    """Return the cosine similarity between the vectors ``d`` and ``q``.

    Parameters
    ----------
    d, q : array-like of float
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(d, q) / (norm(d) * norm(q))``. If either vector has zero norm
        the similarity is undefined; ``0.0`` is returned in that case rather
        than dividing by zero and producing ``nan``.
    """
    denominator = norm(d) * norm(q)
    if denominator == 0:
        # An all-zero vector shares no weighted term with anything.
        return 0.0
    return dot(d, q) / denominator

In [22]:
# Similarity between the first document and the query.
cos_sim(d1_vec, q_vec)

0.08010451753994625

In [23]:
# Cosine similarity of the query against every document, in d1, d2, d3 order.
l = [cos_sim(doc_vec, q_vec) for doc_vec in docs]
l

[0.08010451753994625, 0.8247514231034946, 0.32718457421366]

In [24]:
# Print each document's similarity score rounded to two decimals.
for i in l:
    print(round(i, 2))

0.08
0.82
0.33


---

### Normalised TF-IDF

- The score with $d_1$ unchanged: $0.08$
- The score if we have $d_1$ as extra Fire using lowercase: $0.05$
- The score if we have $d_1$ as extra Fire using normal: $0.07$
- The score if we have $d_1$ as 2 extra Fire using lowercase: $0.04$
- The score if we have $d_1$ as 2 extra Fire using normal: $0.05$
- The score if we have $d_1$ as extra Gold using lowercase: $0.07$
- The score if we have $d_1$ as extra Gold using normal: $0.07$
- The score if we have $d_1$ as 2 extra Gold using lowercase: $0.05$
- The score if we have $d_1$ as 2 extra Gold using normal: $0.05$