Code adapted from: https://www.kaggle.com/uthamkanth/beginner-tf-idf-and-cosine-similarity-from-scratch

In [1]:
import math
import pandas as pd
import numpy as np

In [2]:
# Toy corpus: three short documents, passed together as [d1, d2, d3] to the
# functions defined in the following cells.
d1 = "Shipment of gold damaged in a fire. Gold."
d2 = "Delivery of silver arrived in a silver truck"
d3 = "Shipment of gold arrived in a truck"
# Query string whose whitespace-separated terms are scored against the corpus.
q = "gold silver truck"

In [3]:
def compute_tf(docs_list):
    """Print a one-row table of raw term counts for each document.

    Tokens are the whitespace-separated pieces of each document, taken
    verbatim (no lower-casing, no punctuation stripping), so ``"gold"`` and
    ``"Gold."`` are counted as different terms.

    Parameters
    ----------
    docs_list : iterable of str
        The documents to tabulate.

    Returns
    -------
    None
        The tables are printed, one per document, in input order.
    """
    for doc in docs_list:
        # split() without an argument collapses runs of whitespace, so repeated
        # spaces never produce an empty-string "term" (and this matches the
        # tokenisation used by the other functions in this notebook).
        counts = {}
        for token in doc.split():
            # A plain dict keeps first-seen order, so the printed columns are
            # the same on every run (a set's iteration order is not).
            counts[token] = counts.get(token, 0) + 1

        table = pd.DataFrame([counts])
        table.insert(loc=0, column='Document', value=["Term Frequency"])
        print(table)
        
# Print one raw term-count table per document of the corpus.
compute_tf([d1, d2, d3])

         Document  gold  damaged  Shipment  of  Gold.  fire.  a  in
0  Term Frequency     1        1         1   1      1      1  1   1
         Document  truck  silver  of  Delivery  a  arrived  in
0  Term Frequency      1       2   1         1  1        1   1
         Document  gold  truck  Shipment  of  a  arrived  in
0  Term Frequency     1      1         1   1  1        1   1


In [4]:
def termFrequency(term, document):
    """Return how often ``term`` occurs in ``document``, divided by its length.

    Both the term and the document are lower-cased, and the document is split
    on whitespace; the length is the number of resulting tokens.

    Parameters
    ----------
    term : str
        The term to count.
    document : str
        The text to search.

    Returns
    -------
    float
        Occurrences of ``term`` divided by the token count of ``document``.
        A document with no tokens yields ``0.0``.
    """
    tokens = document.lower().split()
    if not tokens:
        # Empty or whitespace-only text: nothing can occur in it, and dividing
        # by the zero length would raise ZeroDivisionError.
        return 0.0
    return tokens.count(term.lower()) / float(len(tokens))

def compute_normalizedtf(documents):
    """Compute and print the length-normalised term frequencies per document.

    For every whitespace-separated token of a document the value is the number
    of tokens equal to it after lower-casing, divided by the document's token
    count (the same quantity ``termFrequency`` returns). Keys keep the token's
    original spelling, so ``"Gold"`` and ``"gold"`` are separate keys that
    share one value.

    Parameters
    ----------
    documents : iterable of str
        The documents to process.

    Returns
    -------
    list of dict
        One ``{token: normalised frequency}`` dict per document, in input
        order. Each dict is also printed as a one-row table.
    """
    tf_per_doc = []
    for txt in documents:
        # Lower-case and count the document once, instead of re-splitting the
        # whole text for every single token.
        lowered = txt.lower().split()
        total = float(len(lowered))
        lowered_counts = {}
        for token in lowered:
            lowered_counts[token] = lowered_counts.get(token, 0) + 1

        # Filling a plain dict in reading order keeps the printed columns in a
        # stable, first-seen order on every run.
        norm_tf = {}
        for word in txt.split():
            norm_tf[word] = lowered_counts.get(word.lower(), 0) / total
        tf_per_doc.append(norm_tf)

        table = pd.DataFrame([norm_tf])
        table.insert(loc=0, column='Document', value=["Normalized TF"])
        print(table)
    return tf_per_doc

# Keep the per-document normalised TF dicts at module level:
# compute_tfidf_with_alldocs looks them up through the name `tf_doc`.
tf_doc = compute_normalizedtf([d1, d2, d3])

        Document   gold  damaged  Shipment     of  Gold.  fire.      a     in
0  Normalized TF  0.125    0.125     0.125  0.125  0.125  0.125  0.125  0.125
        Document  truck  silver     of  Delivery      a  arrived     in
0  Normalized TF  0.125    0.25  0.125     0.125  0.125    0.125  0.125
        Document      gold     truck  Shipment        of         a   arrived  \
0  Normalized TF  0.142857  0.142857  0.142857  0.142857  0.142857  0.142857   

         in  
0  0.142857  


In [5]:
def inverseDocumentFrequency(term, allDocuments):
    """Return the smoothed inverse document frequency of ``term``.

    A document contains the term when the lower-cased term equals one of the
    document's lower-cased, whitespace-separated tokens.

    Parameters
    ----------
    term : str
        The term to look for.
    allDocuments : sequence of str
        The corpus.

    Returns
    -------
    float
        ``1 + ln(N / n)`` with ``N`` the corpus size and ``n`` the number of
        documents containing the term; ``1.0`` when no document contains it.
    """
    needle = term.lower()
    containing = sum(
        1 for text in allDocuments if needle in text.lower().split()
    )

    # A term absent from every document gets the neutral weight 1.0.
    if containing == 0:
        return 1.0
    return 1.0 + math.log(float(len(allDocuments)) / containing)
    
def compute_idf(documents):
    """Build the IDF table for every token that occurs in the corpus.

    Parameters
    ----------
    documents : sequence of str
        The corpus; each document is split on whitespace and tokens are used
        as keys with their original spelling.

    Returns
    -------
    dict
        ``{token: inverseDocumentFrequency(token, documents)}``, keyed in
        order of first appearance.
    """
    return {
        word: inverseDocumentFrequency(word, documents)
        for text in documents
        for word in text.split()
    }
# Computed once and kept at module level: compute_tfidf_with_alldocs looks the
# weights up through the name `idf_dict`.
idf_dict = compute_idf([d1, d2, d3])

# Display the table that was just built instead of computing it a second time.
idf_dict

{'Shipment': 1.4054651081081644,
 'of': 1.0,
 'gold': 1.4054651081081644,
 'damaged': 2.09861228866811,
 'in': 1.0,
 'a': 1.0,
 'fire.': 2.09861228866811,
 'Gold.': 2.09861228866811,
 'Delivery': 2.09861228866811,
 'silver': 2.09861228866811,
 'arrived': 1.4054651081081644,
 'truck': 1.4054651081081644}

In [6]:
# TF-IDF score of every query term in every document
def compute_tfidf_with_alldocs(documents, query, tf_docs=None, idf=None):
    """Score each query term against each document.

    A query term scores ``tf * idf`` in a document when it equals one of the
    document's whitespace-separated tokens exactly (case-sensitive); otherwise
    it scores 0.

    Parameters
    ----------
    documents : sequence of str
        The corpus.
    query : str
        Whitespace-separated query terms. Repeated terms share one column.
    tf_docs : sequence of dict, optional
        Per-document ``{token: normalised TF}`` dicts, aligned with
        ``documents``. Defaults to the notebook-level ``tf_doc``.
    idf : dict, optional
        ``{token: IDF}``. Defaults to the notebook-level ``idf_dict``.

    Returns
    -------
    (list, pandas.DataFrame)
        The list holds one score per matching token occurrence, in reading
        order (a term that appears twice in a document contributes twice).
        The frame has a ``doc`` column with the document position and one
        float column per query term.

    Raises
    ------
    KeyError
        If a matching token is missing from ``tf_docs`` or ``idf``.
    """
    if tf_docs is None:
        tf_docs = tf_doc
    if idf is None:
        idf = idf_dict

    # dict.fromkeys removes repeated query terms while keeping their order.
    query_tokens = list(dict.fromkeys(query.split()))

    tf_idf = []
    rows = []
    for index, doc in enumerate(documents):
        doc_tf = tf_docs[index]
        scores = dict.fromkeys(query_tokens, 0.0)
        for word in doc.split():
            if word in scores:
                tf_idf_score = doc_tf[word] * idf[word]
                tf_idf.append(tf_idf_score)
                scores[word] = tf_idf_score
        rows.append([index] + [scores[token] for token in query_tokens])

    # Build the frame in one go from complete rows: the score columns come out
    # as floats directly, with no NaN placeholders to fill in afterwards.
    df = pd.DataFrame(rows, columns=['doc'] + query_tokens)
    return tf_idf, df
            
# Score the query against the corpus. `documents` and `df` stay at module
# level because rank_similarity_docs is later called with / reads them.
documents = [d1, d2, d3]
tf_idf , df = compute_tfidf_with_alldocs(documents , q)
print(df)

   doc      gold    silver     truck
0    0  0.175683  0.000000  0.000000
1    1  0.000000  0.524653  0.175683
2    2  0.200781  0.000000  0.200781


In [7]:
def compute_query_tf(q):
    """Return the normalised term frequency of each term within the query.

    Parameters
    ----------
    q : str
        Whitespace-separated query terms.

    Returns
    -------
    dict
        ``{term: termFrequency(term, q)}``, keyed in query order.
    """
    return {term: termFrequency(term, q) for term in q.split()}
# Kept at module level: compute_query_tfidf reads `query_norm_tf`.
query_norm_tf = compute_query_tf(q)
print(query_norm_tf)

{'gold': 0.3333333333333333, 'silver': 0.3333333333333333, 'truck': 0.3333333333333333}


In [8]:
def cosine_similarity(tfidf_dict_qry, df , query , doc_num):
    """Cosine similarity between the query vector and one document's vector.

    Both vectors are indexed by the whitespace-separated terms of ``query``.

    Parameters
    ----------
    tfidf_dict_qry : dict
        ``{term: TF-IDF weight}`` of the query.
    df : pandas.DataFrame
        Has a ``doc`` column and one numeric column per query term.
    query : str
        Whitespace-separated query terms.
    doc_num : int
        Value of the ``doc`` column that selects the document.

    Returns
    -------
    pandas.Series
        One similarity per row whose ``doc`` equals ``doc_num`` (normally a
        single element). The similarity is ``0.0`` when either vector has zero
        length, e.g. when the document contains none of the query terms.
    """
    tokens = query.split()
    # Select the document's row(s) once; the selection does not depend on the
    # term being processed.
    rows = df[df['doc'] == doc_num]

    dot_product = pd.Series(0.0, index=rows.index)
    doc_sq = pd.Series(0.0, index=rows.index)
    qry_sq = 0.0
    for keyword in tokens:
        qry_weight = tfidf_dict_qry[keyword]
        # astype(float) keeps np.sqrt usable even if the column is stored with
        # object dtype.
        doc_weight = rows[keyword].astype(float)
        dot_product = dot_product + qry_weight * doc_weight
        # ||Query||^2
        qry_sq += qry_weight * qry_weight
        # ||Document||^2
        doc_sq = doc_sq + doc_weight * doc_weight

    denominator = np.sqrt(qry_sq) * np.sqrt(doc_sq)
    cos_sim = dot_product / denominator
    # 0/0 would be NaN; a zero-length vector is reported as "no similarity".
    return cos_sim.where(denominator != 0, 0.0)

# The abstract base classes live in collections.abc; the old alias
# `collections.Iterable` no longer exists on Python 3.10 and later.
from collections.abc import Iterable

def flatten(lis):
    """Yield the leaf items of an arbitrarily nested iterable, depth first.

    Strings are treated as leaves and are yielded whole; every other iterable
    item is descended into.

    Parameters
    ----------
    lis : iterable
        The (possibly nested) collection to flatten.

    Yields
    ------
    object
        Each non-iterable item, or string, in left-to-right order.
    """
    for item in lis:
        if isinstance(item, Iterable) and not isinstance(item, str):
            yield from flatten(item)
        else:
            yield item

  from collections import Iterable


In [11]:
def compute_query_idf(q, documents=None):
    """Return the IDF of each query term with respect to a corpus.

    Parameters
    ----------
    q : str
        Whitespace-separated query terms.
    documents : sequence of str, optional
        The corpus. Defaults to the notebook's three documents
        ``[d1, d2, d3]``.

    Returns
    -------
    dict
        ``{term: inverseDocumentFrequency(term, documents)}``, keyed in query
        order.
    """
    if documents is None:
        documents = [d1, d2, d3]
    return {
        word: inverseDocumentFrequency(word, documents) for word in q.split()
    }
# Kept at module level: compute_query_tfidf reads `idf_dict_qry`.
idf_dict_qry = compute_query_idf(q)
print(idf_dict_qry)

{'gold': 1.4054651081081644, 'silver': 2.09861228866811, 'truck': 1.4054651081081644}


In [12]:
def compute_query_tfidf(q):
    """Return the TF-IDF weight of each query term.

    Multiplies the entries of the notebook-level dicts ``query_norm_tf`` and
    ``idf_dict_qry``; both must already hold every term of ``q``.

    Parameters
    ----------
    q : str
        Whitespace-separated query terms.

    Returns
    -------
    dict
        ``{term: query_norm_tf[term] * idf_dict_qry[term]}``, keyed in query
        order.
    """
    return {
        term: query_norm_tf[term] * idf_dict_qry[term] for term in q.split()
    }
# Kept at module level: rank_similarity_docs passes `tfidf_dict_qry` to
# cosine_similarity.
tfidf_dict_qry = compute_query_tfidf(q)
print(tfidf_dict_qry)

{'gold': 0.4684883693693881, 'silver': 0.6995374295560366, 'truck': 0.4684883693693881}


In [13]:
def rank_similarity_docs(data):
    """Compute the query/document cosine similarity for every document.

    Uses the notebook-level ``tfidf_dict_qry``, ``df`` and ``q``. Despite the
    name, the result is not sorted: entry ``i`` belongs to document ``i``.

    Parameters
    ----------
    data : sized collection
        Only its length is used, as the number of documents to score.

    Returns
    -------
    list
        For each document position, the ``.tolist()`` of what
        ``cosine_similarity`` returns for it.
    """
    return [
        cosine_similarity(tfidf_dict_qry, df, q, position).tolist()
        for position in range(len(data))
    ]
# Score every document against the query, then print the document labels
# followed by the flattened scores in the same order.
similarity_docs = rank_similarity_docs(documents)
doc_names = ["Document1", "Document2", "Document3"]
print(doc_names)
print(list(flatten(similarity_docs)))

['Document1', 'Document2', 'Document3']
[0.4862404165915704, 0.842865485453629, 0.6876477917177425]


In [14]:
# NOTE(review): this is a line-for-line re-declaration of the
# rank_similarity_docs defined in the previous cell; it silently replaces that
# definition with an identical one and can be deleted.
def rank_similarity_docs(data):
    """Return the query/document cosine similarity for each document position.

    Reads the notebook-level ``tfidf_dict_qry``, ``df`` and ``q``; ``data`` is
    only used for its length. The result is in document order, not sorted.
    """
    cos_sim =[]
    for doc_num in range(0 , len(data)):
        # cosine_similarity's result is converted with .tolist(), so every
        # entry appended here is itself a list.
        cos_sim.append(cosine_similarity(tfidf_dict_qry, df , q , doc_num).tolist())
    return cos_sim
# NOTE(review): these statements repeat the previous cell unchanged and print
# the same two lines again.
similarity_docs = rank_similarity_docs(documents)
doc_names = ["Document1", "Document2", "Document3"]
print(doc_names)
print(list(flatten(similarity_docs)))

['Document1', 'Document2', 'Document3']
[0.4862404165915704, 0.842865485453629, 0.6876477917177425]
