# 3 Vector-Based Methods for Similarity Search
- TF-IDF    (Sparse)
- BM25      (Sparse)
- SBERT     (Dense)

## TF-IDF

In [28]:
# Toy corpus: three sentences, each split on whitespace into a list of tokens.
a, b, c = (
    text.split()
    for text in (
        "purple is the best city in the forest",
        "there is am art to getting your way and throwing bananas on to the street is not it",
        "it is not often you find soggy bananas on the street",
    )
)

In [33]:
def get_tf(query: str, document: list) -> float:
    """Return the term frequency of *query* in *document*.

    Term frequency is the number of occurrences of ``query`` divided by the
    total number of items in ``document``.

    Args:
        query: The term to count.
        document: The tokenised document (a list of terms). Any object with
            ``count`` and ``len`` works; note that for a plain string this
            counts substring occurrences and divides by the character count.

    Returns:
        The term frequency as a float, or ``0.0`` for an empty document
        (instead of raising ``ZeroDivisionError``).
    """
    # Total number of terms in the document.
    total_terms = len(document)
    if total_terms == 0:
        # An empty document contains no terms at all.
        return 0.0

    # Occurrences of the query relative to the document length.
    return document.count(query) / total_terms
    


In [34]:
from math import log10, inf

def check_for_query(query: str, document: str):
    """Report whether *query* occurs in *document*.

    This is a plain ``in`` membership test: for a list of tokens it matches
    whole tokens, while for a string it matches substrings.
    """
    is_present = query in document
    return is_present

def get_idf(query: str, documents: list):
    """Return the inverse document frequency (IDF) of *query*.

    The IDF is ``log10(N / n_q)``, where ``N`` is the number of documents and
    ``n_q`` is the number of documents for which ``check_for_query`` is true.

    If no document contains the query (including an empty ``documents``
    list), a message is printed and ``math.inf`` is returned.
    """
    # N: size of the corpus.
    total_docs = len(documents)

    # n_q: how many documents contain the query.
    matching_docs = sum(
        1 for candidate in documents if check_for_query(query, candidate)
    )

    try:
        ratio = total_docs / matching_docs
    except ZeroDivisionError as err:
        print("None of the documents contain this query!!!", err)
        return inf

    return log10(ratio)

In [35]:
def tf_idf(query: str, documents: list, idx: int):
    """Return the TF-IDF score of *query* for ``documents[idx]``.

    Args:
        query: The term to score.
        documents: The corpus, a list of documents.
        idx: Index of the document for which the term frequency is computed.

    Returns:
        ``tf * idf`` for the selected document. ``0.0`` is returned when the
        term does not occur in that document. If ``idx`` is out of bounds a
        message is printed and ``-1`` is returned.
    """
    try:
        tf = get_tf(query, documents[idx])
        idf = get_idf(query, documents)
    except IndexError as e:
        print("Index out of bounds", e)
        return -1

    # A term found in no document has tf == 0 and idf == inf, and 0 * inf is
    # NaN. A term absent from the document should simply score zero.
    if tf == 0:
        return 0.0

    return tf * idf

In [36]:
# "is" appears in every document, so its IDF (and hence TF-IDF) is zero.
for doc_idx in range(3):
    print(tf_idf("is", [a, b, c], doc_idx))

# "forest" appears in a single document, so its IDF is positive.
print(get_idf("forest", [a, b, c]))

for doc_idx in range(3):
    print(tf_idf("forest", [a, b, c], doc_idx))

0.0
0.0
0.0
0.47712125471966244
0.059640156839957804
0.0
0.0


In [38]:
# Using numpy.
import numpy as np

# The corpus: the three tokenised sentences. tfidf() reads this module-level
# list when computing the inverse document frequency.
docs = [a, b, c]

def tfidf(word: str, sentence: list, documents: list = None):
    """Return the TF-IDF score of *word* in *sentence*, rounded to 4 decimals.

    Args:
        word: The term to score.
        sentence: The tokenised document the term frequency is computed for.
        documents: The corpus used for the inverse document frequency.
            Defaults to the module-level ``docs`` list.

    Returns:
        ``round(tf * idf, 4)`` where ``tf`` is the share of ``sentence``
        made up by ``word`` and ``idf`` is ``log10(N / n_word)``. Zero is
        returned when ``sentence`` is empty or when no document contains
        ``word`` (both cases would otherwise divide by zero).
    """
    if documents is None:
        documents = docs

    if len(sentence) == 0:
        return np.float64(0.0)

    # Term frequency
    tf = sentence.count(word) / len(sentence)

    # Number of documents containing the word.
    doc_freq = sum(1 for doc in documents if word in doc)
    if doc_freq == 0:
        # The word is unknown to the corpus, so tf is 0 as well; score it zero.
        return np.float64(0.0)

    # Inverse document frequency
    idf = np.log10(len(documents) / doc_freq)

    return round(tf * idf, 4)


In [39]:
# Score a term shared by all documents ("is") and a term unique to one
# document ("forest") against each of the first three documents in turn.
for term in ("is", "forest"):
    for tokens in docs[:3]:
        print(tfidf(term, tokens))

0.0
0.0
0.0
0.0596
0.0
0.0


In [41]:
# Converting TF-IDF to a vector.
# The vocabulary is the set of distinct tokens across all three documents.
vocab = {*a, *b, *c}
vocab

{'am',
 'and',
 'art',
 'bananas',
 'best',
 'city',
 'find',
 'forest',
 'getting',
 'in',
 'is',
 'it',
 'not',
 'often',
 'on',
 'purple',
 'soggy',
 'street',
 'the',
 'there',
 'throwing',
 'to',
 'way',
 'you',
 'your'}

In [42]:
# Calculate, for every word, the TF-IDF for every document.
# One TF-IDF vector per document, with one component per vocabulary word.
# The component order follows the iteration order of the `vocab` set and is
# the same for all three vectors.
vec_a, vec_b, vec_c = [], [], []

for word in vocab:
    for vector, tokens in ((vec_a, a), (vec_b, b), (vec_c, c)):
        vector.append(tfidf(word, tokens))


In [43]:
# Show the TF-IDF vector built for document a.
vec_a

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0596,
 0.0,
 0.0596,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0596,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0596,
 0.0596]

In [44]:
# Show the TF-IDF vector built for document b.
vec_b

[0.0,
 0.0265,
 0.0265,
 0.0,
 0.0265,
 0.0,
 0.0098,
 0.0098,
 0.0,
 0.0098,
 0.0,
 0.0098,
 0.0,
 0.0265,
 0.0,
 0.0265,
 0.0098,
 0.0265,
 0.0,
 0.053,
 0.0265,
 0.0,
 0.0265,
 0.0,
 0.0]

In [45]:
# Show the TF-IDF vector built for document c.
vec_c

[0.0434,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0434,
 0.016,
 0.016,
 0.0434,
 0.016,
 0.0,
 0.016,
 0.0,
 0.0,
 0.0,
 0.0,
 0.016,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0434,
 0.0,
 0.0,
 0.0]

## BM25