In [None]:
!pip install py_stringmatching

Collecting py_stringmatching
[?25l  Downloading https://files.pythonhosted.org/packages/90/d1/9163e0b0ac3bbb0f727ef8d380985c23066fb98d5005a34483ad76da06b4/py_stringmatching-0.4.2.tar.gz (661kB)
[K     |▌                               | 10kB 14.6MB/s eta 0:00:01[K     |█                               | 20kB 19.7MB/s eta 0:00:01[K     |█▌                              | 30kB 10.1MB/s eta 0:00:01[K     |██                              | 40kB 8.3MB/s eta 0:00:01[K     |██▌                             | 51kB 5.4MB/s eta 0:00:01[K     |███                             | 61kB 6.3MB/s eta 0:00:01[K     |███▌                            | 71kB 6.3MB/s eta 0:00:01[K     |████                            | 81kB 6.1MB/s eta 0:00:01[K     |████▌                           | 92kB 6.0MB/s eta 0:00:01[K     |█████                           | 102kB 5.2MB/s eta 0:00:01[K     |█████▌                          | 112kB 5.2MB/s eta 0:00:01[K     |██████                          | 122kB 5.

In [None]:
from py_stringmatching import utils
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.hybrid_similarity_measure import HybridSimilarityMeasure

In [None]:
from __future__ import division
from math import sqrt
import collections

from py_stringmatching import utils
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
                                                    HybridSimilarityMeasure


class SoftTfIdf(HybridSimilarityMeasure):
    """Soft TF/IDF similarity between two bags of tokens.

    Tokens of the first bag are paired with their most similar token in the
    second bag (according to a secondary similarity function); pairs whose
    similarity exceeds ``threshold`` contribute to a TF/IDF-weighted,
    cosine-style score.

    Note:
        * The measure is implemented without dampening (raw term frequency and
          raw ``corpus_size / document_frequency`` as IDF).
        * Both squared norms are seeded with 1.0 instead of 0.0 (see
          ``_NORM_SEED``). This keeps the denominator non-zero when none of
          the tokens of one bag occur in the corpus (e.g. a short free-text
          query such as ``['sha']``), at the cost of slightly lower scores
          than a plain cosine normalisation would give. Scores are therefore
          useful for ranking but are not guaranteed to lie in [0, 1].
          NOTE(review): this looks like a local deviation from the upstream
          py_stringmatching implementation - confirm before relying on
          absolute score values.

    Args:
        corpus_list (list of lists): Corpus of token lists (defaults to None).
                                     If None, the two input bags are treated
                                     as the only corpus.
        sim_func (function): Secondary similarity function taking two strings
                             and returning a score; defaults to the Jaro
                             similarity measure.
        threshold (float): Threshold for the secondary similarity function
                           (defaults to 0.5). A token pair counts as a match
                           only if its similarity is strictly greater than
                           this value.

    Attributes:
        sim_func (function): The secondary similarity function.
        threshold (float): The threshold for the secondary similarity function.
    """

    # Initial value of both squared norms; see the class-level note.
    _NORM_SEED = 1.0

    def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
                 threshold=0.5):
        self.__reset_corpus(corpus_list)
        self.sim_func = sim_func
        self.threshold = threshold
        super(SoftTfIdf, self).__init__()

    def get_raw_score(self, bag1, bag2):
        """Computes the raw soft TF/IDF score between two bags of tokens.

        The measure is not symmetric in general: each token of ``bag1`` is
        matched to its best partner in ``bag2``, not the other way round.

        Args:
            bag1,bag2 (list): Input token lists.

        Returns:
            Soft TF/IDF score between the input lists (float). Returns 1.0 if
            the bags match exactly and 0.0 if either bag is empty.

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is
                        None (raised by the ``py_stringmatching.utils``
                        validation helpers).

        Examples:
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
            >>> round(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']), 4)
            0.1612
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
            >>> round(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 4)
            0.378
            >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0

        References:
            * the string matching chapter of the "Principles of Data Integration" book.
        """

        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # identical bags are a perfect match by definition
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # an empty bag cannot match anything
        if utils.sim_check_for_empty(bag1, bag2):
            return 0.0

        # term frequencies of the two bags
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

        # document frequency over the two bags only (each bag is one document);
        # insertion order (bag1 tokens first, then new bag2 tokens) also fixes
        # the summation order below
        local_df = {}
        for term_counts in (tf_x, tf_y):
            for token in term_counts:
                local_df[token] = local_df.get(token, 0) + 1

        # without a corpus, the two bags themselves act as the corpus
        if self.__corpus_list is None:
            curr_df, corpus_size = local_df, 2
        else:
            curr_df, corpus_size = self.__document_frequency, self.__corpus_size

        # for every bag1 token keep its best bag2 partner, provided the
        # secondary similarity is strictly above the threshold
        best_match = {}
        for term_x in tf_x:
            max_score = 0.0
            for term_y in tf_y:
                score = self.sim_func(term_x, term_y)
                if score > self.threshold and score > max_score:
                    best_match[term_x] = (term_y, score)
                    max_score = score

        result = 0.0
        norm_x_sq = norm_y_sq = self._NORM_SEED
        for token in local_df:
            token_df = curr_df.get(token)
            # tokens unknown to the corpus carry no weight at all
            if token_df is None:
                continue

            idf = corpus_size / token_df
            weight_x = idf * tf_x.get(token, 0)
            weight_y = idf * tf_y.get(token, 0)

            # numerator: weight of the bag1 token times weight of its partner
            if token in best_match:
                partner, score = best_match[token]
                # a partner missing from the corpus is given document
                # frequency 1, i.e. the largest possible IDF
                partner_idf = corpus_size / curr_df.get(partner, 1)
                partner_weight = partner_idf * tf_y.get(partner, 0)
                result += weight_x * partner_weight * score

            # denominator: squared norms of both weight vectors
            norm_x_sq += weight_x * weight_x
            norm_y_sq += weight_y * weight_y

        # both norms are >= _NORM_SEED > 0, so the division is always defined
        return result / (sqrt(norm_x_sq) * sqrt(norm_y_sq))

    def get_corpus_list(self):
        """Get corpus list.

        Returns:
            corpus list (list of lists).
        """
        return self.__corpus_list

    def get_sim_func(self):
        """Get secondary similarity function.

        Returns:
            secondary similarity function (function).
        """
        return self.sim_func

    def get_threshold(self):
        """Get threshold used for the secondary similarity function.

        Returns:
            threshold (float).
        """
        return self.threshold

    def set_threshold(self, threshold):
        """Set threshold value for the secondary similarity function.

        Args:
            threshold (float): threshold value.

        Returns:
            True, always.
        """
        self.threshold = threshold
        return True

    def set_sim_func(self, sim_func):
        """Set secondary similarity function.

        Args:
            sim_func (function): Secondary similarity function.

        Returns:
            True, always.
        """
        self.sim_func = sim_func
        return True

    def set_corpus_list(self, corpus_list):
        """Set corpus list and rebuild the document-frequency index.

        Args:
            corpus_list (list of lists): Corpus list.

        Returns:
            True, always.
        """
        self.__reset_corpus(corpus_list)
        return True

    def __reset_corpus(self, corpus_list):
        # Single place that keeps corpus, document frequencies and corpus
        # size in sync; shared by __init__ and set_corpus_list.
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        self.__compute_document_frequency()
        self.__corpus_size = 0 if corpus_list is None else len(corpus_list)

    def __compute_document_frequency(self):
        # Count, for every token, the number of corpus documents containing it.
        # `is not None` (rather than `!= None`) keeps this safe for array-like
        # corpora whose `!=` is element-wise.
        if self.__corpus_list is not None:
            for document in self.__corpus_list:
                for element in set(document):
                    self.__document_frequency[element] = (
                        self.__document_frequency.get(element, 0) + 1)


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the book catalogue; the path is relative to the notebook's working directory.
data = pd.read_csv("sample_data/main_dataset.csv")

In [None]:
data.head()

Unnamed: 0,image,name,author,format,book_depository_stars,price,currency,old_price,isbn,category,img_paths
0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,This is Going to Hurt,Adam Kay,Paperback,4.5,7.6,$,11.4,9781510000000.0,Medical,dataset/Medical/0000001.jpg
1,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,"Thinking, Fast and Slow",Daniel Kahneman,Paperback,4.0,11.5,$,15.0,9780140000000.0,Medical,dataset/Medical/0000002.jpg
2,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,When Breath Becomes Air,Paul Kalanithi,Paperback,4.5,9.05,$,11.5,9781780000000.0,Medical,dataset/Medical/0000003.jpg
3,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Happiness Trap,Russ Harris,Paperback,4.0,8.34,$,13.9,9781850000000.0,Medical,dataset/Medical/0000004.jpg
4,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Man's Search For Meaning,Viktor E. Frankl,Paperback,4.5,9.66,$,,9781850000000.0,Medical,dataset/Medical/0000005.jpg


In [None]:
data.shape

(32581, 11)

In [None]:
data.loc[1]['name']

'Thinking, Fast and Slow'

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import string

In [None]:
# Build the corpus: one list of lower-cased, punctuation-free tokens per book,
# taken from its name, author and category.
punctuation_table = str.maketrans("", "", string.punctuation)  # loop-invariant

l = []
for name, author, category in zip(data['name'], data['author'], data['category']):
    # Missing values arrive as NaN; str(NaN) would add the literal token "nan"
    # to the document, so such fields are left out instead.
    fields = [str(value) for value in (name, author, category) if pd.notna(value)]
    text = " ".join(fields).lower().translate(punctuation_table)
    l.append(word_tokenize(text))

In [None]:
# Index the whole corpus once; two tokens count as a match when their Jaro score exceeds 0.8.
soft_tfidf = SoftTfIdf(l, sim_func=Jaro().get_raw_score, threshold=0.8)

In [None]:
# Linear scan for the single best-scoring document for a fixed query.
# t is the best score so far and s the matching token list; both stay 0 if no
# document scores above zero. Ties keep the earliest document.
query_tokens = ['paul', 'kalanithi']
t, s = 0, 0
for doc_tokens in l:
  doc_score = soft_tfidf.get_raw_score(doc_tokens, query_tokens)
  if doc_score > t:
    t, s = doc_score, doc_tokens
print(t,s)

0.671779978329045 ['when', 'breath', 'becomes', 'air', 'paul', 'kalanithi', 'health']


In [None]:
# Column layout of the per-book results table, and an empty frame with that layout.
cols = ["Score","Name","Author","Category","ISBN","image_url"]
scores = pd.DataFrame(columns=cols)

In [None]:
scores

Unnamed: 0,Score,Name,Author,Category,ISBN,image_url


In [None]:
# Score every book against a fixed query and build the results table in one
# step. Growing a DataFrame one row at a time (scores.loc[len(scores)] = ...)
# and materialising data.loc[i] five times per book is very slow on ~32k rows;
# building whole columns avoids both and makes the cell idempotent on re-run.
query_tokens = ['paul', 'kalanithi']
scores = pd.DataFrame({
    "Score": [soft_tfidf.get_raw_score(doc_tokens, query_tokens) for doc_tokens in l],
    "Name": data['name'].to_numpy(),
    "Author": data['author'].to_numpy(),
    "Category": data['category'].to_numpy(),
    "ISBN": data['isbn'].to_numpy(),
    "image_url": data['image'].to_numpy(),
}, columns=cols)

KeyboardInterrupt: ignored

In [None]:
scores

In [None]:
# Rank the books from highest to lowest score.
s1  = scores.sort_values('Score',ascending=False)

In [None]:
s1.head(50)

In [None]:
def book_recommendation_based_on_text_similarity(sen, top_n=50):
  """Rank every book in the catalogue against a free-text query.

  Relies on the notebook globals `l` (token lists, one per row of `data`),
  `data` (the catalogue DataFrame) and `soft_tfidf` (the fitted SoftTfIdf).

  Args:
    sen (str): Free-text query, e.g. a title fragment or an author name.
    top_n (int): Number of best-scoring rows to return (defaults to 50).

  Returns:
    pandas.DataFrame with columns Score, Name, Author, Category, ISBN and
    image_url, sorted by Score in descending order and truncated to `top_n`
    rows. The index holds each book's row position in `data`.
  """
  # Normalise the query exactly like the corpus (lower-case, punctuation
  # removed); otherwise "Paul" or "kalanithi," would not line up with the
  # corpus tokens "paul" / "kalanithi".
  normalised = sen.lower().translate(str.maketrans("", "", string.punctuation))
  query_tokens = word_tokenize(normalised)

  cols = ["Score","Name","Author","Category","ISBN","image_url"]
  # Build the table column-wise in one step; appending row by row with
  # scores.loc[len(scores)] = ... is far slower and this runs on every request.
  scores = pd.DataFrame({
      "Score": [soft_tfidf.get_raw_score(doc_tokens, query_tokens) for doc_tokens in l],
      "Name": data['name'].to_numpy(),
      "Author": data['author'].to_numpy(),
      "Category": data['category'].to_numpy(),
      "ISBN": data['isbn'].to_numpy(),
      "image_url": data['image'].to_numpy(),
  }, columns=cols)
  return scores.sort_values('Score',ascending=False).head(top_n)

In [None]:
book_recommendation_based_on_text_similarity("sha")

Unnamed: 0,Score,Name,Author,Category,ISBN,image_url
14525,1.170815,Shah of Shahs,Ryszard Kapuscinski,History-Archaeology,9780140000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
24091,0.877929,A Darker Shade of Magic,V E Schwab,Science-Fiction-Fantasy-Horror,9780770000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
23745,0.877929,A Darker Shade of Magic,V. E. Schwab,Science-Fiction-Fantasy-Horror,9781780000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
24340,0.877929,A Darker Shade of Magic,V E Schwab,Science-Fiction-Fantasy-Horror,9780770000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
7934,0.877929,A Darker Shade of Magic,V. E. Schwab,Crime-Thriller,9781780000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
21736,0.864817,The Shack,William P. Young,Religion,9780340000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
21890,0.864817,The Shack,William P. Young,Religion,9780960000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
8794,0.864817,The Shack,William P. Young,Crime-Thriller,9781460000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
7937,0.864817,The Shack,William P. Young,Crime-Thriller,9780340000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...
18409,0.859566,The Shark That Walks on Land,Michael Bright,Natural-History,9781850000000.0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...


In [None]:
word_tokenize("history of time")

In [None]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
!pip install flask-cors

Collecting flask-cors
  Downloading https://files.pythonhosted.org/packages/db/84/901e700de86604b1c4ef4b57110d4e947c218b9997adf5d38fa7da493bce/Flask_Cors-3.0.10-py2.py3-none-any.whl
Installing collected packages: flask-cors
Successfully installed flask-cors-3.0.10


In [None]:
from flask import Flask,request
from flask_cors import CORS
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
CORS(app)            # enable cross-origin requests so a browser front end can call the API
run_with_ngrok(app)  # app.run() will also expose the server through an ngrok tunnel


@app.route("/books/<string:query>",methods=['GET'])
def home(query):
  """Return the best-matching books for `query` as a JSON array of records.

  Args:
    query (str): Search text taken from the URL path (/books/<query>).

  Returns:
    A Flask response whose body is the recommendations table serialised with
    DataFrame.to_json(orient='records') and whose Content-Type is
    application/json (a bare str would be served as text/html).
  """
  recommendations = book_recommendation_based_on_text_similarity(query)
  payload = recommendations.to_json(orient='records')
  return app.response_class(payload, mimetype='application/json')


# Blocks this cell until the server is interrupted.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://a82204d7aaad.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [19/May/2021 09:11:59] "GET /books/hawkings HTTP/1.1" 200 -


In [None]:
df = book_recommendation_based_on_text_similarity("history of time")


In [None]:
json = df.to_json(orient='records')

In [None]:
json