In [1]:
from ipywidgets import widgets
from IPython.display import display
from Levenshtein import distance
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pipeline import Pipeline
import requests
from PIL import Image
from io import BytesIO

In [2]:
# Free-text box in which the user types a book title.
input_text = widgets.Text(
    placeholder="Enter your favourite book",
)
# Output area that the submit callback prints and displays into.
output_text = widgets.Output()

In [3]:
def get_most_similar_title(title, books):
    """Return the title in ``books`` that is most similar to ``title``.

    The query is vectorised with the module-level ``tfidf_vectorizer`` and
    compared (cosine similarity) against every row of the module-level
    ``tfidf_matrix``.

    Parameters
    ----------
    title : str
        Free-text title to look up.
    books : pandas.DataFrame
        Frame with a ``bookTitle`` column. Its rows must be in the same order
        as the rows of ``tfidf_matrix``.

    Returns
    -------
    The ``bookTitle`` value of the row with the highest similarity.
    """
    vectorized_input = tfidf_vectorizer.transform([title])
    similarities = cosine_similarity(tfidf_matrix, vectorized_input)
    # argmax() yields a *position* in the matrix, so the matching row has to be
    # fetched positionally (.iloc); label-based .loc only works by accident
    # when the frame carries a default 0..n-1 index.
    row_position = int(similarities.argmax())
    return books["bookTitle"].iloc[row_position]

In [4]:

def on_click(sender):
    """Callback for the text box: show the best match and its similar books.

    Reads the current value of ``input_text``, resolves it to the closest
    title via ``get_most_similar_title``, looks that title up in the
    module-level ``mapping`` and, for every similar book, prints its title and
    tries to display its cover image (URL taken from the ``Image-URL-M``
    column of the module-level ``books`` frame).

    Parameters
    ----------
    sender
        Object passed in by the widget machinery; not used.
    """
    output_text.clear_output()
    with output_text:
        print("Most similar book in the archive:")
        matched_title = get_most_similar_title(str(input_text.value), books)
        print(matched_title)
        print("Top 5 similar books:")
        similar_books = mapping.get(matched_title)
        if not similar_books:
            # mapping.get() returns None for titles without an entry;
            # iterating over it would raise TypeError.
            print("No similar books found for this title.")
            return
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
        for book in similar_books:
            print("-", book)
            # Reset per book so a failed download can never show the previous
            # book's cover (or hit an unbound name on the first iteration).
            img = None
            cover_urls = books.loc[books.bookTitle == book, "Image-URL-M"]
            if not cover_urls.empty:
                try:
                    r = requests.get(cover_urls.iloc[0], headers=headers, timeout=10)
                    if r.status_code == 200:
                        img = Image.open(BytesIO(r.content))
                except (requests.RequestException, OSError):
                    # Network failure, invalid URL, or bytes that PIL cannot
                    # decode: fall through to the "unavailable" message.
                    img = None
            if img is not None:
                display(img)
            else:
                print("  (cover image unavailable)")

# Load preprocessed data.
books = pd.read_csv("./../data/books_filtered.csv")
books_with_urls = pd.read_csv("./../data/Books.csv")
combined_data = pd.read_csv("./../data/merged_data.csv")
mapping = Pipeline.get_item_item_similar_mapping()

# Attach the cover URLs BEFORE fitting the vectoriser: the merge can drop or
# duplicate rows, and row i of `tfidf_matrix` must describe row i of `books`
# for the title lookup to return the right book.
books = books.merge(
    books_with_urls[["ISBN", "Image-URL-M"]], on="ISBN"
).reset_index(drop=True)

# Character n-grams (1-5, word-boundary aware) make the match tolerant of
# typos. `stop_words` is omitted because scikit-learn only applies it when
# analyzer == "word".
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), analyzer="char_wb")
tfidf_matrix = tfidf_vectorizer.fit_transform(books.bookTitle)

# NOTE(review): `Text.on_submit` is deprecated in recent ipywidgets releases;
# the suggested replacement (`observe(..., "value")` with
# `continuous_update=False`) also fires on blur, so it is left as-is here.
input_text.on_submit(on_click)
display(input_text)
display(output_text)

  books_with_urls = pd.read_csv("./../data/Books.csv")
  input_text.on_submit(on_click)


Text(value='', placeholder='Enter your favourite book')

Output()

In [5]:
# Inspect the title -> list-of-similar-titles mapping returned by
# Pipeline.get_item_item_similar_mapping().
mapping

{'1984': ['Lord of the Flies',
  "Foucault's Pendulum",
  'The Eight',
  'The Joy Luck Club'],
 '1st to Die: A Novel': ['The Search',
  'Unspeakable',
  'Final Target',
  '2nd Chance'],
 '2nd Chance': ['1st to Die: A Novel',
  'Unspeakable',
  'The Bourne Supremacy',
  'Pop Goes the Weasel'],
 '4 Blondes': ['Last Chance Saloon',
  'Dying for Chocolate (Culinary Mysteries (Paperback))',
  'A Widow for One Year',
  'The Beach House'],
 'A Bend in the Road': ['A Walk to Remember',
  'The Prince of Tides',
  'Hannibal',
  'The Golden Compass (His Dark Materials, Book 1)'],
 'A Case of Need': ['Moonlight Becomes You',
  'Unnatural Exposure',
  "Pretend You Don't See Her",
  'The Diary of Ellen Rimbauer: My Life at Rose Red'],
 'A Child Called \\It\\": One Child\'s Courage to Survive"': ['Rising Tides',
  'Dawn (Cutler)',
  'Dreamcatcher',
  "River's End"],
 'A Civil Action': ['Love in the Time of Cholera (Penguin Great Books of the 20th Century)',
  'The Catcher in the Rye',
  'The Bean Tre

In [6]:
# Sanity check: collect every entry whose own title appears in its list of
# similar titles.
a = {
    title: neighbours
    for title, neighbours in mapping.items()
    if title in neighbours
}

In [7]:
# Show the entries whose key also appears in its own value list.
a

{}

In [8]:
# Number of distinct titles in the merged data (a missing title, if any,
# counts as one distinct value).
combined_data["bookTitle"].nunique(dropna=False)

241071

In [9]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Toy experiment: brute-force cosine nearest neighbours on four small vectors,
# including an all-zero vector, to see how kneighbors() treats it.
a = [
    [1, 2, 3],
    [42, 3, 78],
    [0, 2, 3],
    [0, 0, 0],
]
nn_model = NearestNeighbors(
    n_neighbors=2,
    algorithm="brute",
    metric="cosine",
    n_jobs=-1,
)
# fit() returns the estimator itself, so the query can be chained onto it.
nn_model.fit(a).kneighbors(a)

(array([[0.        , 0.03637589],
        [0.        , 0.14973028],
        [0.        , 0.03637589],
        [1.        , 1.        ]]),
 array([[0, 2],
        [1, 0],
        [2, 0],
        [0, 1]]))

In [10]:
# Row count of the books frame.
books.shape[0]

271360

In [11]:
# Number of keys (book titles) in the similarity mapping.
len(mapping)

679

In [12]:
# NOTE(review): `biblio_data` is not defined in any visible cell, so this cell
# raises NameError on a fresh kernel. Define it earlier or remove this cell.
biblio_data

NameError: name 'biblio_data' is not defined

In [None]:
# Row count of the merged data frame.
combined_data.shape[0]

241071

In [None]:
# All rows of the merged data whose title is exactly "The Lord of the Rings".
combined_data[combined_data["bookTitle"].eq("The Lord of the Rings")]

Unnamed: 0.1,Unnamed: 0,userID,ISBN,bookRating,Age,City,Region,State,bookTitle,bookAuthor,yearOfPublication,publisher
31612,31612,8681,395489326,0,34.0,orlando,florida,usa,The Lord of the Rings,J. R. R. Tolkien,1988.0,Houghton Mifflin Company
74399,74399,17950,618153969,0,32.0,puchong,selangor,malaysia,The Lord of the Rings,J. R. R. Tolkien,2001.0,Houghton Mifflin Company
74838,74838,18067,618153969,0,33.0,kajang,selangor,malaysia,The Lord of the Rings,J. R. R. Tolkien,2001.0,Houghton Mifflin Company
79891,79891,20172,618260250,10,27.0,saratoga springs,utah,usa,The Lord of the Rings,J. R. R. Tolkien,2002.0,Houghton Mifflin Company
86426,86426,22625,618260293,10,54.0,lynbrook,new york,usa,The Lord of the Rings,J. R. R. Tolkien,2002.0,Houghton Mifflin Company
91813,91813,23902,618153969,4,34.0,london,england,united kingdom,The Lord of the Rings,J. R. R. Tolkien,2001.0,Houghton Mifflin Company
130634,130634,33030,618260587,10,39.0,sandpoint,idaho,usa,The Lord of the Rings,J. R. R. Tolkien,2002.0,Houghton Mifflin Company
273505,273505,73330,618343997,8,29.0,seattle,washington,usa,The Lord of the Rings,J. R. R. Tolkien,2003.0,Houghton Mifflin Company
309293,309293,82487,618260250,5,33.0,spring hill,florida,usa,The Lord of the Rings,J. R. R. Tolkien,2002.0,Houghton Mifflin Company
311924,311924,83186,618153977,10,34.0,norfolk,virginia,usa,The Lord of the Rings,J. R. R. Tolkien,2001.0,Houghton Mifflin Company


In [None]:
# NOTE(review): magic numbers. Presumably ratings / (users * books), i.e. the
# density of the user-item rating matrix — TODO confirm and name the operands.
642765/(187713* 2253)

0.0015198358277956738