In [68]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
books_with_tags = pd.read_csv('../data/processed/05_books_with_tags.csv')


## Create the Vectorizer


In [16]:
tfidf = TfidfVectorizer()

# Fit and transform the tag strings
tfidf_matrix = tfidf.fit_transform(books_with_tags['tag_string'])

# Show shape of the matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (735, 229)


## Cosine Similarity Between Books

In [24]:
# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Check shape
print("Cosine similarity matrix shape:", cosine_sim.shape)

Cosine similarity matrix shape: (735, 735)


## Build a Top-N Recommendation Function

In [31]:
def get_book_recommendations(title, books_df, cosine_sim_matrix, top_n=5):
    # Reset index to ensure we can look things up by index
    books_df = books_df.reset_index()

    # Find the index of the given book title
    indices = pd.Series(books_df.index, index=books_df['title'].str.lower())

    idx = indices.get(title.lower())

    if idx is None:
        print("Book not found!")
        return []

    # Get pairwise similarity scores for this book
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort books by similarity score (excluding itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get book indices of top matches
    book_indices = [i[0] for i in sim_scores]

    # Return the top N most similar books
    return books_df.loc[book_indices][['title', 'authors', 'tag_string']]


#### Check the System by trying a few titles

In [33]:
get_book_recommendations("The Hobbit", books_with_tags, cosine_sim, top_n=5)


Book not found!


[]

In [38]:
get_book_recommendations("The Fault in Our Stars", books_with_tags, cosine_sim, top_n=5)


Unnamed: 0,title,authors,tag_string
5,The Catcher in the Rye,J.D. Salinger,to-read favorites fantasy currently-reading ha...
154,The Death of Ivan Ilych,"Leo Tolstoy, Aylmer Maude",to-read fantasy harry-potter fiction books-i-own
625,"Visions in Death (In Death, #19)",J.D. Robb,fantasy favorites young-adult fiction to-read
219,Freckle Juice,Judy Blume,to-read currently-reading fantasy young-adult ...
626,"Safe with Me (With Me in Seattle, #5)",Kristen Proby,to-read fantasy redwall young-adult owned


## Check & Fix Case Sensitivity

In [40]:
# Show a sample of titles (lowercased for comparison)
books_with_tags['title'].str.lower().sample(10)


261           the snows of kilimanjaro and other stories
386    siege of darkness (forgotten realms: legacy of...
383                           did you ever have a family
381    tears of the moon (gallaghers of ardmore / iri...
347            midnight (warriors: the new prophecy, #1)
473                   broken angels (takeshi kovacs, #2)
195                              saga, vol. 3 (saga, #3)
648                   strangers in death (in death, #26)
329                                          be here now
99                    four to score (stephanie plum, #4)
Name: title, dtype: object

In [44]:
def search_titles(keyword, books_df):
    return books_df[books_df['title'].str.lower().str.contains(keyword.lower())][['title', 'authors']].head(10)


In [46]:
search_titles("hobbit", books_with_tags)


Unnamed: 0,title,authors


In [48]:
books_with_tags['title'].str.lower().str.contains('hobbit').sum()


0

In [50]:
books_with_tags[['title', 'authors']].sample(10)


Unnamed: 0,title,authors
542,"A Shadow of Light (A Shade of Vampire, #4)",Bella Forrest
461,"Tris's Book (Circle of Magic, #2)",Tamora Pierce
309,The Strange Library,"Haruki Murakami, Ted Goossen"
419,Vanishing Girls,Lauren Oliver
693,Crush,Richard Siken
289,Once We Were Brothers,Ronald H. Balson
38,"Where'd You Go, Bernadette",Maria Semple
347,"Midnight (Warriors: The New Prophecy, #1)",Erin Hunter
295,"Moo, Baa, La La La!",Sandra Boynton
415,Where I'm Calling From: New and Selected Stories,Raymond Carver


In [52]:
get_book_recommendations("The Strange Library", books_with_tags, cosine_sim, top_n=5)


Unnamed: 0,title,authors,tag_string
660,"Zita the Spacegirl (Zita the Spacegirl, #1)",Ben Hatke,to-read young-adult vampires horror favorites
129,"A Voice in the Wind (Mark of the Lion, #1)","Francine Rivers, Richard Ferrone",to-read currently-reading fiction favorites yo...
726,Life Before Legend: Stories of the Criminal an...,Marie Lu,to-read fiction currently-reading young-adult ...
671,Bruiser,Neal Shusterman,fantasy young-adult vampires horror favorites
661,"Paradiso (The Divine Comedy, #3)","Dante Alighieri, Dorothy L. Sayers, Barbara Re...",to-read fantasy young-adult vampires horror


## Recommendations Explaination

In [61]:
def explain_recommendations(title, books_df, cosine_sim_matrix, top_n=5):
    books_df = books_df.reset_index()

    indices = pd.Series(books_df.index, index=books_df['title'].str.lower())
    idx = indices.get(title.lower())

    if idx is None:
        print("Book not found!")
        return

    input_tags = set(books_df.loc[idx, 'tag_string'].split())

    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    book_indices = [i[0] for i in sim_scores]

    print(f"\n Because you liked: **{title}**")
    print(f"Top {top_n} recommendations:\n")

    for i in book_indices:
        rec_title = books_df.loc[i, 'title']
        rec_author = books_df.loc[i, 'authors']
        rec_tags = set(books_df.loc[i, 'tag_string'].split())

        shared_tags = input_tags.intersection(rec_tags)
        print(f"🔹 {rec_title} by {rec_author}")
        print(f"    ➤ Shared tags: {', '.join(shared_tags)}\n")


#### Try the Recommendation Explaination 

In [64]:
explain_recommendations("The Strange Library", books_with_tags, cosine_sim, top_n=5)



 Because you liked: **The Strange Library**
Top 5 recommendations:

🔹 Zita the Spacegirl (Zita the Spacegirl, #1) by Ben Hatke
    ➤ Shared tags: to-read, horror, young-adult, favorites

🔹 A Voice in the Wind (Mark of the Lion, #1) by Francine Rivers, Richard Ferrone
    ➤ Shared tags: to-read, currently-reading, young-adult, favorites

🔹 Life Before Legend: Stories of the Criminal and the Prodigy (Legend, #0.5) by Marie Lu
    ➤ Shared tags: to-read, currently-reading, young-adult, favorites

🔹 Bruiser by Neal Shusterman
    ➤ Shared tags: young-adult, horror, favorites

🔹 Paradiso (The Divine Comedy, #3) by Dante Alighieri, Dorothy L. Sayers, Barbara Reynolds
    ➤ Shared tags: to-read, horror, young-adult



In [66]:
explain_recommendations("The Fault in Our Stars", books_with_tags, cosine_sim, top_n=5)


 Because you liked: **The Fault in Our Stars**
Top 5 recommendations:

🔹 The Catcher in the Rye by J.D. Salinger
    ➤ Shared tags: fantasy, harry-potter

🔹 The Death of Ivan Ilych by Leo Tolstoy, Aylmer Maude
    ➤ Shared tags: fantasy, fiction, harry-potter

🔹 Visions in Death (In Death, #19) by J.D. Robb
    ➤ Shared tags: young-adult, fantasy, fiction

🔹 Freckle Juice by Judy Blume
    ➤ Shared tags: young-adult, fantasy, fiction

🔹 Safe with Me (With Me in Seattle, #5) by Kristen Proby
    ➤ Shared tags: fantasy, owned, young-adult



In [70]:
np.save('../data/processed/cosine_sim.npy', cosine_sim)