In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
print(os.getcwd())

/Users/mdshameemalam/Desktop/Project/Book-Recommendation-System/ml_model_content_based


In [7]:
final_books_df = pd.read_pickle('../artifacts/books.pkl')
ratings_df = pd.read_pickle('../artifacts/ratings.pkl')
print(final_books_df.head(), '\n')
print(ratings_df.head())

         ISBN                                         Book-Title  \
0  0590567330   A Light in the Storm: The Civil War Diary of ...   
1  0964147726                              Always Have Popsicles   
2  0942320093               Apple Magic (The Collector's series)   
3  0310232546   Ask Lily (Young Women of Faith: Lily Series, ...   
4  0962295701   Beyond IBM: Leadership Marketing and Finance ...   

        Book-Author Year-Of-Publication                    Publisher  \
0       Karen Hesse                1999  Hyperion Books for Children   
1    Rebecca Harvin                1994            Rebecca L. Harvin   
2  Martina Boudreau                1984           Amer Cooking Guild   
3      Nancy N. Rue                2001                   Zonderkidz   
4        Lou Mobley                1989       Teleonet, Incorporated   

   Avg-Book-Rating                                        Image-URL-M  
0             2.25  http://images.amazon.com/images/P/0590567330.0...  
1             

In [8]:
# First, calculate the number of ratings for each book
num_ratings_df = ratings_df.groupby('ISBN').count()['Book-Rating'].reset_index()
num_ratings_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)
num_ratings_df

Unnamed: 0,ISBN,num_ratings
0,0330299891,2
1,0375404120,2
2,0586045007,1
3,9022906116,2
4,9032803328,1
...,...,...
340551,cn113107,1
340552,ooo7156103,1
340553,§423350229,1
340554,´3499128624,1


In [10]:
# Merge this count into our main books dataframe
books_with_ratings_df = final_books_df.merge(num_ratings_df, on='ISBN')
books_with_ratings_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Avg-Book-Rating,Image-URL-M,num_ratings
0,0590567330,A Light in the Storm: The Civil War Diary of ...,Karen Hesse,1999,Hyperion Books for Children,2.250000,http://images.amazon.com/images/P/0590567330.0...,4
1,0964147726,Always Have Popsicles,Rebecca Harvin,1994,Rebecca L. Harvin,0.000000,http://images.amazon.com/images/P/0964147726.0...,1
2,0942320093,Apple Magic (The Collector's series),Martina Boudreau,1984,Amer Cooking Guild,0.000000,http://images.amazon.com/images/P/0942320093.0...,1
3,0310232546,"Ask Lily (Young Women of Faith: Lily Series, ...",Nancy N. Rue,2001,Zonderkidz,8.000000,http://images.amazon.com/images/P/0310232546.0...,1
4,0962295701,Beyond IBM: Leadership Marketing and Finance ...,Lou Mobley,1989,"Teleonet, Incorporated",0.000000,http://images.amazon.com/images/P/0962295701.0...,1
...,...,...,...,...,...,...,...,...
269210,3499232499,Ã?Â?lpiraten.,Janwillem van de Wetering,2002,Rowohlt Tb.,0.000000,http://images.amazon.com/images/P/3499232499.0...,2
269211,325721538X,Ã?Â?rger mit Produkt X. Roman.,Joan Aiken,1987,Diogenes Verlag,5.250000,http://images.amazon.com/images/P/325721538X.0...,4
269212,3451274973,Ã?Â?sterlich leben.,Anselm GrÃ?Â¼n,2001,"Herder, Freiburg",7.000000,http://images.amazon.com/images/P/3451274973.0...,1
269213,3442725739,Ã?Â?stlich der Berge.,David Guterson,2000,btb,2.666667,http://images.amazon.com/images/P/3442725739.0...,3


In [12]:
# Set a threshold, e.g., 50 ratings 
content_df = books_with_ratings_df[books_with_ratings_df['num_ratings'] >= 50].reset_index(drop=True)
content_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Avg-Book-Rating,Image-URL-M,num_ratings
0,1551668300,16 Lighthouse Road,Debbie Macomber,2001,Mira,1.907692,http://images.amazon.com/images/P/1551668300.0...,65
1,0451524934,1984,George Orwell,1990,Signet Book,4.614583,http://images.amazon.com/images/P/0451524934.0...,192
2,0316666009,1st to Die: A Novel,James Patterson,2001,Little Brown and Company,3.830508,http://images.amazon.com/images/P/0316666009.0...,118
3,0446610038,1st to Die: A Novel,James Patterson,2002,Warner Vision,3.498721,http://images.amazon.com/images/P/0446610038.0...,391
4,0345303067,2010: Odyssey Two,Arthur C. Clarke,1984,Del Rey Books,2.721519,http://images.amazon.com/images/P/0345303067.0...,79
...,...,...,...,...,...,...,...,...
2135,0553277472,Zen and the Art of Motorcycle Maintenance: An ...,ROBERT PIRSIG,1984,Bantam,3.119565,http://images.amazon.com/images/P/0553277472.0...,184
2136,0440203856,Zoya,Danielle Steel,1989,Dell,1.186047,http://images.amazon.com/images/P/0440203856.0...,86
2137,0449003787,"\O\"" Is for Outlaw""",SUE GRAFTON,2001,Ballantine Books,2.765714,http://images.amazon.com/images/P/0449003787.0...,175
2138,0805059555,"\O\"" Is for Outlaw""",Sue Grafton,1999,Henry Holt &amp; Company,3.404255,http://images.amazon.com/images/P/0805059555.0...,94


In [13]:
# --- Step 3: Feature Engineering - Creating a "Tags" Column ---
# We will combine the most descriptive text features into a single string called 'tags'.
# This "content soup" will represent the content of each book.
# We'll use: Book-Title, Book-Author, Publisher, and Year-Of-Publication.

# Creating 'tags' column for content representation

In [14]:
# Convert year to string so it can be combined with other text
content_df['Year-Of-Publication'] = content_df['Year-Of-Publication'].astype(str)

content_df['Year-Of-Publication'].dtype
# dtype('O'): O means "object", which is how pandas stores strings

dtype('O')

In [15]:
# Remove spaces from author and publisher names to treat them as single entities
# e.g., 'J. K. Rowling' becomes 'J.K.Rowling'
content_df['Book-Author'] = content_df['Book-Author'].str.replace(' ', '')
content_df['Publisher'] = content_df['Publisher'].str.replace(' ', '')

print(content_df['Book-Author'], '\n')
print(content_df['Publisher'])

0       DebbieMacomber
1         GeorgeOrwell
2       JamesPatterson
3       JamesPatterson
4       ArthurC.Clarke
             ...      
2135      ROBERTPIRSIG
2136     DanielleSteel
2137        SUEGRAFTON
2138        SueGrafton
2139        NeilGaiman
Name: Book-Author, Length: 2140, dtype: object 

0                        Mira
1                  SignetBook
2       LittleBrownandCompany
3                WarnerVision
4                 DelReyBooks
                ...          
2135                   Bantam
2136                     Dell
2137          BallantineBooks
2138    HenryHolt&amp;Company
2139                Perennial
Name: Publisher, Length: 2140, dtype: object


In [16]:
# Combine all the features into a single "tags" string
content_df['tags'] = content_df['Book-Title'] + ' ' + \
                     content_df['Book-Author'] + ' ' + \
                     content_df['Publisher'] + ' ' + \
                     content_df['Year-Of-Publication']
    
# Convert all tags to lowercase for consistency
content_df['tags'] = content_df['tags'].str.lower()
    
content_df.tags

0             16 lighthouse road debbiemacomber mira 2001
1                       1984 georgeorwell signetbook 1990
2       1st to die: a novel jamespatterson littlebrown...
3       1st to die: a novel jamespatterson warnervisio...
4       2010: odyssey two arthurc.clarke delreybooks 1984
                              ...                        
2135    zen and the art of motorcycle maintenance: an ...
2136                         zoya daniellesteel dell 1989
2137    \o\" is for outlaw" suegrafton ballantinebooks...
2138    \o\" is for outlaw" suegrafton henryholt&amp;c...
2139                   stardust neilgaiman perennial 2001
Name: tags, Length: 2140, dtype: object

In [17]:
# --- Step 3: Vectorize the Tags ---
# Computers don't understand words, so we convert the text 'tags' into numerical vectors.
# TfidfVectorizer is excellent for this as it gives more weight to more important words.

# Initialize the vectorizer. `stop_words='english'` removes common English words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Create the vector matrix by transforming the 'tags' column.
feature_vectors = tfidf_vectorizer.fit_transform(content_df['tags'])

feature_vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13004 stored elements and shape (2140, 3417)>

In [18]:
print(feature_vectors.shape)  # e.g. (10000, 5000)
print(type(feature_vectors))  # <class 'scipy.sparse.csr.csr_matrix'>
print(tfidf_vectorizer.get_feature_names_out()[:10])  # first 10 words
print(feature_vectors.toarray()[:2])  # first 2 books as dense vectors

(2140, 3417)
<class 'scipy.sparse._csr.csr_matrix'>
['10' '100' '1027' '12' '13' '1328' '13th' '14' '15' '16']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [27]:
# --- Step 5: Build the On-Demand Recommendation Function ---
# This function now calculates similarity only when called.
# Notice we are NO LONGER creating the full similarity_matrix.

def recommend_on_demand(book_title, data, vectors):
    # Find the index of the book that matches the title
    try:
        book_index = data[data['Book-Title'] == book_title].index[0]
    except IndexError:
        print(f"Book '{book_title}' not found in the dataset. It might have been filtered out.")
        return

    # Get the vector for the specific book
    target_book_vector = vectors[book_index]
    
    # Calculate cosine similarity of this one book against all others
    # This returns a list of similarity scores, which is much smaller than the full matrix.
    similarity_scores = cosine_similarity(target_book_vector, vectors)
    
    # The result is a nested list, so we grab the inner list of scores
    distances = similarity_scores[0]
    
    # Sort the books based on the similarity scores
    books_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])
    
    print(f"\n--- Recommendations for '{book_title}' ---")
    # Get the top 5 most similar books, skipping the first one (the book itself)
    for i in books_list[1:6]:
        recommended_book_title = data.iloc[i[0]]['Book-Title']
        print(recommended_book_title)


In [28]:
# We now pass the feature_vectors to the function instead of a similarity matrix
recommend_on_demand('The Da Vinci Code', content_df, feature_vectors)
recommend_on_demand('The Lovely Bones: A Novel', content_df, feature_vectors)


--- Recommendations for 'The Da Vinci Code' ---
The Devil's Code
Code to Zero
Angels &amp; Demons
Bleachers
Illuminati.

--- Recommendations for 'The Lovely Bones: A Novel' ---
The Lovely Bones
Sea Glass: A Novel
Fortune's Rocks : A Novel
The Last Time They Met : A Novel
2nd Chance


In [30]:
# --- Step 6: Save the Model Artifacts ---
# For this optimized model, we need to save the vectorizer and the filtered dataframe.
# We no longer have a similarity matrix to save.

pickle.dump(tfidf_vectorizer, open('../artifacts/tfidf_vectorizer.pkl', 'wb'))
pickle.dump(feature_vectors, open('../artifacts/feature_vectors.pkl', 'wb'))
content_df.to_pickle('../artifacts/content_df.pkl')

In [31]:
os.listdir('../artifacts')

['tfidf_vectorizer.pkl',
 'ratings.pkl',
 'content_df.pkl',
 'books.pkl',
 'users.pkl',
 'feature_vectors.pkl',
 '.ipynb_checkpoints',
 'popular_books.pkl']