In [1]:
import pandas as pd
import numpy as np

# Load the datasets
books = pd.read_csv('./dataset/Books.csv')
users = pd.read_csv('./dataset/Users.csv')
ratings = pd.read_csv('./dataset/Ratings.csv')

# --- Initial Exploration ---

print("Books DataFrame Head:")
print(books.head())
print("\nBooks DataFrame Info:")
books.info()

print("\nUsers DataFrame Head:")
print(users.head())
print("\nUsers DataFrame Info:")
users.info()

print("\nRatings DataFrame Head:")
print(ratings.head())
print("\nRatings DataFrame Info:")
ratings.info()

  books = pd.read_csv('./dataset/Books.csv')


Books DataFrame Head:
         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  htt

In [2]:
# Merge ratings and books dataframes
ratings_with_name = ratings.merge(books)
ratings_with_name.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [3]:
# Group by book title and count the number of ratings
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)

print("Number of Ratings per Book:")
print(num_rating_df.head())

Number of Ratings per Book:
                                          Book-Title  num_ratings
0   A Light in the Storm: The Civil War Diary of ...            4
1                              Always Have Popsicles            1
2               Apple Magic (The Collector's series)            1
3   Ask Lily (Young Women of Faith: Lily Series, ...            1
4   Beyond IBM: Leadership Marketing and Finance ...            1


In [4]:
# Group by book title and calculate the average rating
avg_rating_df = ratings_with_name.groupby('Book-Title').mean(numeric_only=True)['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)

print("\nAverage Rating per Book:")
print(avg_rating_df.head())


Average Rating per Book:
                                          Book-Title  avg_rating
0   A Light in the Storm: The Civil War Diary of ...        2.25
1                              Always Have Popsicles        0.00
2               Apple Magic (The Collector's series)        0.00
3   Ask Lily (Young Women of Faith: Lily Series, ...        8.00
4   Beyond IBM: Leadership Marketing and Finance ...        0.00


In [5]:
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
print("\nCombined Popularity DataFrame:")
print(popular_df.head())


Combined Popularity DataFrame:
                                          Book-Title  num_ratings  avg_rating
0   A Light in the Storm: The Civil War Diary of ...            4        2.25
1                              Always Have Popsicles            1        0.00
2               Apple Magic (The Collector's series)            1        0.00
3   Ask Lily (Young Women of Faith: Lily Series, ...            1        8.00
4   Beyond IBM: Leadership Marketing and Finance ...            1        0.00


In [6]:
# Filter for books with at least 150 ratings, then sort by average rating
# This gives us the top-rated, most popular books
final_popular_df = popular_df[popular_df['num_ratings'] >= 150].sort_values('avg_rating', ascending=False)

print("\nTop Popular Books (Final Result):")
print(final_popular_df.head(20)) # Display the top 20 popular books


Top Popular Books (Final Result):
                                               Book-Title  num_ratings  \
80434   Harry Potter and the Prisoner of Azkaban (Book 3)          428   
80422        Harry Potter and the Goblet of Fire (Book 4)          387   
80441      Harry Potter and the Sorcerer's Stone (Book 1)          278   
80426   Harry Potter and the Order of the Phoenix (Boo...          347   
60582       Ender's Game (Ender Wiggins Saga (Paperback))          249   
80414    Harry Potter and the Chamber of Secrets (Book 2)          556   
191612  The Hobbit : The Enchanting Prelude to The Lor...          281   
187377  The Fellowship of the Ring (The Lord of the Ri...          368   
189274                 The Giver (21st Century Reference)          179   
94259        Ishmael: An Adventure of the Mind and Spirit          162   
80445   Harry Potter and the Sorcerer's Stone (Harry P...          575   
211384     The Two Towers (The Lord of the Rings, Part 2)          260   
200

In [7]:
# Count ratings per user and per book
user_rating_counts = ratings_with_name.groupby('User-ID').count()['Book-Rating']
book_rating_counts = ratings_with_name.groupby('Book-Title').count()['Book-Rating']

# Filter for users who have rated more than 100 books
active_users = user_rating_counts[user_rating_counts > 100].index
filtered_ratings = ratings_with_name[ratings_with_name['User-ID'].isin(active_users)]

# Filter for books that have more than 50 ratings
popular_books = book_rating_counts[book_rating_counts > 50].index
final_ratings = filtered_ratings[filtered_ratings['Book-Title'].isin(popular_books)]

# `final_ratings` is now a much smaller, denser dataset
print(final_ratings.shape)

(144131, 10)


In [8]:
# Create the pivot table and convert to float32
pivot_table = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pivot_table.fillna(0, inplace=True)
pivot_table = pivot_table.astype(np.float32)  # Add this line

print(pivot_table.head())

User-ID              254     507     882     1424    1435    1733    1903    \
Book-Title                                                                    
10 Lb. Penalty          0.0     0.0     0.0     0.0     0.0     0.0     0.0   
16 Lighthouse Road      0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1984                    9.0     0.0     0.0     0.0     0.0     0.0     0.0   
1st to Die: A Novel     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2010: Odyssey Two       0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID              2033    2110    2276    ...  275020  275970  276463  \
Book-Title                                   ...                           
10 Lb. Penalty          0.0     0.0     0.0  ...     0.0     0.0     0.0   
16 Lighthouse Road      0.0     0.0     0.0  ...     0.0     0.0     0.0   
1984                    0.0     0.0     0.0  ...     0.0     0.0     0.0   
1st to Die: A Novel     0.0     0.0     0.0  ...     0.0     0.0  

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity and convert to float32
similarity_scores = cosine_similarity(pivot_table).astype(np.float32)  # Add .astype()

# The result is a square matrix where each row/column is a book
print(similarity_scores.shape)

(2381, 2381)


In [10]:
import numpy as np

def recommend(book_name):
    # Find the index of the input book in our pivot table
    # np.where() returns an array of indices, we take the first one
    index = np.where(pivot_table.index == book_name)[0][0]
    
    # Get the similarity scores for this book against all other books
    # Enumerate adds a counter to an iterable
    similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)
    
    print(f"Recommendations for '{book_name}':\n")
    # We start from index 1 because index 0 is the book itself (with a similarity of 1.0)
    for i in similar_items[1:6]:
        # i[0] is the index of the similar book
        recommended_book_title = pivot_table.index[i[0]]
        # i[1] is the similarity score, which we can optionally display
        similarity_score = i[1]
        print(f"- {recommended_book_title} (Similarity: {similarity_score:.2f})")

In [11]:
# Test the recommendation function
recommend('The Hobbit : The Enchanting Prelude to The Lord of the Rings')

Recommendations for 'The Hobbit : The Enchanting Prelude to The Lord of the Rings':

- The Two Towers (The Lord of the Rings, Part 2) (Similarity: 0.24)
- The Fellowship of the Ring (The Lord of the Rings, Part 1) (Similarity: 0.23)
- The Return of the King (The Lord of the Rings, Part 3) (Similarity: 0.21)
- Maniac Magee (Similarity: 0.17)
- Lord of the Flies (Similarity: 0.16)


In [12]:
# Create a pivot table with users as rows and books as columns
user_pivot_table = final_ratings.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating')

# Fill missing values with 0
user_pivot_table.fillna(0, inplace=True)
user_pivot_table = user_pivot_table.astype(np.float32)  # Add this line

print("User-Item Matrix Head:")
print(user_pivot_table.head())

User-Item Matrix Head:
Book-Title  10 Lb. Penalty  16 Lighthouse Road  1984  1st to Die: A Novel  \
User-ID                                                                     
254                    0.0                 0.0   9.0                  0.0   
507                    0.0                 0.0   0.0                  0.0   
882                    0.0                 0.0   0.0                  0.0   
1424                   0.0                 0.0   0.0                  0.0   
1435                   0.0                 0.0   0.0                  0.0   

Book-Title  2010: Odyssey Two  204 Rosewood Lane  2061: Odyssey Three  \
User-ID                                                                 
254                       0.0                0.0                  0.0   
507                       0.0                0.0                  0.0   
882                       0.0                0.0                  0.0   
1424                      0.0                0.0                  0.0   

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between users and convert to float32
user_similarity_scores = cosine_similarity(user_pivot_table).astype(np.float32)  # Add .astype()

# The result is a square matrix where each row/column represents a user
print(user_similarity_scores.shape)

(1642, 1642)


In [14]:
import numpy as np

def recommend_for_user(user_id):
    try:
        # Find the index of the user in our pivot table
        user_index = np.where(user_pivot_table.index == user_id)[0][0]
    except IndexError:
        print(f"User ID {user_id} not found. Please choose from active users.")
        return

    # Get similarity scores for the target user against all other users
    # Sort them to find the most similar users (top 5)
    similar_users_indices = np.argsort(user_similarity_scores[user_index])[::-1][1:6]

    # Get the books the target user has already rated
    rated_by_target_user = set(user_pivot_table.columns[user_pivot_table.iloc[user_index] > 0])
    
    recommended_books = set()
    
    print(f"Recommendations for User ID {user_id}:\n")
    # Loop through the top similar users
    for similar_user_index in similar_users_indices:
        # Get books rated highly (e.g., > 4) by the similar user
        similar_user_ratings = user_pivot_table.iloc[similar_user_index]
        books_liked_by_similar_user = set(similar_user_ratings[similar_user_ratings > 4].index)
        
        # Add these books to our recommendation set
        recommended_books.update(books_liked_by_similar_user)
        
    # Remove books the target user has already rated
    final_recommendations = recommended_books - rated_by_target_user
    
    if not final_recommendations:
        print("Could not find new books to recommend from similar users.")
        return

    # Print the first 10 recommendations
    for i, book in enumerate(list(final_recommendations)[:10]):
        print(f"{i+1}. {book}")

In [15]:
# Test the function with a user ID from your dataset
# Example user ID (replace with one from your `final_ratings`):
test_user_id = 254 
recommend_for_user(test_user_id)

Recommendations for User ID 254:

1. The Bell Jar : A Novel (Perennial Classics)
2. Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death
3. House Atreides (Dune: House Trilogy, Book 1)
4. Pattern Recognition
5. Fight Club
6. Harry Potter and the Order of the Phoenix (Book 5)
7. The Great Gatsby
8. Wuthering Heights
9. Children of Dune (Dune Chronicles, Book 3)
10. How to Make an American Quilt


In [16]:
import pickle
import os

# Create a directory to store the models if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Save the necessary objects
pickle.dump(final_popular_df, open('models/popular.pkl', 'wb'))
pickle.dump(pivot_table, open('models/pivot_table.pkl', 'wb'))
pickle.dump(similarity_scores, open('models/similarity_scores.pkl', 'wb'))
pickle.dump(user_pivot_table, open('models/user_pivot_table.pkl', 'wb'))
pickle.dump(user_similarity_scores, open('models/user_similarity_scores.pkl', 'wb'))
pickle.dump(final_ratings, open('models/final_ratings.pkl', 'wb'))
# Add this to your notebook to save the main books dataframe
pickle.dump(books, open('models/books.pkl', 'wb'))

print("All model artifacts have been saved successfully.")

All model artifacts have been saved successfully.
