In [81]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

In [82]:
# --- Step 1: Load the Cleaned Artifacts ---
# Read the three pickled DataFrames (books, ratings, users) from ../artifacts
# in a single unpacking assignment.
books_df, ratings_df, users_df = (
    pd.read_pickle(artifact_path)
    for artifact_path in (
        '../artifacts/books.pkl',
        '../artifacts/ratings.pkl',
        '../artifacts/users.pkl',
    )
)

In [83]:
# Sanity check: print the (rows, columns) shape of each loaded frame,
# in the order books, ratings, users.
for loaded_frame in (books_df, ratings_df, users_df):
    print(loaded_frame.shape)

(269215, 7)
(1149780, 3)
(105283, 3)


In [84]:
# --- Step 2: Merge DataFrames ---
# Attach title and author to every rating by joining on ISBN.
# Only three columns of books_df are carried over: the join key (ISBN),
# the title and the author.
# The merge uses pandas' default inner join, so ratings whose ISBN has no
# match in books_df are not kept.

book_columns = books_df[['ISBN', 'Book-Title', 'Book-Author']]
ratings_with_books_df = pd.merge(ratings_df, book_columns, on='ISBN')
ratings_with_books_df


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5,Rites of Passage,Judith Rae
2,276727,0446520802,0,The Notebook,Nicholas Sparks
3,276729,052165615X,3,Help!: Level 1,Philip Prowse
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
...,...,...,...,...,...
1025112,276704,0876044011,0,Edgar Cayce on the Akashic Records: The Book o...,Kevin J. Todeschi
1025113,276704,1563526298,9,Get Clark Smart : The Ultimate Guide for the S...,Clark Howard
1025114,276706,0679447156,0,Eight Weeks to Optimum Health: A Proven Progra...,Andrew Weil
1025115,276709,0515107662,10,The Sherbrooke Bride (Bride Trilogy (Paperback)),Catherine Coulter


In [85]:
# --- Step 3: Filter the Data for Model Stability ---
# The merged ratings are large and sparse, so they are narrowed down in two passes:
#   1. users with at least 200 ratings (experienced users)
#   2. books with at least 50 ratings (popular books)

# Number of ratings per user. Selecting the column before counting avoids
# counting every other column only to discard the result.
ratings_grouped_by_user = ratings_with_books_df.groupby('User-ID')
user_rating_counts = ratings_grouped_by_user['Book-Rating'].count()
user_rating_counts

User-ID
2          1
8         17
9          3
10         1
12         1
          ..
278846     1
278849     4
278851    22
278852     1
278854     8
Name: Book-Rating, Length: 91784, dtype: int64

In [86]:
# IDs (index labels of user_rating_counts) of the users with 200 or more ratings
has_enough_ratings = user_rating_counts >= 200
experienced_users = user_rating_counts.loc[has_enough_ratings].index
experienced_users


Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=812)

In [87]:
# Restrict the merged ratings to rows written by an experienced user
from_experienced_user = ratings_with_books_df['User-ID'].isin(experienced_users)
filtered_ratings_by_user = ratings_with_books_df.loc[from_experienced_user]
filtered_ratings_by_user


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
1145,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1146,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll
1147,277427,003008685X,8,Pioneers,James Fenimore Cooper
1148,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau
1149,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens
...,...,...,...,...,...
1023355,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias
1023356,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt
1023357,275970,3829021860,0,The Penis Book,Joseph Cohen
1023358,275970,4770019572,0,Musashi,Eiji Yoshikawa


In [88]:
# Rows whose (title, author) pair occurs more than once among the experienced-user
# ratings; keep=False flags every occurrence, not just the repeats.
title_author_cols = ['Book-Title', 'Book-Author']
is_repeated_book = filtered_ratings_by_user.duplicated(subset=title_author_cols, keep=False)
duplicates = filtered_ratings_by_user.loc[is_repeated_book]

# Sort so that all rows for the same book sit next to each other
duplicates_sorted = duplicates.sort_values(by=title_author_cols)
duplicates_sorted

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
142271,35859,0590567330,0,A Light in the Storm: The Civil War Diary of ...,Karen Hesse
356290,96448,0590567330,9,A Light in the Storm: The Civil War Diary of ...,Karen Hesse
100598,26544,006250746X,9,Earth Prayers From around the World: 365 Pray...,Elizabeth Roberts
660275,179744,006250746X,6,Earth Prayers From around the World: 365 Pray...,Elizabeth Roberts
757471,205980,006250746X,10,Earth Prayers From around the World: 365 Pray...,Elizabeth Roberts
...,...,...,...,...,...
50141,11676,3518120549,8,Ã?Â?ber das Fernsehen.,Pierre Bourdieu
587357,160032,3518120549,0,Ã?Â?ber das Fernsehen.,Pierre Bourdieu
49681,11676,3257700512,0,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,Henry David Thoreau
587207,160032,3257700512,0,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,Henry David Thoreau


In [89]:
# Count ratings per (title, author) pair -- grouping on both columns rather than
# on the title alone -- and flatten the result into a regular three-column frame.
book_rating_counts = (
    filtered_ratings_by_user
    .groupby(['Book-Title', 'Book-Author'])
    .size()
    .rename('num_book_ratings')
    .reset_index()
)
book_rating_counts

Unnamed: 0,Book-Title,Book-Author,num_book_ratings
0,A Light in the Storm: The Civil War Diary of ...,Karen Hesse,2
1,Always Have Popsicles,Rebecca Harvin,1
2,Apple Magic (The Collector's series),Martina Boudreau,1
3,Beyond IBM: Leadership Marketing and Finance ...,Lou Mobley,1
4,Clifford Visita El Hospital (Clifford El Gran...,Norman Bridwell,1
...,...,...,...
161536,Ã?Â?ber das Fernsehen.,Pierre Bourdieu,2
161537,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,Henry David Thoreau,3
161538,Ã?Â?lpiraten.,Janwillem van de Wetering,1
161539,Ã?Â?stlich der Berge.,David Guterson,1


In [91]:
# Sanity check: number of rows whose (title, author) pair repeats an earlier row.
# book_rating_counts comes from a groupby on exactly these two columns, so 0 is expected.
book_rating_counts.duplicated(subset=['Book-Title', 'Book-Author']).sum()

np.int64(0)

In [92]:
# Popular books: (title, author) pairs with 50 or more ratings
is_popular = book_rating_counts['num_book_ratings'].ge(50)
popular_books_df = book_rating_counts.loc[is_popular]
popular_books_df

Unnamed: 0,Book-Title,Book-Author,num_book_ratings
487,1984,George Orwell,71
521,1st to Die: A Novel,James Patterson,160
660,2nd Chance,James Patterson,122
805,4 Blondes,Candace Bushnell,70
1131,A Bend in the Road,Nicholas Sparks,113
...,...,...,...
160352,Year of Wonders,Geraldine Brooks,57
160576,You Belong To Me,Mary Higgins Clark,55
161210,Zen and the Art of Motorcycle Maintenance: An ...,ROBERT PIRSIG,55
161355,Zoya,Danielle Steel,59


In [93]:
# Display the experienced-user ratings again for reference.
filtered_ratings_by_user

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
1145,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1146,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll
1147,277427,003008685X,8,Pioneers,James Fenimore Cooper
1148,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau
1149,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens
...,...,...,...,...,...
1023355,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias
1023356,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt
1023357,275970,3829021860,0,The Penis Book,Joseph Cohen
1023358,275970,4770019572,0,Musashi,Eiji Yoshikawa


In [103]:
# Narrow the experienced-user ratings down to popular books by joining on the
# (title, author) pairs kept in popular_books_df.
popular_book_keys = popular_books_df[['Book-Title', 'Book-Author']]
final_ratings = pd.merge(
    filtered_ratings_by_user,
    popular_book_keys,
    on=['Book-Title', 'Book-Author'],
)
final_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver
2,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett
3,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich
4,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett
...,...,...,...,...,...
52051,275970,1400031354,0,Tears of the Giraffe (No.1 Ladies Detective Ag...,Alexander McCall Smith
52052,275970,1400031362,0,Morality for Beautiful Girls (No.1 Ladies Dete...,Alexander McCall Smith
52053,275970,1573229725,0,Fingersmith,Sarah Waters
52054,275970,1586210661,9,Me Talk Pretty One Day,David Sedaris


In [80]:
# Rows of final_ratings whose (title, author) pair occurs more than once;
# keep=False flags every occurrence.
book_key_cols = ['Book-Title', 'Book-Author']
shares_book_with_other_row = final_ratings.duplicated(subset=book_key_cols, keep=False)
duplicates = final_ratings.loc[shares_book_with_other_row]

# Sort so that all rows for the same book sit next to each other
duplicates_sorted = duplicates.sort_values(by=book_key_cols)
duplicates_sorted

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
276,254,0451524934,9,1984,George Orwell
1141,7346,0451519841,8,1984,George Orwell
2176,11676,0451519841,0,1984,George Orwell
2177,11676,0451524934,0,1984,George Orwell
2180,11676,0452262933,10,1984,George Orwell
...,...,...,...,...,...
50632,268032,0449003787,0,"\O\"" Is for Outlaw""",SUE GRAFTON
50779,268330,0449003787,0,"\O\"" Is for Outlaw""",SUE GRAFTON
50971,269566,0449003787,0,"\O\"" Is for Outlaw""",SUE GRAFTON
51094,269719,0449003787,0,"\O\"" Is for Outlaw""",SUE GRAFTON


In [95]:
# Number of rows whose (title, author) pair already appeared in an earlier row.
# A large value is expected at this point: final_ratings holds one row per rating,
# and every popular book has many ratings.
final_ratings.duplicated(subset=['Book-Title', 'Book-Author']).sum()

np.int64(51428)

In [104]:
# --- Step 4: Create the User-Item Interaction Matrix ---
# Goal: a "utility matrix" with one row per book title, one column per user ID,
# and the rating as the cell value.

# A user can have several rows for the same (title, author), e.g. one per ISBN.
# Collapse those rows into a single row holding the mean rating instead of
# dropping all but one of them.
rating_keys = ['User-ID', 'Book-Title', 'Book-Author']
final_ratings = final_ratings.groupby(rating_keys, as_index=False)['Book-Rating'].mean()
final_ratings

Unnamed: 0,User-ID,Book-Title,Book-Author,Book-Rating
0,254,1984,George Orwell,9.0
1,254,A Bend in the Road,Nicholas Sparks,0.0
2,254,ANGELA'S ASHES,Frank McCourt,0.0
3,254,American Gods,Neil Gaiman,10.0
4,254,Angela's Ashes (MMP) : A Memoir,Frank McCourt,0.0
...,...,...,...,...
50692,278418,Watership Down,Richard Adams,0.0
50693,278418,Welcome to Temptation,Jennifer Crusie,0.0
50694,278418,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,0.0
50695,278418,Where the Red Fern Grows,Wilson Rawls,0.0


In [107]:
# Verify the aggregation: after grouping on (user, title, author) no combination
# should occur more than once, so this count is expected to be 0.
final_ratings.duplicated(subset=['User-ID','Book-Title', 'Book-Author']).sum()

np.int64(0)

In [108]:
# Build the book x user matrix. Rows are keyed by title only, so if two
# (title, author) pairs share a title, their ratings from the same user are
# averaged into one cell (aggfunc='mean' is the pandas default, spelled out here).
pivot_table = pd.pivot_table(
    final_ratings,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
    aggfunc='mean',
)
pivot_table

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [110]:
# Replace NaN (the user has no rating for the book) with 0.
# The data already contains ratings equal to 0, so after this step a missing
# rating and a 0 rating can no longer be told apart.
pivot_table = pivot_table.fillna(0)
pivot_table

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
# Store the mostly-zero book x user matrix in compressed sparse row form
rating_matrix = pivot_table.to_numpy()
book_sparse_matrix = csr_matrix(rating_matrix)
book_sparse_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12731 stored elements and shape (626, 809)>

In [115]:
# --- Step 5: Train the Nearest Neighbors Model ---
# Each book is a vector of user ratings (one row of the sparse matrix);
# NearestNeighbors finds the books whose vectors lie closest to a given one.

# The cosine metric compares the direction of two rating vectors and ignores
# their magnitude. fit() returns the estimator itself, so it can be chained.
model = NearestNeighbors(metric='cosine', algorithm='brute').fit(book_sparse_matrix)
model

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [119]:
# --- Step 6: Build the Recommendation Function ---
def get_collaborative_recommendations(book_title, pivot, model, n_recommendations=5):
    """Print the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up; it must match a label of ``pivot.index`` exactly.
    pivot : pandas.DataFrame
        Book x user rating matrix whose index holds the book titles.
    model : object
        Fitted neighbours model exposing ``kneighbors(X, n_neighbors=...)``.
        Assumed to have been fitted on the rows of ``pivot``, because the
        returned neighbour indices are mapped back through ``pivot.index``.
    n_recommendations : int, default 5
        Maximum number of titles to print.

    Returns
    -------
    None
        The result is printed. If the title is unknown, a "not found" message
        is printed instead.
    """
    # Look the title up explicitly instead of wrapping the whole body in
    # ``except IndexError``, which could mislabel unrelated errors as "not found".
    matches = np.flatnonzero(pivot.index == book_title)
    if matches.size == 0:
        print(f"Book '{book_title}' not found in the filtered dataset. Please try another book.")
        return
    book_index = int(matches[0])

    # Ask for one extra neighbour (the book itself is expected among the results),
    # but never for more neighbours than there are books.
    n_neighbors = min(n_recommendations + 1, pivot.shape[0])
    query_vector = pivot.iloc[book_index, :].values.reshape(1, -1)
    _, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

    # Remove the queried book by index rather than by position: when another book
    # ties with it at distance 0, the queried book is not guaranteed to come first.
    neighbour_indices = [int(idx) for idx in indices.flatten() if int(idx) != book_index]
    neighbour_indices = neighbour_indices[:n_recommendations]

    print(f"\n--- Recommendations for '{book_title}' ---")
    for rank, idx in enumerate(neighbour_indices, start=1):
        print(f"{rank}. {pivot.index[idx]}")



In [120]:
# Try the recommender on two sample titles.
for sample_title in ("The Da Vinci Code", "The Hobbit"):
    get_collaborative_recommendations(sample_title, pivot_table, model)


--- Recommendations for 'The Da Vinci Code' ---
1. Angels &amp; Demons
2. Touching Evil
3. Saving Faith
4. Middlesex: A Novel
5. The Blue Nowhere : A Novel
Book 'The Hobbit' not found in the filtered dataset. Please try another book.


In [121]:
# --- Step 7: Save the Model Artifacts ---
# Persist the fitted model together with the pivot table; the pivot table's index
# is what maps neighbour indices back to book titles.

# Write through a context manager so the file handle is flushed and closed as soon
# as the dump finishes, instead of leaving an anonymous open() handle unclosed.
with open('../artifacts/collaborative_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
pivot_table.to_pickle('../artifacts/collaborative_pivot_table.pkl')


In [122]:
# List the artifacts directory to confirm that collaborative_model.pkl and
# collaborative_pivot_table.pkl are present.
os.listdir('../artifacts')

['tfidf_vectorizer.pkl',
 'collaborative_model.pkl',
 'ratings.pkl',
 'content_df.pkl',
 'books.pkl',
 'users.pkl',
 'feature_vectors.pkl',
 'collaborative_pivot_table.pkl',
 '.ipynb_checkpoints',
 'popular_books.pkl']