In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix
from sklearn.cluster import KMeans

class UserClusterCalculator:
    def __init__(self, df):
        self.df = df
    
    def load_data(self):
        print('Loading data...')
        
        # Filter relevant columns
        ratings_df = self.df[['User-ID', 'ISBN', 'Book-Rating']]
        
        # Create user-item matrix
        user_item_matrix = ratings_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
        
        # Convert to sparse matrix
        user_ratings = dok_matrix(user_item_matrix.values, dtype=np.float32)
        
        print('Data loaded.')
        return user_item_matrix.index.tolist(), user_ratings
    
    def calculate(self, k=23):
        print('Training K-Means clustering...')
        user_ids, user_ratings = self.load_data()
        
        # Fit KMeans clustering
        kmeans = KMeans(n_clusters=k, random_state=42)
        clusters = kmeans.fit(user_ratings.tocsr())
        
        # Save cluster assignments in DataFrame
        self.df['Cluster'] = self.df['User-ID'].map(dict(zip(user_ids, clusters.labels_)))
        
        print('Clustering complete.')
        return clusters
    
    def filter_users_by_book(self, book_title):

        first_users = self.df.loc[self.df["Book-Title"].str.contains(book_title, na=False), 'User-ID']
        filtered_df = self.df[self.df['User-ID'].isin(first_users)].copy()
        return UserClusterCalculator(filtered_df)  # Ensure returning an instance


if __name__ == '__main__':
    print('Calculating user clusters...')
    df = pd.read_csv('C:/Users/vmaru/Desktop/Economics 2/clean_df.csv')  # Load your dataset
    cluster_calculator = UserClusterCalculator(df).filter_users_by_book("The Lord of the Rings")
    clusters = cluster_calculator.calculate(k=15)
    
clust_df = cluster_calculator.df


Calculating user clusters...
Training K-Means clustering...
Loading data...
Data loaded.
Clustering complete.


In [2]:
def opt_group(df, str):
    book_counts = df[df['Book-Title'].str.contains(str, na=False)]\
    .groupby('Cluster')['Book-Title'].count()

    return int(np.where(book_counts == max(book_counts))[0])

clust_n = opt_group(clust_df, "The Lord of the Rings")

filt_clust = clust_df[clust_df['Cluster'] == clust_n]



  return int(np.where(book_counts == max(book_counts))[0])


In [3]:
from scipy.sparse import lil_matrix


user_filt = filt_clust['User-ID'].unique()

book_filt = filt_clust['ISBN'].unique()

n_book = len(book_filt)

n_user = len(user_filt)

matrix = lil_matrix((n_user, n_book), dtype=float)

# Create user and book index mappings
user_to_index = {user: idx for idx, user in enumerate(user_filt)}
book_to_index = {book: idx for idx, book in enumerate(book_filt)}


# Populate the sparse matrix
for _, row in filt_clust.iterrows():
    user_idx = user_to_index[row['User-ID']]
    book_idx = book_to_index[row['ISBN']]
    matrix[user_idx, book_idx] = row['Book-Rating']

from scipy.sparse import csr_matrix

rating_matrix = matrix.tocsr()

from sklearn.metrics.pairwise import cosine_similarity

# Transpose to get items (books) as rows
item_similarity = cosine_similarity(rating_matrix.T)

item_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], shape=(13220, 13220))

In [4]:
index_to_book = {idx: book for book, idx in book_to_index.items()}

def recommend_books_based_on_book(book_id, top_n=5):
    if book_id not in book_to_index:
        return pd.DataFrame(columns=["Book-Title", "ISBN", "Similarity-Score"])

    # Get the index of the input book
    book_idx = book_to_index[book_id]
    
    # Get similarity scores for this book
    similar_books = list(enumerate(item_similarity[book_idx]))
    
    # Sort by similarity scores (highest first), excluding the input book itself
    similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)
    
    # Select top N similar books (excluding the input book itself)
    recommended_books = [
        (index_to_book[idx], round(float(score), 4))
        for idx, score in similar_books[1:top_n + 1]
    ]
    
    # Create a DataFrame for the recommendations
    recommendations_df = pd.DataFrame(
        {
            "Book-Title": [
                filt_clust.loc[filt_clust["ISBN"] == isbn, "Book-Title"].values[0]
                for isbn, _ in recommended_books
            ],
            "Similarity-Score": [score for _, score in recommended_books],
        }
    )
    return recommendations_df


In [6]:
book_id = '0618260250'
recommendations = recommend_books_based_on_book(book_id, top_n=10)
print(recommendations)


                                          Book-Title  Similarity-Score
0            The Girls' Guide to Hunting and Fishing            0.4235
1        Confessions of an Ugly Stepsister : A Novel            0.4189
2  The Devil in the White City : Murder, Magic, a...            0.3535
3                               The Tortilla Curtain            0.3446
4      Shock Wave (Dirk Pitt Adventures (Paperback))            0.3446
5                                The Dominant Blonde            0.3446
6             The Silver Chair (full color) (Narnia)            0.3446
7                     The Magician's Nephew (Narnia)            0.3446
8      Ender's Game (Ender Wiggins Saga (Paperback))            0.3446
9                                      Black Rainbow            0.3446
