In [21]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [23]:
# Read the two semicolon-delimited, Latin-1 encoded CSV files.
# Rows that cannot be parsed are skipped instead of aborting the load.
ratings = pd.read_csv(
    'BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)
books = pd.read_csv(
    'BX_Books.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    low_memory=False,
)

# Replace the file headers with the column names used by the rest of the notebook.
books.columns = [
    'ISBN',
    'bookTitle',
    'bookAuthor',
    'yearOfPublication',
    'publisher',
    'imageUrlS',
    'imageUrlM',
    'imageUrlL',
]
ratings.columns = ['userID', 'ISBN', 'bookRating']

# Keep only rows with a strictly positive rating.
# NOTE(review): presumably 0 marks an implicit / missing rating in this dataset -- confirm.
ratings = ratings.loc[ratings['bookRating'].gt(0)]


In [24]:
# Step 1: keep only users who contributed more than 50 ratings.
active_users = ratings['userID'].value_counts()
filtered_users = active_users.loc[active_users.gt(50)].index
ratings = ratings.loc[ratings['userID'].isin(filtered_users)]

# Step 2: among the remaining rows, keep only books rated more than 20 times.
# Because this runs after the user filter, a surviving user may end up with
# 50 or fewer rows once unpopular books are removed.
popular_books = ratings['ISBN'].value_counts()
filtered_books = popular_books.loc[popular_books.gt(20)].index
ratings = ratings.loc[ratings['ISBN'].isin(filtered_books)]


In [25]:
# Build the user x book matrix: one row per userID, one column per ISBN.
# pivot_table aggregates duplicate (user, book) pairs with its default
# aggregation; user/book pairs without a rating become 0.
rating_matrix = (
    ratings
    .pivot_table(values='bookRating', index='userID', columns='ISBN')
    .fillna(0)
)
print("✅ Rating matrix shape:", rating_matrix.shape)


✅ Rating matrix shape: (1157, 325)


In [26]:
# Pairwise cosine similarity between the rows (users) of the rating matrix.
user_similarity = cosine_similarity(rating_matrix)

# Wrap the raw array in a DataFrame labelled by user ID on both axes so that
# similarities can be looked up by user ID instead of by position.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=rating_matrix.index,
    columns=rating_matrix.index,
)


In [27]:
def get_similar_users(user_id, n=5):
    """Return the IDs of the ``n`` users most similar to ``user_id``.

    Similarities are read from the ``user_id`` column of the module-level
    ``user_similarity_df``.

    Parameters
    ----------
    user_id : label
        A label expected to be present in ``user_similarity_df.index``.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Index
        Up to ``n`` user IDs ordered from most to least similar, never
        containing ``user_id`` itself. When ``user_id`` is unknown a message
        is printed and an empty index is returned.
    """
    if user_id not in user_similarity_df.index:
        print(f"❌ User {user_id} not found.")
        # An empty slice keeps the return type an Index (same dtype as the
        # success path), so callers can use `.empty` or `len()` on the result.
        return user_similarity_df.index[:0]

    # Remove the user by label instead of skipping the first sorted entry:
    # when another user ties at the maximum similarity, the user themself is
    # not guaranteed to be sorted first and could leak into the result.
    neighbours = user_similarity_df[user_id].drop(labels=user_id)
    return neighbours.nlargest(n).index



In [30]:
def recommend_books(user_id, n=5, n_similar=5):
    """Print the top ``n`` book recommendations for ``user_id``.

    Candidate books are those that the user's nearest neighbours (as returned
    by ``get_similar_users``) rated above 0 and that the user has not rated.
    Each candidate is scored by the plain mean of the neighbours' positive
    ratings; the mean is not weighted by similarity. Candidates with equal
    scores keep the order in which they were first encountered.

    Parameters
    ----------
    user_id : label
        Row label of the module-level ``rating_matrix``.
    n : int, default 5
        Number of recommendations to print.
    n_similar : int, default 5
        Number of neighbouring users whose ratings are considered.

    Returns
    -------
    None
        Results are printed. Nothing is printed by this function when no
        similar users are available (e.g. for an unknown ``user_id``).
    """
    similar_users = get_similar_users(user_id, n=n_similar)
    # len() works for both a pandas Index and a plain list, so an unknown
    # user ends quietly here instead of raising AttributeError on `.empty`.
    if len(similar_users) == 0:
        return

    own_ratings = rating_matrix.loc[user_id]
    user_books = set(own_ratings[own_ratings > 0].index)
    scores = {}

    for sim_user in similar_users:
        sim_ratings = rating_matrix.loc[sim_user]
        # Visit only the books this neighbour actually rated rather than
        # doing a label lookup for every column of the matrix.
        for book, rating in sim_ratings[sim_ratings > 0].items():
            if book not in user_books:
                scores.setdefault(book, []).append(rating)

    avg_scores = {book: sum(score_list) / len(score_list) for book, score_list in scores.items()}
    # sorted() is stable, so ties preserve first-seen order.
    top_books = sorted(avg_scores.items(), key=lambda x: x[1], reverse=True)[:n]

    print(f"\n📚 Top {n} Book Recommendations for User {user_id}:\n")
    for isbn, score in top_books:
        # Fall back to 'Unknown' when the ISBN has no entry in the books table.
        title = books.loc[books['ISBN'] == isbn, 'bookTitle'].values
        print(f"✔️ {title[0] if len(title) else 'Unknown'} (Score: {score:.2f})")

In [31]:
# Use the first row label of the rating matrix as a user ID that is
# guaranteed to exist, then print that user's recommendations.
sample_user = rating_matrix.index[0]
print("✅ Sample valid user ID:", sample_user)
recommend_books(sample_user, n=5)


✅ Sample valid user ID: 254

📚 Top 5 Book Recommendations for User 254:

✔️ The Amazing Adventures of Kavalier & Clay (Score: 10.00)
✔️ The Valley of Horses (Score: 10.00)
✔️ Harry Potter and the Order of the Phoenix (Book 5) (Score: 10.00)
✔️ The Red Tent (Bestselling Backlist) (Score: 10.00)
✔️ White Oleander : A Novel (Score: 10.00)
