In [56]:
# This notebook contains all of the code for the project. Install every package imported below before running it; otherwise the cells will fail.

In [57]:
#Import all the libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD  
from sklearn.model_selection import train_test_split
from difflib import SequenceMatcher

In [58]:
# Locations of the three CSV files, relative to the notebook's working directory
books_path, ratings_path, users_path = (
    '../Dataset/Books.csv',
    '../Dataset/Ratings.csv',
    '../Dataset/Users.csv',
)

# Read each file into its own DataFrame.
# 'Year-Of-Publication' is kept as raw objects at load time; it is converted
# to a number in the preprocessing cell.
users = pd.read_csv(users_path)
ratings = pd.read_csv(ratings_path)
books = pd.read_csv(books_path, dtype={'Year-Of-Publication': object})


In [59]:
# Preprocessing steps (as described earlier)

# Preprocessing Books DataFrame
# Publication years that cannot be parsed become 0 so the column can be stored as int.
books['Year-Of-Publication'] = (
    pd.to_numeric(books['Year-Of-Publication'], errors='coerce').fillna(0).astype(int)
)
# Drop the cover-image URL columns. errors='ignore' keeps this cell re-runnable:
# a second run no longer raises KeyError because the columns are already gone.
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')

# Preprocessing Ratings DataFrame
# Non-numeric ratings become NaN.
ratings['Book-Rating'] = pd.to_numeric(ratings['Book-Rating'], errors='coerce')

# Preprocessing Users DataFrame
# Coerce to numbers first, so the median used as the fill value is computed
# from parsed ages rather than from the raw column.
users['Age'] = pd.to_numeric(users['Age'], errors='coerce')
# Fill missing ages with the median, then clamp to the 10..100 range.
users['Age'] = users['Age'].fillna(users['Age'].median()).clip(lower=10, upper=100).astype(int)

In [60]:
# Keep only the first rating for each (user, book) pair
deduped_ratings = ratings.drop_duplicates(subset=['User-ID', 'ISBN'])

# Hold out 20% of the de-duplicated ratings; the fixed seed makes the split repeatable
train, test = train_test_split(
    deduped_ratings,
    test_size=0.2,
    random_state=42,
)

# Evaluation metric
def rmse(actual, predicted):
    """Return the root-mean-squared error between `actual` and `predicted`.

    Both arguments are passed straight to sklearn's `mean_squared_error`;
    the square root of that value is returned.
    """
    return np.sqrt(mean_squared_error(actual, predicted))

In [61]:
# Pivot ratings matrix
def pivot_ratings(ratings):
    """Pivot long-format ratings into a user x book matrix.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Needs 'User-ID' and 'ISBN' columns plus a score column named
        'Book-Rating' (the name used by this notebook's ratings frame) or,
        failing that, 'Rating'.

    Returns
    -------
    pandas.DataFrame
        One row per user, one column per ISBN; cells without a rating are 0.

    Raises
    ------
    KeyError
        If neither score column is present.
    ValueError
        If a (User-ID, ISBN) pair occurs more than once (raised by `pivot`).
    """
    # The notebook's ratings frame calls the score column 'Book-Rating';
    # 'Rating' is still accepted for frames that use that name.
    value_col = 'Book-Rating' if 'Book-Rating' in ratings.columns else 'Rating'
    return ratings.pivot(index='User-ID', columns='ISBN', values=value_col).fillna(0)


In [62]:
# Similarity function
def book_similarity(title1, title2):
    """Score how alike two titles are, from 0.0 (nothing in common) to 1.0 (identical).

    The score is `difflib.SequenceMatcher.ratio()` of the two strings, with
    no junk heuristic function supplied.
    """
    matcher = SequenceMatcher(a=title1, b=title2)
    return matcher.ratio()


In [63]:
# Pivot ratings matrix
# NOTE: pivot_ratings is also defined in an earlier cell of this notebook; the
# definition in this cell is the one in effect once the notebook has run top to bottom.
def pivot_ratings(ratings):
    """Pivot long-format ratings into a user x book matrix.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Needs 'User-ID' and 'ISBN' columns plus a score column named
        'Book-Rating' (the name used by this notebook's ratings frame) or,
        failing that, 'Rating'.

    Returns
    -------
    pandas.DataFrame
        One row per user, one column per ISBN; cells without a rating are 0.

    Raises
    ------
    KeyError
        If neither score column is present.
    ValueError
        If a (User-ID, ISBN) pair occurs more than once (raised by `pivot`).
    """
    # The notebook's ratings frame calls the score column 'Book-Rating';
    # 'Rating' is still accepted for frames that use that name.
    value_col = 'Book-Rating' if 'Book-Rating' in ratings.columns else 'Rating'
    return ratings.pivot(index='User-ID', columns='ISBN', values=value_col).fillna(0)


In [64]:
def content_based_recommendations(user_preferences, books, n_recs=5):
    """Recommend books from a user's stated favorite authors and titles.

    Parameters
    ----------
    user_preferences : dict
        May contain 'favorite_authors' (author names matched exactly against
        the 'Book-Author' column) and 'favorite_books' (titles matched
        approximately against the 'Book-Title' column). A missing key is
        treated as an empty list.
    books : pandas.DataFrame
        Catalogue with at least 'Book-Author' and 'Book-Title' columns.
    n_recs : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most `n_recs` distinct rows with a fresh 0..k-1 index: books by
        the favorite authors first, followed by the closest title match for
        each favorite book.

    Notes
    -----
    The closest title match can be the favorite book itself when it is in
    the catalogue, because an identical title gets the highest similarity.
    """
    # Books written by any of the favorite authors
    favorite_authors = user_preferences.get('favorite_authors', [])
    auth_books = books[books['Book-Author'].isin(favorite_authors)]

    # For each favorite book, pick the catalogue row with the most similar title.
    # Missing titles are skipped because they cannot be compared as strings.
    favorite_books = user_preferences.get('favorite_books', [])
    titles = books['Book-Title'].dropna().astype(str)
    similar_books = []
    if not titles.empty:  # idxmax() is undefined on an empty catalogue
        for book in favorite_books:
            scores = titles.map(lambda title: book_similarity(title, book))
            similar_books.append(books.loc[scores.idxmax()])

    # Combine both sources; only add the title matches when there are some,
    # so an empty frame is never concatenated.
    frames = [auth_books]
    if similar_books:
        frames.append(pd.DataFrame(similar_books))
    recs = pd.concat(frames, ignore_index=True)

    # A title match may also be a favorite-author book: list it only once.
    recs = recs.drop_duplicates().reset_index(drop=True)
    return recs.head(n_recs)

In [65]:
def collab_recommendations(user_id, ratings, books, n_recs=5, n_neighbors=5):
    """Recommend books liked by the users most similar to `user_id`.

    Similarity is the cosine similarity between users' rating vectors, where
    a book a user has not rated counts as 0. It is computed directly on the
    long-format ratings (dot products over co-rated books divided by the
    vector norms), so no dense user x book matrix is built.

    Parameters
    ----------
    user_id : scalar
        Value of 'User-ID' to recommend for.
    ratings : pandas.DataFrame
        Needs 'User-ID', 'ISBN' and a score column named 'Book-Rating'
        (or, failing that, 'Rating').
    books : pandas.DataFrame
        Catalogue with an 'ISBN' column.
    n_recs : int, default 5
        Maximum number of books to return.
    n_neighbors : int, default 5
        Number of most-similar users whose ratings are considered.

    Returns
    -------
    pandas.DataFrame
        Rows of `books` (original index labels kept) that the target user has
        not rated, ordered by the highest rating any neighbor gave them.
        Empty, with the columns of `books`, when the user has no usable
        ratings or shares no positively correlated neighbor.
    """
    rating_col = 'Book-Rating' if 'Book-Rating' in ratings.columns else 'Rating'
    no_recs = books.iloc[0:0]

    # One row per (user, book), keeping the first occurrence
    pairs = ratings[['User-ID', 'ISBN', rating_col]].drop_duplicates(['User-ID', 'ISBN'])
    # Everything the target user has an entry for is excluded from the output,
    # whatever its rating value is.
    seen_isbns = set(pairs.loc[pairs['User-ID'] == user_id, 'ISBN'])

    scored = pairs.dropna(subset=[rating_col])
    target = scored[scored['User-ID'] == user_id]
    others = scored[scored['User-ID'] != user_id]
    target_norm = np.sqrt((target[rating_col] ** 2).sum())
    # A user with no ratings (or only zeros) has no direction to compare against.
    if others.empty or target_norm == 0:
        return no_recs

    # Dot product with the target: only co-rated books contribute.
    shared = others.merge(target[['ISBN', rating_col]], on='ISBN', suffixes=('', '_target'))
    if shared.empty:
        return no_recs
    dot = (shared[rating_col] * shared[rating_col + '_target']).groupby(shared['User-ID']).sum()

    # Norms use every rating of the overlapping users, not just the shared books.
    overlapping = others[others['User-ID'].isin(dot.index)]
    norms = np.sqrt((overlapping[rating_col] ** 2).groupby(overlapping['User-ID']).sum())

    # 0/0 (users whose ratings are all zero) yields NaN and is dropped.
    similarity = (dot / (norms * target_norm)).dropna()
    similarity = similarity[similarity > 0]
    if similarity.empty:
        return no_recs
    neighbors = similarity.nlargest(n_neighbors).index

    # Books the neighbors rated above 0 and the target user has not seen
    candidates = others[
        others['User-ID'].isin(neighbors)
        & ~others['ISBN'].isin(seen_isbns)
        & (others[rating_col] > 0)
    ]
    # Best rating per book, highest first; mergesort keeps ties in ISBN order.
    best = candidates.groupby('ISBN')[rating_col].max().sort_values(
        ascending=False, kind='mergesort'
    )

    # Look the ISBNs up in the 'ISBN' column (the catalogue index is not the
    # ISBN) and return them in ranking order.
    in_catalog = books[books['ISBN'].isin(best.index)].drop_duplicates('ISBN')
    rank = pd.Series(np.arange(len(best)), index=best.index)
    order = in_catalog['ISBN'].map(rank).to_numpy().argsort(kind='stable')
    return in_catalog.iloc[order].head(n_recs)

In [66]:
def hybrid_recommendations(user_id, user_prefs, ratings, books, n=5):
    """Blend collaborative and content-based recommendations for one user.

    The number of rows `ratings` holds for the user decides the mix:

    * 10 or more: up to n//2 collaborative picks;
    * 5 to 9: up to n*2//3 collaborative picks;
    * fewer than 5: content-based only.

    In the blended cases the remaining slots are filled with content-based
    picks, so the result reaches `n` rows whenever enough books are available.

    Parameters
    ----------
    user_id : scalar
        Value of 'User-ID' to recommend for.
    user_prefs : dict
        Preferences passed on to `content_based_recommendations`.
    ratings : pandas.DataFrame
        Ratings with a 'User-ID' column; passed on to `collab_recommendations`.
    books : pandas.DataFrame
        Book catalogue passed on to both recommenders.
    n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        At most `n` rows; collaborative picks come first.
    """
    n_rated = int((ratings['User-ID'] == user_id).sum())

    if n_rated >= 10:
        cf_quota = n // 2
    elif n_rated >= 5:
        cf_quota = n * 2 // 3
    else:
        # Too little rating history: only content-based filtering
        return content_based_recommendations(user_prefs, books, n)

    cf_recs = collab_recommendations(user_id, ratings, books, cf_quota)
    # Ask for n content-based picks rather than a fixed share: the integer
    # quotas alone (e.g. 2 + 2 for n=5) would leave the result short of n.
    cb_recs = content_based_recommendations(user_prefs, books, n)

    combined = pd.concat([cf_recs, cb_recs], ignore_index=True)
    # The same book may come from both recommenders; keep its first occurrence.
    if 'ISBN' in combined.columns:
        combined = combined.drop_duplicates('ISBN')
    return combined.head(n).reset_index(drop=True)

In [67]:
def evaluate_cf(model_predictions, actual_ratings):
    """Print the RMSE of collaborative-filtering predictions.

    `actual_ratings` and `model_predictions` are handed to sklearn's
    `mean_squared_error`; the square root is printed with three decimals.
    Nothing is returned.
    """
    # Root of the mean squared error, computed in one step
    rmse = np.sqrt(mean_squared_error(actual_ratings, model_predictions))

    print(f'Collaborative Filtering RMSE: {rmse:.3f}')


In [68]:
# Example usage: recommendations for one user with hand-written preferences
user_id = 12345  # Replace with a user ID
user_preferences = {
    "favorite_authors": ["J.K. Rowling", "Isaac Asimov"],
    "favorite_books": ["Harry Potter and the Sorcerer's Stone"],
}
# Arguments are named explicitly; the default number of recommendations is used
recommended_books = hybrid_recommendations(
    user_id=user_id,
    user_prefs=user_preferences,
    ratings=ratings,
    books=books,
)
print(recommended_books)

         ISBN                                         Book-Title  \
0  0671721038                       The New Hugo Winners, Vol. 2   
1  0446670626                I, Robot:The Illustrated Screenplay   
2  0385177259  Foundation's Edge (Foundation Novels (Hardcover))   
3  0380699176             100 Great Fantasy Short, Short Stories   
4  0345309014  Second Foundation (Foundation Novels (Paperback))   

    Book-Author  Year-Of-Publication                           Publisher  
0  Isaac Asimov                 1991                                Baen  
1  Isaac Asimov                 1994                        Warner Books  
2  Isaac Asimov                 1982               Bantam Dell Pub Group  
3  Isaac Asimov                 1987  Harper Mass Market Paperbacks (Mm)  
4  Isaac Asimov                 1983                    Ballantine Books  
