# Book Recommendation System - PART 2 (Modeling)

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for content based filtering
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

#for collaborative filtering
import os
import math
import random

from surprise import accuracy, Reader, Dataset, dump
from surprise import NormalPredictor, KNNBasic, SVD, SVDpp
from surprise.model_selection import cross_validate, GridSearchCV

## Import Dataset

In [3]:
pd.options.display.float_format = '{:.2f}'.format
ratings = pd.read_csv('Dataset/ratings.csv')
books = pd.read_csv('Dataset/books_cleaned.csv')

In [4]:
books.head()

Unnamed: 0.1,Unnamed: 0,book_id,title,authors,year,pages,description,genres,average_rating,ratings_count,books_count,small_image_url
0,0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2008,374,winning means fame and fortunelosing means cer...,"youngadult, fiction, fantasy, sciencefiction, ...",4.34,4780653,272,https://images.gr-assets.com/books/1447303603s...
1,1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",1997,309,harry potters life is miserable his parents ar...,"fantasy, fiction, youngadult, classics",4.44,4602479,491,https://images.gr-assets.com/books/1474154022s...
2,2,3,"Twilight (Twilight, #1)",Stephenie Meyer,2005,501,about three things i was absolutely positive ...,"youngadult, fantasy, romance, fiction, paranormal",3.57,3866839,226,https://images.gr-assets.com/books/1361039443s...
3,3,4,To Kill a Mockingbird,Harper Lee,1960,324,the unforgettable novel of a childhood in a sl...,"classics, fiction, historicalfiction, youngadult",4.25,3198671,487,https://images.gr-assets.com/books/1361975680s...
4,4,5,The Great Gatsby,F. Scott Fitzgerald,1925,200,alternate cover edition isbn isbn13 the great...,"classics, fiction, historicalfiction, romance",3.89,2683664,1356,https://images.gr-assets.com/books/1490528560s...


In [5]:
books.columns

Index(['Unnamed: 0', 'book_id', 'title', 'authors', 'year', 'pages',
       'description', 'genres', 'average_rating', 'ratings_count',
       'books_count', 'small_image_url'],
      dtype='object')

In [6]:
books = books.drop(columns=['Unnamed: 0'], axis=1)

In [7]:
books.columns

Index(['book_id', 'title', 'authors', 'year', 'pages', 'description', 'genres',
       'average_rating', 'ratings_count', 'books_count', 'small_image_url'],
      dtype='object')

In [8]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


## 1. Simple Recommendation

One of the easiest way to give recommendation is to rank the book based on `average_rating` or `rating_count` (popularity). However, as we mentioned in EDA, we found:
1. Book with relatively lower number of `rating_count` (less popular) when we rank book based on `average_rating`.
2. Book with relatively lower `average_rating` when we ranked book based on `rating_count`.

Therefore, we need to make a weighted rating of `average_rating` and `rating_count`.

### a. Recommendation based on Weighted Average of Rating and Popularity

One of the easiest way to give recommendation is to rank the book in metadata based on average_rating or rating_count (popularity). However, as we mentioned in the EDA phase, we found:
1. Book with relatively lower number of rating_count (less popular) when we rank book based on average_rating.
2. Book with relatively lower average_rating when we ranked book based on rating_count.

Therefore, we need to make a new scoring of average_rating and rating_count. In this case, I will use rating formula like the one used in IMDB site to determine the Top Rated 250 Movies.

New Rating Score is determined by the following equation:
![New Rating Formula](new_rating_score.png "New Rating Score")

where:<br>
v = number of ratings (`ratings_count`)<br>
m = minimum `ratings_count` required to be recommended<br>
R = average of ratings (`average_rating`) <br>
C = the mean ratings for all books


Now let's determine the appropriate value for m, the number of votes needed to be listed in the chart. For this simple recommender, our cutoff will be the 95th percentile. In order for a book to appear in the recommendation, it must receive at least 95% of the other books on the list (around 2100 ratings).

In [9]:
def simple_recommender(books, n=5):
    v = books['ratings_count']
    m = books['ratings_count'].quantile(0.95)
    R = books['average_rating']
    C = books['average_rating'].median()
    score = (v/(v+m) * R) + (m/(m+v) * C)   
    books['score'] = score
    
    qualified  = books.sort_values('score', ascending=False)
    return qualified[['book_id', 'title', 'authors', 'average_rating', 'ratings_count','score']].head(n)

In [10]:
recommend = simple_recommender(books)
type(recommend)

pandas.core.frame.DataFrame

In [11]:
books = recommend.to_dict()
len(books)

6

### b. Evaluation

This system offers generalized recommendations to every user based on popularity and average rating of the book. The recommender some flaws. For example, it makes the same suggestion to everyone, regardless of their own preferences. The top of our chart is full with J.K. Rowling's Harry Potter novels.

In order to personalize of our recommendations, we are going to create recommendation system that compares books based on a set of metrics and suggests books that are most similar to a particular book that a user liked.

### 2. Collaborative Filtering

In [12]:
#to have reproducible experiments
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [13]:
#Load the full dataset
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(ratings, reader)

In [14]:
#shuffle the ratings for unbiased result
all_ratings = data.raw_ratings
random.shuffle(all_ratings)

In [15]:
#split data into train and test data with the ratio 70:30
threshold = int(0.7 * len(all_ratings))
train_ratings = all_ratings[:threshold]
test_ratings = all_ratings[threshold:]

In [16]:
def book_read(user_id):
    '''Take user_id and return list of book that user has read'''
    books_list = list(books['book_id'])
    book_read_list = list(ratings['book_id'][ratings['user_id'] == user_id])
    return books_list, book_read_list

### Singular Value Decomposition (SVD)

The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. SVD finds the latent factors associated with some matrix. SVD will decompose user-rating matrix into matrices that represents latent user-user features and item-item features.

In [17]:
#change data to trainset
data.raw_ratings = train_ratings 

#select algorithm
svd = SVD(random_state=0)

In [18]:
%%time

#cross validation for train data
svd_result = cross_validate(svd, data, measures=["RMSE"], cv=5, verbose=True, n_jobs = 2)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8503  0.8500  0.8484  0.8503  0.8510  0.8500  0.0009  
Fit time          66.06   61.25   70.27   67.95   44.72   62.05   9.16    
Test time         16.97   21.47   19.31   21.14   10.16   17.81   4.15    
CPU times: total: 2min 24s
Wall time: 4min 42s


In [19]:
%%time

#retrain whole train test
trainset = data.build_full_trainset()
svd.fit(trainset)

# Compute RMSE on trainset (without fold)
svd_train_pred = svd.test(trainset.build_testset())
print('Train RMSE:')
train_rmse = accuracy.rmse(svd_train_pred)

#compute RMSE on testset
testset = data.construct_testset(test_ratings)
svd_test_pred = svd.test(testset)
print('Test RMSE:')
test_rmse = accuracy.rmse(svd_test_pred)

Train RMSE:
RMSE: 0.6441
Test RMSE:
RMSE: 0.8386
CPU times: total: 2min 56s
Wall time: 3min 3s


In [20]:
def get_recommendation_svd(user_id, n=5):
    '''Give n recommendation to user_id'''
    
    all_books, user_books =  book_read(user_id)
    next_books = [book for book in all_books if book not in user_books]
    
    if n <= len(next_books):
        ratings = []
        for book in next_books:
            est = svd.predict(user_id, book).est
            ratings.append((book, est))
        ratings = sorted(ratings, key=lambda x: x[1], reverse=True)
        book_ids = [id for id, rate in ratings[:n]]
        return books[books.book_id.isin(book_ids)][['book_id', 'title', 'authors', 'year', 'pages', 'description', 'genres', 'average_rating', 'small_image_url']]
    else:
        print('Please reduce your recommendation request')
        return

In [21]:
get_recommendation_svd(2, 5)

Please reduce your recommendation request


In [22]:
def get_books_by_genre(df, genres):
    return df[df.genres.str.contains('|'.join(genres), case=False)]

In [None]:
type(genres)

pandas.core.frame.DataFrame

## Saving the SVD model

In [24]:
import pickle


with open('model.pkl', 'wb') as f:
    pickle.dump(svd,f)

In [35]:
books

{'book_id': {21: 25, 23: 27, 15: 18, 20: 24, 1: 2},
 'title': {21: 'Harry Potter and the Deathly Hallows (Harry Potter, #7)',
  23: 'Harry Potter and the Half-Blood Prince (Harry Potter, #6)',
  15: 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
  20: 'Harry Potter and the Goblet of Fire (Harry Potter, #4)',
  1: "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"},
 'authors': {21: 'J.K. Rowling, Mary GrandPré',
  23: 'J.K. Rowling, Mary GrandPré',
  15: 'J.K. Rowling, Mary GrandPré, Rufus Beck',
  20: 'J.K. Rowling, Mary GrandPré',
  1: 'J.K. Rowling, Mary GrandPré'},
 'average_rating': {21: 4.61, 23: 4.54, 15: 4.53, 20: 4.53, 1: 4.44},
 'ratings_count': {21: 1746574,
  23: 1678823,
  15: 1832823,
  20: 1753043,
  1: 4602479},
 'score': {21: 4.557539449718259,
  23: 4.492069644048632,
  15: 4.486605122534423,
  20: 4.484805254011454,
  1: 4.425000618173237}}

In [36]:
books_df = pd.read_csv("Dataset/books_cleaned.csv")

In [39]:
books_df.head()

Unnamed: 0.1,Unnamed: 0,book_id,title,authors,year,pages,description,genres,average_rating,ratings_count,books_count,small_image_url
0,0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2008,374,winning means fame and fortunelosing means cer...,"youngadult, fiction, fantasy, sciencefiction, ...",4.34,4780653,272,https://images.gr-assets.com/books/1447303603s...
1,1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",1997,309,harry potters life is miserable his parents ar...,"fantasy, fiction, youngadult, classics",4.44,4602479,491,https://images.gr-assets.com/books/1474154022s...
2,2,3,"Twilight (Twilight, #1)",Stephenie Meyer,2005,501,about three things i was absolutely positive ...,"youngadult, fantasy, romance, fiction, paranormal",3.57,3866839,226,https://images.gr-assets.com/books/1361039443s...
3,3,4,To Kill a Mockingbird,Harper Lee,1960,324,the unforgettable novel of a childhood in a sl...,"classics, fiction, historicalfiction, youngadult",4.25,3198671,487,https://images.gr-assets.com/books/1361975680s...
4,4,5,The Great Gatsby,F. Scott Fitzgerald,1925,200,alternate cover edition isbn isbn13 the great...,"classics, fiction, historicalfiction, romance",3.89,2683664,1356,https://images.gr-assets.com/books/1490528560s...


In [37]:
def get_book_id(book_title):
    book = books_df[books_df['title']==book_title]
    if len(book) > 0:
        book_id = books_df['book_id'].iloc[0]
        return book_id
    else:
        return None


In [42]:
get_book_id("The Great Gatsby")

1

In [28]:
def get_new_user_id(title_ratings, ratings_df):
    # Get book IDs for the given book titles
    book_ids = [get_book_id(title) for title in title_ratings.keys()]
    
    # Create a new row for the new user
    new_user_id = ratings_df['user_id'].max() + 1
    new_user_ratings = pd.DataFrame({
        'user_id': [new_user_id] * len(book_ids),
        'book_id': book_ids,
        'rating': list(title_ratings.values())
    })
    
    # Append the new user's ratings to the existing ratings dataframe
    updated_ratings_df = pd.concat([rating_df, new_user_ratings], ignore_index=True)
    
    # Return the ID of the new user and the updated ratings dataframe
    return new_user_id, updated_ratings_df


In [43]:
title_ratings = {'The Hunger Games': 5, 'To Kill a Mockingbird': 4, 'Pride and Prejudice': 3}

# Get the ID of the new user and the updated ratings dataframe
new_user_id, updated_ratings_df = get_new_user_id(title_ratings, ratings)

NameError: name 'rating_df' is not defined

In [29]:
title_ratings = {'The Hunger Games': 5, 'To Kill a Mockingbird': 4, 'Pride and Prejudice': 3}

# Get the ID of the new user and the updated ratings dataframe
new_user_id, updated_ratings_df = get_new_user_id(title_ratings, ratings)

KeyError: False