**Book Recommendation system using Collaborative Filtering (Nearest Neighbours)**


In [1]:
#Importing the dataset from kaggle

!pip install -q kaggle

from google.colab import files
files.upload()

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d zygmunt/goodbooks-10k

! unzip goodbooks-10k.zip

Saving kaggle.json to kaggle.json
Downloading goodbooks-10k.zip to /content
 52% 6.00M/11.6M [00:00<00:00, 59.7MB/s]
100% 11.6M/11.6M [00:00<00:00, 56.5MB/s]
Archive:  goodbooks-10k.zip
  inflating: book_tags.csv           
  inflating: books.csv               
  inflating: ratings.csv             
  inflating: sample_book.xml         
  inflating: tags.csv                
  inflating: to_read.csv             


In [2]:
#Importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

  import pandas.util.testing as tm


In [3]:
#Loading the datasets

books= pd.read_csv("books.csv",error_bad_lines = False)             #Containing book details
book_tags = pd.read_csv("book_tags.csv",error_bad_lines = False)    #Books with their tags
ratings = pd.read_csv("ratings.csv", error_bad_lines = False)       #ratings from the users for the books
tags = pd.read_csv("tags.csv", error_bad_lines = False)             #genres corresponding to their tag

In [4]:
#Look at the datasets
display(books.head(2))
print("\n")
print(books.info())

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  in

In [5]:
display(book_tags.head(2))
print("\n")
print(book_tags.info())

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999912 entries, 0 to 999911
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   goodreads_book_id  999912 non-null  int64
 1   tag_id             999912 non-null  int64
 2   count              999912 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB
None


In [6]:
display(ratings.head(2))
print("\n")
print(ratings.info())

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB
None


In [8]:
display(tags.tail(2))
print("\n")
print(tags.info())

Unnamed: 0,tag_id,tag_name
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34252 entries, 0 to 34251
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tag_id    34252 non-null  int64 
 1   tag_name  34252 non-null  object
dtypes: int64(1), object(1)
memory usage: 535.3+ KB
None


In [9]:
#Data cleaning process

#Remove duplicates and rows with NAN values

books_colns = ["id", "book_id", "original_title", "title", "average_rating", "ratings_count"]        #Get required columns
#books[books_colns].info()
books_df = books[books_colns].drop_duplicates(subset=["original_title"]).dropna()
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9274 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              9274 non-null   int64  
 1   book_id         9274 non-null   int64  
 2   original_title  9274 non-null   object 
 3   title           9274 non-null   object 
 4   average_rating  9274 non-null   float64
 5   ratings_count   9274 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 507.2+ KB


In [10]:
books_df.isna().any()

id                False
book_id           False
original_title    False
title             False
average_rating    False
ratings_count     False
dtype: bool

In [11]:
ratings_df = ratings.drop_duplicates(subset=["book_id", "user_id"])
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 979478 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  979478 non-null  int64
 1   user_id  979478 non-null  int64
 2   rating   979478 non-null  int64
dtypes: int64(3)
memory usage: 29.9 MB


In [12]:
ratings_df["rating"].value_counts()

4    356620
5    292068
3    248144
2     63116
1     19530
Name: rating, dtype: int64

In [13]:
#To make good recommendation, we should choose users who rated atleast 20 times and books have been rated atleast 20 times

book_rating_count = ratings_df["book_id"].value_counts()                                                   #Book ratings count
ratings_df1 = ratings_df[ratings_df["book_id"].isin(book_rating_count[book_rating_count >= 20].index)]     #Books with rating counts >= 20

user_rating_count = ratings_df1["user_id"].value_counts()                                                  #User ratings count
ratings_df2 = ratings_df1[ratings_df1["user_id"].isin(user_rating_count[user_rating_count >= 20].index)]   #Users with ratings >= 20

In [14]:
ratings_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 720208 entries, 0 to 981752
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  720208 non-null  int64
 1   user_id  720208 non-null  int64
 2   rating   720208 non-null  int64
dtypes: int64(3)
memory usage: 22.0 MB


In [15]:
#Create rating matrix

rating_matrix = ratings_df2.pivot(index="book_id", columns="user_id", values="rating").fillna(0)

In [16]:
rating_matrix.head()

user_id,7,19,23,27,35,40,41,46,47,49,52,75,82,87,89,107,111,113,116,118,119,124,143,145,146,148,153,158,164,173,178,193,194,202,206,207,208,209,213,215,...,53218,53238,53241,53244,53245,53251,53256,53266,53268,53275,53279,53281,53286,53288,53291,53292,53293,53295,53306,53318,53331,53332,53333,53337,53339,53347,53348,53352,53364,53366,53372,53373,53378,53381,53388,53401,53403,53409,53411,53413
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#rating_matrix.values
sparse_matrix = csr_matrix(rating_matrix.values)
sparse_matrix

<9997x14612 sparse matrix of type '<class 'numpy.float64'>'
	with 720208 stored elements in Compressed Sparse Row format>

In [20]:
#Use K-NN model

model = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20, n_jobs=-1)
model.fit(sparse_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [39]:
def make_recommendation(model_knn, data, mapper, index, book, n_recommendations):
    
    # fit
    model_knn.fit(data)
    
    # get input movie index
    print("Input book:", book)
    
    distances, indices = model_knn.kneighbors(data[index], n_neighbors=n_recommendations+1)
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    
    # book mapper
    book_mapper = {v: k for k, v in mapper.items()}

    # print recommendations
    print("\nHere are the few recommended books for you: \n")
    
    for i, (idx, dist) in enumerate(raw_recommends):
        if idx not in book_mapper.keys():
            continue
        print('{0}: {1}'.format(i+1, book_mapper[idx]))
        

In [40]:
#my_book = "Harry Potter and the Chamber of Secrets"
print("Enter your book: ")
my_book = input().strip()
book_index = books_df[books_df["original_title"] == "Harry Potter and the Chamber of Secrets"].index[0]
indices = pd.Series(books_df.index, index=books_df['original_title'])

Enter your book: 
Harry Potter and the Chamber of Secrets


In [41]:
make_recommendation(
    model_knn=model,
    data=sparse_matrix,
    book=my_book,
    mapper=indices,
    index=book_index,
    n_recommendations=10)

Input book: Harry Potter and the Chamber of Secrets

Here are the few recommended books for you: 

1: The Return of the King
2: Mockingjay
3: The Da Vinci Code
4: Catching Fire
5: Harry Potter and the Philosopher's Stone
6: Harry Potter and the Deathly Hallows
7: Harry Potter and the Half-Blood Prince
8: Harry Potter and the Order of the Phoenix
9: Harry Potter and the Goblet of Fire
10: Harry Potter and the Prisoner of Azkaban
