In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the three CSV tables (books, users, ratings) from the dataset folder
books, users, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Books Dataset/Books.csv',
        'Books Dataset/Users.csv',
        'Books Dataset/Ratings.csv',
    )
)

FileNotFoundError: [Errno 2] No such file or directory: 'books.csv'

In [None]:
# Preview the first rows of the books table
books.head()

In [None]:
# Preview the first rows of the users table
users.head()

In [None]:
# Preview the first rows of the ratings table
ratings.head()

In [None]:
# Print (rows, columns) for books, ratings and users, in that order
for table in (books, ratings, users):
    print(table.shape)

In [None]:
# Number of missing values in each column of the books table
books.isna().sum()

In [None]:
# Number of missing values in each column of the users table
users.isna().sum()

In [None]:
# Number of missing values in each column of the ratings table
ratings.isna().sum()

In [None]:
# Number of fully duplicated rows in the books table
books.duplicated().sum()

In [None]:
# Number of fully duplicated rows in the ratings table
ratings.duplicated().sum()

In [None]:
# Number of fully duplicated rows in the users table
users.duplicated().sum()

## Popularity Based Recommender System

#### We will list the 50 books with the highest average rating, considering only books that have at least 250 ratings

In [None]:
# Join every rating to the metadata of the book it refers to, matching rows on ISBN

ratings_with_name = pd.merge(ratings, books, on='ISBN')

In [None]:
# Number of ratings per book title, stored in the column num_ratings.
# 'Book-Rating' is selected before aggregating so that only that column is counted
# (instead of counting every merged column and discarding all but one).
# reset_index(name=...) turns the resulting Series into a two-column DataFrame
# and names the value column in one step, so no in-place rename is needed.

num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num_ratings')
)
num_rating_df

In [None]:
# Average rating per book title, stored in the column avg_rating.
# 'Book-Rating' is selected before calling mean(): averaging the whole merged frame
# would also try to average its text columns (e.g. Book-Author, Image-URL-M),
# which raises TypeError on pandas >= 2.0.

avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_rating')
)
avg_rating_df

In [None]:
# Combine the per-title rating count and average rating into one table, matched on Book-Title

popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

In [None]:
# Keep titles with at least 250 ratings, order them by average rating (best first)
# and retain the first 50

popular_df = (
    popular_df.loc[popular_df['num_ratings'] >= 250]
    .sort_values(by='avg_rating', ascending=False)
    .iloc[:50]
)

In [None]:
# Attach author and cover image to each popular title. The same title can occur on
# several rows of `books`, so only the first match per title is kept.
# Only the three popularity columns are taken from popular_df before merging. This
# keeps the cell re-runnable: otherwise a second run would merge the already-added
# author/image columns again, yielding suffixed duplicates and a KeyError on selection.

popular_df = (
    popular_df[['Book-Title', 'num_ratings', 'avg_rating']]
    .merge(books, on='Book-Title')
    .drop_duplicates('Book-Title')
    [['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_rating']]
)

In [None]:
# Display the final popularity table
popular_df

## Collaborative Filtering Based Recommender System

#### Our approach: keep only users who have rated more than 200 books, and only books that have at least 50 ratings from those users, so that the model's results are more accurate

In [None]:
# Keep only users with more than 200 ratings, to avoid outliers

ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
x = ratings_per_user > 200
selected_users = x[x].index

In [None]:
# Restrict the merged ratings to rows written by the selected users

filtered_rating = ratings_with_name.loc[
    lambda frame: frame['User-ID'].isin(selected_users)
]

In [None]:
# Keep only titles with at least 50 ratings from the selected users,
# to avoid outliers and get more accurate results

ratings_per_title = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
y = ratings_per_title >= 50
famous_books = y[y].index

In [None]:
# Ratings that involve both a selected user and a famous book
final_ratings = filtered_rating.loc[
    lambda frame: frame['Book-Title'].isin(famous_books)
]

In [None]:
# Book-Title x User-ID matrix of ratings; pairs with no rating are left as NaN here
pt = pd.pivot_table(
    final_ratings,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
)

In [None]:
# Replace missing (title, user) entries with 0 so every row is a complete numeric vector
pt = pt.fillna(0)

In [None]:
# Display the filled title-by-user rating matrix
pt

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity computed from pt, whose rows are indexed by Book-Title;
# recommend() below looks up a book's scores by its row position in pt
similarity_scores = cosine_similarity(pt)

In [None]:
# Shape of the similarity matrix
similarity_scores.shape

In [None]:
def recommend(book_name, top_n=4):
    """Return the books most similar to ``book_name``.

    Similarity is read from the module-level ``similarity_scores`` matrix, whose
    rows and columns follow the order of ``pt.index``. Title, author and image
    URL of each recommendation are looked up in ``books``.

    Parameters
    ----------
    book_name : str
        Title to find neighbours for; must be present in ``pt.index``.
    top_n : int, default 4
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    list of list
        One ``[title, author, image_url]`` entry per recommended book, ordered
        from most to least similar. An entry is empty if the title is not
        found in ``books``.

    Raises
    ------
    IndexError
        If ``book_name`` is not in ``pt.index``.
    """
    # Row position of the requested title in the similarity matrix
    matches = np.where(pt.index == book_name)[0]
    if len(matches) == 0:
        raise IndexError(f"{book_name!r} is not present in the rating matrix index")
    index = matches[0]

    # Exclude the query book by position instead of assuming it sorts first:
    # another title with an identical score (or an all-zero row) can otherwise
    # push the query into the results and drop a genuine neighbour.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    similar_items = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _score in similar_items:
        # A title may occur on several rows of `books`; keep the first one only
        matched = books[books['Book-Title'] == pt.index[position]].drop_duplicates('Book-Title')

        item = []
        item.extend(list(matched['Book-Title'].values))
        item.extend(list(matched['Book-Author'].values))
        item.extend(list(matched['Image-URL-M'].values))

        data.append(item)

    return data

In [None]:
# Example call: recommendations for the title '1984'
recommend('1984')

In [None]:
import pickle

# Persist the popularity table. The context manager closes the file handle
# (and flushes the data) even if pickle.dump raises.
with open('popular.pkl', 'wb') as popular_file:
    pickle.dump(popular_df, popular_file)

In [None]:
# Show the books table with one row per title. The result is only displayed,
# not assigned, so `books` itself still contains the duplicate titles.
books.drop_duplicates('Book-Title')

In [None]:
# Persist the objects recommend() relies on. Each file is opened in a context
# manager so its handle is closed (and the data flushed) even if pickle.dump raises.
for pickle_name, artifact in (
    ('pt.pkl', pt),
    ('books.pkl', books),
    ('similarity_scores.pkl', similarity_scores),
):
    with open(pickle_name, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)