# Book Recommendation Model

In [1]:
import ast
import pickle
from collections import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the three raw tables of the dataset.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# raised at this read_csv call (presumably the mixed-dtype DtypeWarning).
books = pd.read_csv('books.csv', low_memory=False)
ratings = pd.read_csv('ratings.csv')
users = pd.read_csv('users.csv')

# Every CSV carries a leftover "Unnamed: 0" column (presumably a row index
# written out by an earlier export - TODO confirm); it is not used anywhere.
books = books.drop(columns=['Unnamed: 0'])
ratings = ratings.drop(columns=['Unnamed: 0'])
users = users.drop(columns=['Unnamed: 0'])

# Discard books with any missing field so later title / cover lookups never hit NaN.
books = books.dropna()

  books = pd.read_csv('books.csv')


## Models

### Popularity Based Approach

The formula used for calculating the Top Rated 250 Titles (applied here to books instead of movies) gives a true Bayesian estimate:

$$WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

where:

- $R$ = average rating of the book (mean)
- $v$ = number of ratings the book has received
- $m$ = minimum number of ratings required to be listed in the top list
- $C$ = the mean rating across the whole dataset

In [3]:
# Keep only books whose ISBN is exactly 10 characters long. This is a length
# check, not a digits-only check: ISBNs such as "043935806X" end in a letter.
# The vectorized .str accessor replaces a per-element Python lambda.
books = books[books["ISBN"].str.len() == 10]

unique_ISBN = set(books["ISBN"].unique())

# Keep only ratings that refer to a book still present in `books`.
ratings = ratings[ratings["ISBN"].isin(unique_ISBN)]

# Drop individual ratings whose value is 0 (presumably 0 marks an interaction
# without an explicit score - TODO confirm against the dataset description).
ratings = ratings[ratings["bookRating"] != 0]


In [4]:
# Build one row per book with its mean rating and the number of ratings it got.
# A single named aggregation replaces three separate passes over the same groupby.
avg_rating = (
    ratings.groupby("ISBN")["bookRating"]
    .agg(avg_rating="mean", num_ratings="count")
    .reset_index()  # turn the ISBN group key back into a regular column
    .reset_index()  # add the positional "index" column that a later cell drops
)

# Preview: most-rated books first (does not modify avg_rating).
avg_rating.sort_values("num_ratings", ascending=False)


Unnamed: 0,index,ISBN,avg_rating,num_ratings
21945,21945,0316666343,8.185290,707
117742,117742,0971880107,4.390706,581
38819,38819,0385504209,8.435318,487
18058,18058,0312195516,8.182768,383
4131,4131,0060928336,7.887500,320
...,...,...,...,...
63115,63115,0553100130,8.000000,1
63109,63109,0553099817,8.000000,1
63108,63108,0553099809,7.000000,1
63106,63106,0553099744,3.000000,1


In [5]:
# weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C
# where:

# R = average rating of the book (mean)
# v = number of ratings the book has received
# m = minimum number of ratings required to be listed
# C = the mean rating across all ratings

m = avg_rating["num_ratings"].quantile(0.99) # 99th percentile of the per-book rating counts
C = ratings["bookRating"].mean()

def weighted_rating(x, m = m, C=C):
    """Return the Bayesian weighted rating for one book (or a frame of books).

    Parameters
    ----------
    x : mapping-like with "num_ratings" and "avg_rating" entries
        A row of the rating table, or the whole table for a vectorized result.
    m : number of ratings that acts as the weight of the global prior.
    C : global mean rating used as the prior.

    Returns
    -------
    (v / (v + m)) * R + (m / (m + v)) * C, with v = x["num_ratings"] and
    R = x["avg_rating"].
    """
    votes = x["num_ratings"]
    mean_score = x["avg_rating"]
    own_weight = votes / (votes + m)      # share contributed by the book's own mean
    prior_weight = m / (m + votes)        # share contributed by the global mean
    return (own_weight * mean_score) + (prior_weight * C)


# weighted_rating only does column-wise arithmetic on "num_ratings" and
# "avg_rating", so it can be evaluated on the whole frame at once instead of
# row by row through DataFrame.apply(axis=1).
avg_rating["weighted_rating"] = weighted_rating(avg_rating)


In [6]:
# Rank books by weighted rating, best first, and remove the helper "index"
# column. errors="ignore" keeps the cell re-runnable: on a second run the
# column is already gone and a plain drop would raise KeyError.
avg_rating = (
    avg_rating
    .sort_values("weighted_rating", ascending=False)
    .drop(columns=["index"], errors="ignore")
)

In [7]:
# First ten rows of the table, which was sorted by weighted rating in the previous cell.
avg_rating.head(10)

Unnamed: 0,ISBN,avg_rating,num_ratings,weighted_rating
46134,0439139597,9.262774,137,9.018884
24538,0345339738,9.402597,77,8.980597
46424,043935806X,9.033981,206,8.887132
46124,0439136369,9.082707,133,8.860129
68897,059035342X,8.939297,313,8.845817
46123,0439136350,9.035461,141,8.830547
50044,0446310786,8.943925,214,8.811094
24537,0345339711,9.120482,83,8.785423
68896,0590353403,8.983193,119,8.755526
46489,0439425220,9.869565,23,8.724261


In [8]:
# Save the ranked table to avg_rating.csv; index=False leaves the row index out of the file.
avg_rating.to_csv("avg_rating.csv", index=False)

#### book covers of top rated books according to the model

In [9]:
# ISBNs of the ten books with the highest weighted rating.
by_weighted_rating = avg_rating.sort_values("weighted_rating", ascending=False)
l = by_weighted_rating["ISBN"].head(10).values

In [10]:
# Render the cover image followed by the title for every ISBN in `l`.

from IPython.display import Image

for isbn in l:
    # Look the book up once and reuse the matching rows for both fields.
    book_rows = books[books["ISBN"] == isbn]
    display(Image(url=book_rows["imageUrlM"].values[0]))
    print(book_rows["bookTitle"].values[0])

Harry Potter and the Goblet of Fire (Book 4)


The Return of the King (The Lord of the Rings, Part 3)


Harry Potter and the Order of the Phoenix (Book 5)


Harry Potter and the Prisoner of Azkaban (Book 3)


Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))


Harry Potter and the Prisoner of Azkaban (Book 3)


To Kill a Mockingbird


The Two Towers (The Lord of the Rings, Part 2)


Harry Potter and the Sorcerer's Stone (Book 1)


Harry Potter and the Chamber of Secrets Postcard Book


### Collaborative Filtering

```A recommendation technique that leverages the collective behavior and preferences of users to make personalized recommendations.```

Steps involved in collaborative filtering:

1. Data representation: Create a table with users as rows and items (ISBNs) as columns to capture user-item interactions or ratings.

2. Standardization: Mean-center and range-scale the ratings to remove biases and bring them to a common scale (in this notebook the scaling is applied per book, i.e. per ISBN column).

3. Similarity calculation: Compute item-item similarity using metrics like cosine similarity based on user ratings or interactions.

4. Recommendation generation: Find the most similar items to a given item and recommend them to users who have interacted with the original item.

```Collaborative filtering taps into the idea that users with similar tastes for certain items are likely to have similar tastes for other items, providing personalized recommendations based on user behavior and preferences.```


In [11]:
# ISBNs of the 600 books that received the largest number of ratings.

by_popularity = avg_rating.sort_values("num_ratings", ascending=False)
top_600 = by_popularity["ISBN"].head(600).values

In [12]:
# Convert to a set; it is reused later when the book list is written to disk.
top_600 = set(top_600)

# Keep only the ratings that belong to one of the top-600 books.
is_top_600 = ratings["ISBN"].isin(top_600)
new_ratings = ratings[is_top_600]

#### To keep the computation manageable, only the 600 most-rated books are used

In [13]:
# Pivot into a user-item matrix: one row per userID, one column per ISBN, and
# each cell holding the rating that user gave that book (missing where the
# user has not rated the book; filled with 0 in the next cell).

new_ratings = new_ratings.pivot(index="userID", columns="ISBN", values="bookRating")

In [14]:
# Treat "not rated" as 0 so every user-book cell holds a number.
new_ratings = new_ratings.fillna(0)

In [15]:
# Preview the first rows of the user-item matrix.
new_ratings.head()

ISBN,002542730X,0060096195,006016848X,0060173289,0060175400,0060188731,0060199652,0060391626,0060392452,0060502258,...,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1576737330,1592400876,1844262553,1878424319
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Mean-centre a vector of ratings and divide it by its range, so the result has
# mean 0 and every value lies between -1 and 1.
# NOTE: the call below uses DataFrame.apply with its default axis=0, so despite
# the parameter name each `row` passed in is one ISBN *column* of the matrix.

def standardize(row):
    """Return `row` mean-centred and scaled by (max - min).

    Parameters
    ----------
    row : pandas.Series of numeric ratings.

    Returns
    -------
    pandas.Series of the same length. When all values are identical the range
    is 0; the result is then all zeros instead of the NaNs a 0/0 division
    would produce.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # Constant vector: every centred value is (numerically) zero already.
        return centered * 0.0
    return centered / spread

# DataFrame.apply defaults to axis=0, so standardize runs once per ISBN column.
new_ratings = new_ratings.apply(standardize)

In [17]:
# Display the standardized user-item matrix (pandas truncates the rendering).
new_ratings

ISBN,002542730X,0060096195,006016848X,0060173289,0060175400,0060188731,0060199652,0060391626,0060392452,0060502258,...,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1576737330,1592400876,1844262553,1878424319
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
16,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
26,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
42,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
51,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278832,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
278836,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
278843,-0.002776,-0.001991,-0.001829,0.897926,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148
278844,-0.002776,-0.001991,-0.001829,-0.002074,-0.003021,-0.001645,-0.002291,-0.002896,-0.005696,-0.005959,...,-0.001959,-0.002005,-0.003506,-0.001894,-0.003206,-0.002046,-0.001474,-0.002199,-0.001986,-0.002148


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
# Item-item similarity matrix: entry (i, j) is the cosine similarity between the
# standardized rating vectors of ISBN i and ISBN j.

item_similarity = cosine_similarity(new_ratings.T)  # transpose so that each row passed in is one ISBN (book) rather than one user

In [19]:
# Label both axes of the similarity matrix with the ISBNs so that books can be
# looked up by code instead of by position.
isbn_labels = new_ratings.columns
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=isbn_labels,
    columns=isbn_labels,
)
item_similarity_df.head()

ISBN,002542730X,0060096195,006016848X,0060173289,0060175400,0060188731,0060199652,0060391626,0060392452,0060502258,...,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1576737330,1592400876,1844262553,1878424319
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,1.0,-0.002856,0.01136,-0.002957,0.021552,0.006839,0.010143,0.00824,0.020944,0.009225,...,0.010207,0.026693,0.026751,0.036436,0.034428,0.023493,0.027385,0.023338,0.008378,-0.002976
0060096195,-0.002856,1.0,-0.002441,-0.002485,0.013087,-0.002175,-0.00254,-0.002831,0.01806,-0.004196,...,0.038404,-0.002372,0.008079,-0.002411,-0.003304,-0.002436,-0.002143,0.024936,-0.002331,-0.002501
006016848X,0.01136,-0.002441,1.0,-0.002527,0.019162,0.016128,0.02303,0.032253,0.01889,0.03876,...,0.022963,-0.002412,0.017499,0.012254,0.013313,0.044284,-0.002179,0.020884,0.019325,0.031241
0060173289,-0.002957,-0.002485,-0.002527,1.0,0.052795,-0.002252,0.057423,-0.002931,0.019747,0.023578,...,-0.002476,-0.002456,0.011201,-0.002496,0.004897,0.019434,-0.002218,-0.002561,-0.002413,-0.002589
0060175400,0.021552,0.013087,0.019162,0.052795,1.0,0.043631,0.092599,0.014578,0.03177,0.011794,...,0.037567,-0.002855,0.012699,0.008809,0.0093,0.015468,0.028994,0.015656,0.014471,-0.00301


In [20]:
def get_similar_books(book_code, user_rating, similarity_df=None, neutral_rating=5):
    """Score every book by its similarity to `book_code`, weighted by the rating.

    Parameters
    ----------
    book_code : ISBN of the rated book; must be a column of the similarity table.
    user_rating : rating the user gave that book.
    similarity_df : square item-item similarity DataFrame. Defaults to the
        notebook-level `item_similarity_df`.
    neutral_rating : rating treated as "no opinion". Ratings above it push
        similar books up, ratings below it push them down. Defaults to 5.

    Returns
    -------
    pandas.Series indexed by ISBN, sorted from highest to lowest score.

    Raises
    ------
    KeyError if `book_code` is not a column of the similarity table.
    """
    if similarity_df is None:
        similarity_df = item_similarity_df
    scores = similarity_df[book_code] * (user_rating - neutral_rating)
    return scores.sort_values(ascending=False)


##### demo

In [21]:
l = get_similar_books("059035342X", 10).head(10)

# Show the cover of the query book first, then cover and title of each of the
# ten highest-scoring books.
query_rows = books[books["ISBN"] == "059035342X"]
display(Image(url=query_rows["imageUrlM"].values[0]))
for isbn in l.index:
    book_rows = books[books["ISBN"] == isbn]
    display(Image(url=book_rows["imageUrlM"].values[0]))
    print(book_rows["bookTitle"].values[0])


Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))


Harry Potter and the Chamber of Secrets (Book 2)


Harry Potter and the Prisoner of Azkaban (Book 3)


Harry Potter and the Goblet of Fire (Book 4)


Harry Potter and the Goblet of Fire (Book 4)


Harry Potter and the Order of the Phoenix (Book 5)


Harry Potter and the Prisoner of Azkaban (Book 3)


Harry Potter and the Chamber of Secrets (Book 2)


Anne of Green Gables (Anne of Green Gables Novels (Paperback))


Anne of Avonlea (Anne of Green Gables Novels (Paperback))


#### for multiple inputs by a user

In [22]:
def get_recommendations(book_ratings, similarity_df=None, top_n=10, neutral_rating=5):
    """Recommend books from several rated books using item-item similarity.

    For every rated book, its similarity column is multiplied by
    (rating - neutral_rating); the weighted columns are summed per ISBN and
    the highest-scoring books the user has not rated are returned.

    Parameters
    ----------
    book_ratings : dict mapping ISBN -> rating given by the user.
    similarity_df : square item-item similarity DataFrame. Defaults to the
        notebook-level `item_similarity_df`.
    top_n : maximum number of ISBNs to return (default 10).
    neutral_rating : rating treated as "no opinion" (default 5).

    Returns
    -------
    list of at most `top_n` ISBNs, best first; empty when `book_ratings` is empty.

    Raises
    ------
    KeyError if a rated ISBN is not a column of the similarity table.

    NOTE: a later cell of this notebook defines another function with the same
    name, which replaces this one once that cell has been run.
    """
    if similarity_df is None:
        similarity_df = item_similarity_df

    # Collect the weighted similarity columns and concatenate once, instead of
    # growing a dtype-less empty Series inside the loop.
    weighted_columns = [
        similarity_df[book] * (rating - neutral_rating)
        for book, rating in book_ratings.items()
    ]
    if not weighted_columns:
        return []

    totals = (
        pd.concat(weighted_columns)
        .groupby(level=0)
        .sum()
        .sort_values(ascending=False)
    )

    # Never recommend a book the user has already rated.
    unseen = [isbn for isbn in totals.index if isbn not in book_ratings]
    return unseen[:top_n]




#### demo

In [23]:
d = {
    "059035342X": 9,
    "0345370775": 10,
    "044021145X": 8,
    "0440214041": 10,
    "0440211727": 7,
}

# First show the books the user rated ...
for rated_isbn in d:
    rated_rows = books[books["ISBN"] == rated_isbn]
    display(Image(url=rated_rows["imageUrlM"].values[0]))
    print(rated_rows["bookTitle"].values[0])

print("-------------------------------------------")

# ... then the books recommended from those ratings.
l = get_recommendations(d)

for recommended_isbn in l:
    recommended_rows = books[books["ISBN"] == recommended_isbn]
    display(Image(url=recommended_rows["imageUrlM"].values[0]))
    print(recommended_rows["bookTitle"].values[0])



Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))


Jurassic Park


The Firm


The Pelican Brief


A Time to Kill
-------------------------------------------


The Client


Harry Potter and the Chamber of Secrets (Book 2)


Silence of the Lambs


The Chamber


The Rainmaker


Postmortem


Harry Potter and the Prisoner of Azkaban (Book 3)


The Runaway Jury


Harry Potter and the Goblet of Fire (Book 4)


The Lost World


In [24]:
# Write one "<title> <ISBN>" line per book used by the collaborative-filtering
# model (the top-600 set).
# - the context manager closes the file even if a title lookup fails;
# - sorted() makes the line order reproducible, since top_600 is a set;
# - utf-8 matches the other text files written by this notebook.
with open("books.txt", "w", encoding="utf-8") as f:
    for i in sorted(top_600):
        name = books[books["ISBN"] == i]["bookTitle"].values[0]
        f.write(name + " " + i + "\n")

#### demo on a real user

In [25]:
# ratings by Samyak Deshpande (my dear friend)

d = {
    "0439139597": 7,
    "0345391802": 8,
    "0590353403": 6,
    "0439064864": 3,
    "0316769487": 8,
    "0439136350": 5,
    "059035342X": 8,
    "0684801523": 10,
    "0439136369": 5,
    "043935806X": 4,
    "0439064872": 3,
    "0804111359": 9,
    "0451526341": 9,
    "0156628708": 9,
    "0451524934": 7
}

# Show cover and title of every book that was rated.
for rated_isbn in d:
    rated_rows = books[books["ISBN"] == rated_isbn]
    display(Image(url=rated_rows["imageUrlM"].values[0]))
    print(rated_rows["bookTitle"].values[0])

Harry Potter and the Goblet of Fire (Book 4)


The Hitchhiker's Guide to the Galaxy


Harry Potter and the Sorcerer's Stone (Book 1)


Harry Potter and the Chamber of Secrets (Book 2)


The Catcher in the Rye


Harry Potter and the Prisoner of Azkaban (Book 3)


Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))


The Great Gatsby


Harry Potter and the Prisoner of Azkaban (Book 3)


Harry Potter and the Order of the Phoenix (Book 5)


Harry Potter and the Chamber of Secrets (Book 2)


Secret History


Animal Farm


Mrs Dalloway


1984


In [26]:
l = get_recommendations(d)

# Show cover and title of every recommended book.
for recommended_isbn in l:
    recommended_rows = books[books["ISBN"] == recommended_isbn]
    display(Image(url=recommended_rows["imageUrlM"].values[0]))
    print(recommended_rows["bookTitle"].values[0])

# Samyak was satisfied with the recommendations

Lord of the Flies


To Kill a Mockingbird


The World According to Garp


The Stone Diaries


Anne of Green Gables (Anne of Green Gables Novels (Paperback))


Tis: A Memoir


Catch 22


Lolita (Vintage International)


The Vampire Lestat (Vampire Chronicles, Book II)


The Gunslinger (The Dark Tower, Book 1)


#### code

In [27]:
# Persist the item-item similarity table so it can be reloaded without
# recomputing it. The context manager closes the file handle, which the bare
# open(...) passed straight to pickle.dump never did. `pickle` is already
# imported in the notebook's first cell.
with open("item_similarity_df.pkl", "wb") as pkl_file:
    pickle.dump(item_similarity_df, pkl_file)




### Content Based Recommendations 

1. Created a numpy array called `main_matrix` with one row per book and one column per unique genre (52478 × 982 for this dataset) to hold each book's rating under every genre it belongs to.

2. Populated `main_matrix` by assigning average ratings to the corresponding positions based on the book's genres.

3. Defined the `get_recommendations(s)` function to generate book recommendations based on the user's genre preferences.

4. Initialized a numpy array called `like` with one zero per genre (shape: (982,) for this dataset) and set the entries of the user's preferred genres to 1.

5. Calculated recommendation scores by performing matrix multiplication between `main_matrix` and `like`.

6. Sorted the recommendation scores and obtained the top 5 book recommendations based on the highest scores.

7. Printed the titles of the top 5 recommended books from the original dataset.


In [None]:

# --- Load the genre-tagged catalogue -----------------------------------------
data = pd.read_csv("books_1.Best_Books_Ever.csv")

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `data` (which triggers SettingWithCopyWarning).
df = data[["isbn", "genres", "rating"]].copy()

# Each "genres" cell is parsed from text into a Python list. ast.literal_eval
# only accepts literals, so file content cannot execute code the way eval() can.
df["genres"] = df["genres"].apply(ast.literal_eval)

# --- Genre vocabulary ---------------------------------------------------------
s = set()
for genre_list in df["genres"]:
    s.update(genre_list)

# genre -> column index. Sorting makes the mapping (and genres_order.txt)
# identical on every run; plain set iteration order is not reproducible.
d = {genre: column for column, genre in enumerate(sorted(s))}

# One genre per line, in column order, so the matrix columns can be decoded later.
with open("genres_order.txt", "w", encoding="utf-8") as f:
    for genre in d:
        f.write(genre + "\n")

# --- Book x genre matrix ------------------------------------------------------
# main_matrix[b, g] is the rating of book b if it is tagged with genre g, else 0.
# The shape is derived from the data instead of being hard-coded.
main_matrix = np.zeros((len(df), len(d)))
for row_idx, (genre_list, rating) in enumerate(zip(df["genres"], df["rating"])):
    for genre in genre_list:
        main_matrix[row_idx][d[genre]] = rating


def get_recommendations(s, top_n=5):
    """Print the titles of the books that best match the given genres.

    A book's score is the sum of its `main_matrix` entries over the requested
    genres. Ties are broken in favour of the larger row index.

    Parameters
    ----------
    s : iterable of genre names. Names that are not in the vocabulary `d` are
        ignored (previously they were silently mapped to column 0).
    top_n : number of titles to print (default 5).

    NOTE: this definition replaces the collaborative-filtering function of the
    same name defined in an earlier cell.
    """
    like = np.zeros(main_matrix.shape[1])
    for genre in s:
        if genre in d:
            like[d[genre]] = 1

    ans = np.dot(main_matrix, like)

    # (score, row) pairs sorted descending.
    ranked = sorted(
        ((score, row_idx) for row_idx, score in enumerate(ans)),
        reverse=True,
    )
    for _, row_idx in ranked[:top_n]:
        print(data.loc[row_idx, "title"])


get_recommendations(["Fiction", "Romance", "Magic","Vampires","Action"])

# --- Persist artefacts --------------------------------------------------------
with open('genre_matrix.pkl', 'wb') as file:
    pickle.dump(main_matrix, file)

# One title per line, in the same row order as main_matrix.
with open("booktitle_chatbot.txt","w",encoding="utf-8") as file:
    for title in data["title"]:
        file.write(title + "\n")
    
    
