### Book recommender system using clustering and collaborative filtering

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

Le jeu de données utilisé pour ce système de recommandation provient de Kaggle (réf. : https://www.kaggle.com/datasets/ra4u12/bookrecommendation/). Il est composé de 3 fichiers CSV :
*   La liste des livres "BX-Books" avec leurs informations : l'identifiant 'ISBN', le titre du livre 'Book-Title', l'auteur 'Book-Author', l'année de publication 'Year-Of-Publication', l'éditeur 'Publisher', et l'image de couverture du livre en trois tailles (S, M et L) : 'Image-URL-S', 'Image-URL-M' et 'Image-URL-L' respectivement.
*   La liste des utilisateurs "BX-Users" avec quelques informations : l'identifiant de l'utilisateur 'User-ID', l'adresse 'Location', l'âge de l'utilisateur 'Age'.
*   La liste des notes attribuées par certains utilisateurs aux livres, "BX-Book-Ratings", contenant : l'identifiant de l'utilisateur 'User-ID', l'identifiant du livre 'ISBN' et la note attribuée 'Book-Rating'.

In [6]:
# Load the BX-Books catalogue: semicolon-separated, latin-1 encoded;
# rows that cannot be parsed are skipped instead of aborting the read.
books = pd.read_csv(
    "data/BX-Books.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False,
)

In [7]:
# Explore columns
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [8]:
# data shape
books.shape

(271360, 8)

In [9]:
# Keep only the columns used downstream (the S and M cover sizes are dropped)
# and give them short lowercase names.
# Selecting and renaming in one chained expression returns a fresh DataFrame,
# so nothing is renamed in place on a slice of the raw frame (the pattern that
# triggers pandas' SettingWithCopyWarning).
books = books[
    ["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher", "Image-URL-L"]
].rename(
    columns={
        "ISBN": "isbn",
        "Book-Title": "title",
        "Book-Author": "author",
        "Year-Of-Publication": "year",
        "Publisher": "publisher",
        "Image-URL-L": "img_url",
    }
)

In [21]:
books.head()

Unnamed: 0,isbn,title,author,year,publisher,img_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [17]:
# Load the BX-Users table with the same parsing options as the other BX files
users = pd.read_csv(
    "data/BX-Users.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False,
)

In [18]:
# Explore columns
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [19]:
# data shape
users.shape

(278858, 3)

In [20]:
# Switch the BX-Users headers to short lowercase names
users = users.rename(columns={"User-ID": "id", "Location": "location", "Age": "age"})

In [22]:
users.head()

Unnamed: 0,id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [23]:
# Load the BX-Book-Ratings table (one row per user/ISBN rating)
ratings = pd.read_csv(
    "data/BX-Book-Ratings.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False,
)

In [25]:
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [26]:
# Switch the rating headers to short lowercase names; "id" is the user id
ratings = ratings.rename(columns={"User-ID": "id", "ISBN": "isbn", "Book-Rating": "rating"})

In [27]:
ratings.shape

(1149780, 3)

In [28]:
ratings.head()

Unnamed: 0,id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [32]:
# Flag the users (not the books) who have given more than 200 ratings:
# value_counts() on "id" counts rating rows per user id.
x = ratings["id"].value_counts() > 200
# Number of users above the threshold
x[x].shape

(899,)

In [31]:
# Ids of the users that passed the "> 200 ratings" filter (index of the True entries of x)
y = x[x].index
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='id', length=899)

In [36]:
# Restrict the ratings to rows written by the retained users, then report the new size
ratings = ratings.loc[ratings["id"].isin(y)]
ratings.shape

(526356, 3)

In [37]:
# Attach each rated book's metadata by joining the ratings with the catalogue on ISBN
books_with_ratings = pd.merge(ratings, books, on="isbn")
books_with_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,img_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [38]:
# Count how many rating rows each title received
num_ratings = (
    books_with_ratings
    .groupby("title")["rating"]
    .count()
    .reset_index()
)
num_ratings.head(10)

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1
5,Dark Justice,1
6,Deceived,1
7,Earth Prayers From around the World: 365 Pray...,3
8,Final Fantasy Anthology: Official Strategy Gu...,3
9,Flight of Fancy: American Heiresses (Zebra Ba...,1


In [39]:
# The "rating" column now holds a count, so name it accordingly
num_ratings = num_ratings.rename(columns={"rating": "num_of_rating"})
num_ratings.head()

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [40]:
# Add the per-title rating count to every rating row
books_with_ratings = pd.merge(books_with_ratings, num_ratings, on="title")
books_with_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,img_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


In [44]:
# Keep only the rows of titles that received at least 50 ratings.
# .copy() makes `data` an independent frame rather than a slice of
# books_with_ratings, so later modifications of `data` are unambiguous and do
# not raise SettingWithCopyWarning.
data = books_with_ratings[books_with_ratings['num_of_rating'] >= 50].copy()

In [None]:
# Keep a single row per (user id, title) pair; the first occurrence wins.
# The result is reassigned instead of mutating in place, because `data` was
# produced by filtering another frame and in-place edits on such a slice
# trigger SettingWithCopyWarning.
data = data.drop_duplicates(subset=["id", "title"])

In [46]:
data.shape

(59850, 9)

In [47]:
data.sample(10)

Unnamed: 0,id,isbn,rating,title,author,year,publisher,img_url,num_of_rating
78,269566,002542730X,7,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
54014,11676,0380789019,7,Neverwhere,Neil Gaiman,1998,Avon,http://images.amazon.com/images/P/0380789019.0...,72
54199,6575,0380977788,9,Coraline,Neil Gaiman,2002,HarperCollins,http://images.amazon.com/images/P/0380977788.0...,59
61239,274308,0553582526,0,Dust to Dust,TAMI HOAG,2002,Bantam,http://images.amazon.com/images/P/0553582526.0...,78
86683,40889,006101351X,0,The Perfect Storm : A True Story of Men Agains...,Sebastian Junger,1998,HarperTorch,http://images.amazon.com/images/P/006101351X.0...,163
57034,110973,0671021001,0,She's Come Undone (Oprah's Book Club),Wally Lamb,1998,Pocket,http://images.amazon.com/images/P/0671021001.0...,128
99106,78783,0312966970,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks,http://images.amazon.com/images/P/0312966970.0...,98
52617,182085,0330332775,0,Bridget Jones's Diary,Helen Fielding,1997,Picador (UK),http://images.amazon.com/images/P/0330332775.0...,277
35864,185233,044023669X,0,The Kiss,Danielle Steel,2002,Dell Publishing Company,http://images.amazon.com/images/P/044023669X.0...,76
56374,14521,0553250531,0,The Valley of Horses,JEAN M. AUEL,1984,Bantam,http://images.amazon.com/images/P/0553250531.0...,52


In [48]:
# Title x user rating matrix: one row per title, one column per user id
pivot_data = data.pivot_table(index="title", columns="id", values="rating")

In [49]:
pivot_data

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [51]:
# Fill every (title, user) cell that has no rating with 0.
# NOTE(review): the ratings data also contains explicit 0 values, so after this
# step "not rated" and "rated 0" can no longer be told apart.
pivot_data = pivot_data.fillna(0)

In [52]:
pivot_data

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Sparse (CSR) representation of the title x user matrix
book_sparse = csr_matrix(pivot_data)

In [54]:
# Fit a brute-force nearest-neighbours index on the sparse title x user matrix.
# No metric is passed, so scikit-learn's default distance is used.
# NOTE(review): per the sklearn docs that default is Minkowski with p=2 (Euclidean) — confirm for the installed version.
model = NearestNeighbors(algorithm = 'brute')
model.fit(book_sparse)

In [55]:
# Example query: the 6 nearest titles to the book stored at row 237 of the pivot table.
# iloc[[237]] keeps the row two-dimensional, giving the (1, n_users) shape kneighbors expects.
distance, suggestion = model.kneighbors(
    pivot_data.iloc[[237]].to_numpy(),
    n_neighbors=6,
)

In [56]:
distance

array([[ 0.        , 68.78953409, 69.5413546 , 72.64296249, 76.83098333,
        77.28518616]])

In [57]:
# Translate each row of neighbour positions into the matching titles
for neighbour_positions in suggestion:
    print(pivot_data.index[neighbour_positions])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [58]:
# Titles in pivot-table row order, i.e. the order of the rows the model was fitted on
books_name = pivot_data.index

In [101]:
# Persist the fitted model and its supporting tables for reuse outside the notebook.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
for artifact_path, artifact in (
    ('artifacts/model.pkl', model),
    ('artifacts/books_name.pkl', books_name),
    ('artifacts/data.pkl', data),
    ('artifacts/pivot_data.pkl', pivot_data),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [59]:
def recommend_book(book_name, n_neighbors=6):
    """Print the titles nearest to ``book_name`` in the rating pivot table.

    The function reads the notebook-level ``pivot_data`` (title x user rating
    table) and ``model`` (the fitted nearest-neighbours estimator).

    Parameters
    ----------
    book_name : str
        Title to look up; it must match an entry of ``pivot_data.index`` exactly.
    n_neighbors : int, default 6
        Number of titles to print. The query book is itself a row of the fitted
        matrix, so it normally appears as the first printed result.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pivot_data.index``.
    """
    matches = np.where(pivot_data.index == book_name)[0]
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup, but with a usable message.
        raise IndexError(f"Book {book_name!r} not found in pivot_data.index")
    book_id = matches[0]

    # kneighbors expects a 2-D array, hence the reshape to a single row.
    query = pivot_data.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    for title in pivot_data.index[suggestion[0]]:
        print(title)

In [60]:
recommend_book('1984')

1984
No Safe Place
A Civil Action
Foucault's Pendulum
Long After Midnight
Abduction
