In [1]:
import numpy as np
import pandas as pd
import warnings

In [2]:
warnings.filterwarnings('ignore')
books = pd.read_csv('Books.csv',sep=';',on_bad_lines='skip',encoding='latin-1')

In [3]:
# Books.head()

In [4]:
books.shape

(271360, 8)

In [5]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [7]:
books = books[['ISBN','Book-Title','Book-Author','Year-Of-Publication', 'Publisher']]
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial


In [8]:
books.rename(columns={'Book-Title':'title' , 'Book-Author':'author','Year-Of-Publication':'year','Publisher':'pb'},inplace=True)

In [9]:
books.head(2)
# books.isnull().sum()

Unnamed: 0,ISBN,title,author,year,pb
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [10]:
users = pd.read_csv('Users.csv',sep=';',on_bad_lines='skip',encoding='latin-1')

In [11]:
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [12]:
users.shape

(278858, 3)

In [13]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [14]:
users.rename(columns={'User-ID':'user_id','Location':'location','Age':'age'},inplace=True)
users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [15]:
ratings = pd.read_csv('Book-Ratings.csv',sep=';',on_bad_lines='skip',encoding='latin-1')

In [16]:
ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [17]:
ratings.rename(columns={'User-ID':'user_id','Book-Rating':'rating'},inplace=True)

In [18]:
ratings.head(2)
# ratings.isnull().sum()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


### shape/size/length of data 

#### books

In [19]:
print(books.shape)
print(books.size)
print(len(books))
# 271360 books exist

(271360, 5)
1356800
271360


In [20]:
books.head(2)

Unnamed: 0,ISBN,title,author,year,pb
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


#### users

In [21]:
print(users.shape)
print(users.size)
print(len(users))
# 278858 users exist to read books

(278858, 3)
836574
278858


In [22]:
users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


#### ratings

In [23]:
print(ratings.shape)
print(ratings.size)
print(len(ratings))
# 1149780 ratings points earned by books from users 

(1149780, 3)
3449340
1149780


In [24]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


<h4>To build a recommender system we such users that rate books minimum 160 points mean so firts we have to get that who readbooks out 278858</h4>

In [25]:
# ratings.value_counts() to get every user ratings

# To get exact figuer of users that rate books mean neglect users that don't rate mean mean don't read books
ratings['user_id'].value_counts().shape

(105283,)

In [54]:
# get these users their rating is >=160
# X_rate_maximum = ratings['user_id'].value_counts()>=160
X_rate_maximum = ratings['user_id'].value_counts()>200
X_rate_maximum.value_counts()

count
True     899
False    252
Name: count, dtype: int64

<h3>Users who read books and rate are only 105283 and out of them on 1151 users rate >=160, So we will use them to build a recommender system.</h3>

In [55]:
X_rate_maximum

user_id
11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
172061    False
110746    False
173018    False
132173    False
186039    False
Name: count, Length: 1151, dtype: bool

In [56]:
# To get only true values
# X_rate_maximum = X_rate_maximum[X_rate_maximum].shape

# To get indexes of obtained users
Y = X_rate_maximum[X_rate_maximum].index
Y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       116122, 188951,  44296,  28634,  73681, 274808, 155916, 268622,  59727,
         9856],
      dtype='int64', name='user_id', length=899)

In [60]:
# To get those rows where our obtained usrs (>=160) rate
ratings = ratings[ratings['user_id'].isin(Y)]
ratings.shape

(526356, 3)

<h3>Note: only 1151 users rate 571450 of 1149780 </h3>

In [61]:
ratings.head(3)

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8


### Merge books and ratings table mean join

In [62]:
books_with_ratings = ratings.merge(books,on='ISBN')
books_with_ratings.shape

(487671, 7)

In [63]:
books_with_ratings.head(3)

Unnamed: 0,user_id,ISBN,rating,title,author,year,pb
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc


In [64]:
number_of_rating = books_with_ratings.groupby('title')['rating'].count().reset_index()

In [65]:
number_of_rating.rename(columns={'rating':'number of rating'},inplace=True)
number_of_rating.head(3)

Unnamed: 0,title,number of rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1


In [66]:
final_rating = books_with_ratings.merge(number_of_rating,on='title')
final_rating.shape

(487671, 8)

In [67]:
final_rating = final_rating[final_rating['number of rating']>50]

In [68]:
final_rating.shape


(59903, 8)

In [69]:
final_rating.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,pb,number of rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82


In [70]:
final_rating.drop_duplicates(['user_id','title'],inplace=True)
final_rating.shape

(57952, 8)

In [71]:
book_pivot = final_rating.pivot_table(columns='user_id',index='title',values='rating')

In [72]:
book_pivot.shape

(703, 888)

In [73]:
# To replace NaN with 0.0
book_pivot.fillna(0,inplace=True)
book_pivot.head(3)

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
from scipy.sparse import csr_matrix
book_sparse = csr_matrix(book_pivot)
type(book_sparse)
# book_sparse

scipy.sparse._csr.csr_matrix

In [75]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [76]:
model.fit(book_sparse)

In [92]:
distance , suggestions = model.kneighbors(book_pivot.iloc[444,:].values.reshape(1,-1),n_neighbors=6)

In [93]:
distance

array([[ 0.        , 27.33130074, 27.65863337, 28.3019434 , 28.47806173,
        28.86173938]])

In [94]:
suggestions

array([[444, 173, 507, 356, 269, 692]], dtype=int64)

In [95]:
book_pivot.index[444]

'Sole Survivor'

In [96]:
for i in range(len(suggestions[0])):
    print(book_pivot.index[suggestions[0][i]])
#     print('\n')

Sole Survivor
Exclusive
The Cradle Will Fall
No Safe Place
Invasion
Winter Moon


In [91]:
book_index = 237  # Example index
if book_index < len(book_pivot):
    distances, suggestions = model.kneighbors(book_pivot.iloc[book_index, :].values.reshape(1, -1), n_neighbors=6)
    print(f"Recommendations for {book_pivot.index[book_index]}:")

    for i in range(len(suggestions[0])):
        print(f"{i+1}: {book_pivot.index[suggestions[0][i]]} (Distance: {distances[0][i]:.2f})")
else:
    print("Book index out of range.")


Recommendations for Hemlock Bay:
1: Hemlock Bay (Distance: 0.00)
2: Exclusive (Distance: 21.77)
3: Jacob Have I Loved (Distance: 23.92)
4: No Safe Place (Distance: 24.66)
5: The Cradle Will Fall (Distance: 25.06)
6: Last Man Standing (Distance: 25.32)


In [99]:
np.where(book_pivot.index=='Animal Farm')[0][0]

47

In [123]:
def recommented_system(book_name):
    book_id = np.where(book_pivot.index==book_name)[0][0]
    distances , suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1),n_neighbors=6)
    
    for i in range(len(suggestions[0])):
        print(book_pivot.index[suggestions[0][i]])

In [124]:
recommented_system('Exclusive')

Sole Survivor
Exclusive
The Cradle Will Fall
No Safe Place
Invasion
Winter Moon
