# Book Rental Recommendation- Project

In [1]:
#importing the most used libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Read the books dataset and explore it

In [2]:
#reading the data
df_books= pd.read_csv('BX-Books.csv', encoding= "latin1")
df_users = pd.read_csv('BX-Users.csv', encoding= 'latin1')
df_ratings= pd.read_csv("BX-Book-Ratings.csv", encoding= 'latin1')

In [3]:
# Exploring the data (The type and quantity)

In [4]:
df_books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [5]:
df_ratings.head( )

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [6]:
df_users.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


### Clean up NaN values

In [7]:
df_books.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

In [8]:
df_ratings.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [9]:
df_users.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

In [10]:
df_users.count()

user_id     278859
Location    278858
Age         168096
dtype: int64

In [11]:
# Since there is negligible null values and not in any of the columns we required Null value treatment not required

### Take a quick look at the number of unique users and books

In [12]:
# Checking if all values in isbn (df_books) and user_id (df_users) are unique

In [13]:
df_books.isbn.count() == len(df_books.isbn.unique())

True

In [14]:
df_users.user_id.count() == len(df_users.user_id.unique())

True

In [15]:
df_books.isbn.count() , df_users.user_id.count(), df_ratings.rating.count()

(271379, 278859, 1048575)

### Create a merged DataFrame with popular books and more frequent readers

In [16]:
# merging ratings data with books data
df_merged = df_books.merge(right= df_ratings, on= 'isbn')

In [17]:
#merging merged data with users data
df_merged = df_merged.merge(right= df_users, on= 'user_id')

In [18]:
df_merged.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating,Location,Age
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,"stockton, california, usa",18.0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,"timmins, ontario, canada",
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,8,0,"timmins, ontario, canada",
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,8,0,"timmins, ontario, canada",
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,8,0,"timmins, ontario, canada",


In [19]:
df_merged.drop(columns=['Age', 'Location', 'publisher', 'book_title', 'book_author', 'year_of_publication'], inplace= True)

In [20]:
df_merged.sort_values(by= 'isbn', inplace= True)

In [21]:
df_merged

Unnamed: 0,isbn,user_id,rating
191355,000104687X,23902,6
29185,000104799X,166596,8
194792,000104799X,28204,7
110709,000123207X,198711,0
751354,000160418X,10067,7
...,...,...,...
397847,B000234N3A,100906,9
397844,B000234NC6,100906,0
573472,B00029DGGO,100088,0
292874,B0002JV9PY,179791,0


In [22]:
#Create a dataframe with only popular books
df_popular= df_merged.groupby('isbn').count().sort_values(by= 'rating', ascending= False)

In [23]:
df_popular.reset_index(inplace= True)

In [24]:
df_popular

Unnamed: 0,isbn,user_id,rating
0,971880107,2250,2250
1,316666343,1156,1156
2,385504209,809,809
3,312195516,663,663
4,60928336,659,659
...,...,...,...
256163,380729423,1,1
256164,380729474,1,1
256165,380729687,1,1
256166,380729822,1,1


In [25]:
df_merged.groupby('user_id').count().sort_values(by= 'rating', ascending= False)

Unnamed: 0_level_0,isbn,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
11676,11147,11147
198711,6456,6456
153662,5817,5817
98391,5779,5779
35859,5646,5646
...,...,...
107093,1,1
107100,1,1
107111,1,1
107117,1,1


In [26]:
df_popular = df_popular.loc[df_popular.rating >= 100, :]

In [27]:
df_popular

Unnamed: 0,isbn,user_id,rating
0,971880107,2250,2250
1,316666343,1156,1156
2,385504209,809,809
3,312195516,663,663
4,60928336,659,659
...,...,...,...
611,515124214,100,100
612,553580930,100,100
613,316781142,100,100
614,451204530,100,100


In [28]:
df_merged= df_merged.loc[df_merged.isbn.isin(df_popular.isbn), :]

In [29]:
df_merged

Unnamed: 0,isbn,user_id,rating
828530,002542730X,50216,10
685042,002542730X,93166,0
651501,002542730X,218404,8
194398,002542730X,28204,0
591844,002542730X,26535,0
...,...,...,...
561052,99771519,80371,0
755536,99771519,192854,0
257445,99771519,118434,9
393325,99771519,76626,0


In [30]:
#creating a dataframe with most frequent readers (those who have rated more than 10 books)
freq_users= df_merged.groupby('user_id').count()[['rating']].loc[df_merged.groupby('user_id').count()
                                                     [['rating']]['rating'] >= 10, :]

In [31]:
freq_users.reset_index(inplace= True)

In [32]:
df_merged = df_merged.loc[df_merged.user_id.isin(freq_users.user_id), :]

In [33]:
# final merged dataframe with popular books and frequent users
df_merged

Unnamed: 0,isbn,user_id,rating
194398,002542730X,28204,0
591844,002542730X,26535,0
328642,002542730X,225763,7
68366,002542730X,81088,0
240455,002542730X,88733,0
...,...,...,...
567209,99771519,26883,0
479794,99771519,149908,0
257445,99771519,118434,9
393325,99771519,76626,0


### Convert ISBN variables to numeric numbers in the correct order

In [34]:
df_merged.sort_values(by= 'isbn', inplace= True)

In [35]:
isbn = pd.DataFrame(df_merged.isbn.unique())

In [36]:
isbn['isbn_new'] = pd.Series(range(1, len(df_merged.isbn.unique())+1))

In [37]:
isbn.rename(columns= {0 : 'isbn'}, inplace= True)

In [38]:
#isbn_new = isbn

In [39]:
#df_merged['isbn_new'] = (range(1, df_merged.isbn.count()+1))

In [40]:
df_merged = df_merged.merge(right= isbn, on= 'isbn')

In [41]:
#df_merged.isbn_new = list(range(1, df_merged.isbn.count()+1))

In [42]:
df_merged

Unnamed: 0,isbn,user_id,rating,isbn_new
0,002542730X,28204,0,1
1,002542730X,184299,0,1
2,002542730X,165582,5,1
3,002542730X,130474,0,1
4,002542730X,77809,0,1
...,...,...,...,...
64939,99771519,170184,0,616
64940,99771519,110029,0,616
64941,99771519,31226,0,616
64942,99771519,148258,5,616


In [43]:
user_id_new= pd.DataFrame(df_merged.user_id.unique()).sort_values(by= 0)

In [44]:
user_id_new.reset_index(drop= True, inplace=True)

In [45]:
user_id_new

Unnamed: 0,0
0,243
1,254
2,507
3,638
4,882
...,...
2092,250359
2093,250405
2094,250510
2095,250645


In [46]:
user_id_new.reset_index(inplace=True)

In [47]:
user_id_new.rename(columns= {0 : "user_id"}, inplace= True)

In [48]:
user_id_new

Unnamed: 0,index,user_id
0,0,243
1,1,254
2,2,507
3,3,638
4,4,882
...,...,...
2092,2092,250359
2093,2093,250405
2094,2094,250510
2095,2095,250645


In [49]:
user_id_new['user_id_new'] = (range(1, user_id_new.user_id.count()+1))

In [50]:
user_id_new

Unnamed: 0,index,user_id,user_id_new
0,0,243,1
1,1,254,2
2,2,507,3
3,3,638,4
4,4,882,5
...,...,...,...
2092,2092,250359,2093
2093,2093,250405,2094
2094,2094,250510,2095
2095,2095,250645,2096


In [51]:
df_merged = df_merged.merge(right= user_id_new, on = 'user_id')

In [52]:
df_merged

Unnamed: 0,isbn,user_id,rating,isbn_new,index,user_id_new
0,002542730X,28204,0,1,214,215
1,006101351X,28204,0,4,214,215
2,014025448X,28204,0,6,214,215
3,014028009X,28204,0,7,214,215
4,038542471X,28204,9,23,214,215
...,...,...,...,...,...,...
64939,553279556,106369,9,447,864,865
64940,553279912,106369,8,448,864,865
64941,553280341,106369,9,449,864,865
64942,553280368,106369,9,450,864,865


### Re-index the columns to build a matrix

In [53]:
# importing train_test_split
from sklearn.model_selection import train_test_split

In [54]:
df_train, df_test = train_test_split(df_merged, test_size=0.20)

In [55]:
len(df_merged.user_id_new.unique())

2097

In [56]:
n_users= len(df_merged.user_id_new.unique())
n_books= len(df_merged.isbn_new.unique())

In [57]:
#creating matrix with shape (Number of users, Number of books)
# This will help in performing rmse to check accuracy
train_data_matrix = np.zeros((n_users, n_books))
test_data_matrix = np.zeros((n_users, n_books))

In [58]:
train_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
df_train

Unnamed: 0,isbn,user_id,rating,isbn_new,index,user_id_new
17979,385503822,235105,8,237,1939,1940
59478,60938455,217121,8,496,1788,1789
53157,671880314,201674,0,551,1658,1659
30709,399149155,36606,0,253,290,291
51561,60391626,74286,0,485,602,603
...,...,...,...,...,...,...
39486,671867091,90198,0,547,731,732
34938,743418174,208147,9,584,1715,1716
37497,451524934,75485,10,410,608,609
62970,671027360,3373,5,530,22,23


In [60]:
for line in df_train.itertuples():
    #[user_id index, movie_id index] = given rating.
    #if line[3].isdigit():
    train_data_matrix[int(line[6]) -1, int(line[4]) - 1] = line[3]
train_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 9., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [61]:
train_data_matrix.max()

10.0

In [62]:
for line in df_test.itertuples():
    #[user_id index, movie_id index] = given rating.
    #if line[1].isdigit():
        test_data_matrix[int(line[6]) -1, int(line[4]) - 1] = line[3] 
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
test_data_matrix.max()

10.0

## Cosine SImilarity

In [64]:
#importing pairwise_distance
from sklearn.metrics import pairwise_distances
user_similarity = 1- pairwise_distances(train_data_matrix, metric='cosine')
book_similarity = 1- pairwise_distances(train_data_matrix.T, metric='cosine')

In [65]:
mean_user_rating = np.zeros(len(df_merged.user_id_new.unique()))

In [66]:
train_data_matrix[0][np.nonzero(train_data_matrix[0])].mean()

8.571428571428571

In [67]:
# creating a loop that will fill mean rating of each user by taking a mean of his/her ratings
# Since 0 means no rating we will neglect them
for i in range(0, len(df_merged.user_id_new.unique())):
    mean_user_rating[i]= train_data_matrix[i][np.nonzero(train_data_matrix[i])].mean()

In [68]:
np.array([mean_user_rating]).T.shape

(2097, 1)

In [69]:
ratings_diff = (train_data_matrix - np.array([mean_user_rating]).T) 

In [70]:
ratings_diff

array([[-8.57142857, -8.57142857, -8.57142857, ..., -8.57142857,
        -8.57142857, -8.57142857],
       [-8.72727273, -8.72727273, -8.72727273, ..., -8.72727273,
        -8.72727273, -8.72727273],
       [-8.        , -8.        , -8.        , ..., -8.        ,
        -8.        , -8.        ],
       ...,
       [-9.        , -9.        , -9.        , ...,  0.        ,
        -9.        , -9.        ],
       [-8.        , -8.        , -8.        , ..., -8.        ,
        -8.        , -8.        ],
       [-8.        , -8.        , -8.        , ..., -8.        ,
        -8.        , -8.        ]])

In [71]:
# Creating a loop to correct rating difference where it is stored as 0
for i in range(0, len(ratings_diff)):
    for j in range(0, len(ratings_diff[i])):
        if ratings_diff[i][j] <= (0 -mean_user_rating[i]) or str(ratings_diff[i][j]) == 'nan':
            ratings_diff[i][j] = 0

In [72]:
ratings_diff.max()

5.818181818181818

In [73]:
# Predicting rating of all books by all users
user_pred = np.array([mean_user_rating]).T + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[8.55520936, 8.57665963, 8.58172274, ..., 8.58233849, 8.48794894,
        8.58410235],
       [8.71475645, 8.73134032, 8.7223933 , ..., 8.72995319, 8.62557714,
        8.72789684],
       [7.9994391 , 8.00176884, 7.9975038 , ..., 7.98670814, 7.84132963,
        8.00238592],
       ...,
       [8.9295592 , 9.00769633, 9.01369552, ..., 9.25583561, 8.98855828,
        9.0038701 ],
       [7.98627338, 7.95948644, 7.99650694, ..., 7.9960888 , 7.93120539,
        8.        ],
       [7.97206633, 8.00993813, 8.01768477, ..., 8.05188513, 7.89890696,
        8.01485285]])

In [74]:
for i in range(0, len(user_pred)):
    for j in range(0, len(user_pred[i])):
        if str(user_pred[i][j]) == 'nan':
            user_pred[i][j] = 0

In [75]:
#importing mean_squared_error and sqrt
from sklearn.metrics import mean_squared_error
from math import sqrt
# making a function to calculate rmse
def rmse(prediction, ground_truth):
    # Nonzero func only those values for which test matrix had nonzero values
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [76]:
#Passing our predictions and actual ratings to the function
print('RMSE: ' + str(rmse(user_pred, test_data_matrix)))

RMSE: 1.797931256621148


In [77]:
# Creating final matrix
actual_data_matrix= np.zeros((n_users, n_books))

In [78]:
# we will fill the final matrix with all ratings (Last time we filled only train data)
for line in df_merged.itertuples():
    #[user_id index, movie_id index] = given rating.
    #if line[3].isdigit():
    actual_data_matrix[int(line[6]) -1, int(line[4]) - 1] = line[3]
actual_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 9., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
user_similarity_actual = 1- pairwise_distances(actual_data_matrix, metric='cosine')

In [80]:
mean_user_rating_actual = np.zeros(len(df_merged.user_id_new.unique()))

In [81]:
for i in range(0, len(df_merged.user_id_new.unique())):
    mean_user_rating_actual[i]= train_data_matrix[i][np.nonzero(train_data_matrix[i])].mean()

In [82]:
ratings_diff_actual = (actual_data_matrix - np.array([mean_user_rating_actual]).T) 

In [83]:
for i in range(0, len(ratings_diff_actual)):
    for j in range(0, len(ratings_diff_actual[i])):
        if ratings_diff_actual[i][j] <= (0 -mean_user_rating_actual[i]) or str(ratings_diff_actual[i][j]) == 'nan':
            ratings_diff_actual[i][j] = 0

In [84]:
ratings_diff_actual

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [85]:
#Making predictions again
user_pred_actual = np.array([mean_user_rating_actual]).T + user_similarity_actual.dot(ratings_diff_actual)/np.array(
    [np.abs(user_similarity_actual).sum(axis=1)]).T
user_pred_actual

array([[8.55431426, 8.57529847, 8.57973323, ..., 8.5792533 , 8.41169773,
        8.58639139],
       [8.71722022, 8.72163234, 8.72586044, ..., 8.72445534, 8.55553109,
        8.72889106],
       [8.00789334, 7.98774714, 8.00465057, ..., 7.96645295, 7.80334198,
        7.99162876],
       ...,
       [8.98017803, 9.00320469, 9.01080112, ..., 9.13084177, 8.79983826,
        9.00507949],
       [7.99165119, 7.92934191, 8.00644147, ..., 7.99674939, 7.93886502,
        8.00913561],
       [7.97664036, 8.00837043, 8.01489508, ..., 8.0452314 , 7.91585968,
        8.01224257]])

In [86]:
# creating a function that takes user id and will recommend the books he hasn't read but will most likely love
# ie. give them highest rating based on predictions
def recommendor(user_id):
    if user_id in list(df_merged['user_id']):
        # converting user_id to user_id_new as it will allow to get his ratings from the matrix
        new_id = df_merged.loc[df_merged.user_id == user_id, :]['user_id_new'].iloc[0]
        # now we will get his ratings
        recommendations = user_pred_actual[int(new_id)]
        # put the books he has already rated as 0 as they shouldn't be recommended
        recommendations[[actual_data_matrix[new_id].nonzero()]] = 0
        recommend = pd.DataFrame(recommendations)
        # reset index to get book ids (isbn_new) in a separate colum
        recommend.reset_index(inplace = True)
        #sorting by rating
        recommended_books_ids = recommend.sort_values(by = 0, ascending = False).iloc[0:10]['index']
        recommended_books_ids_original = df_merged.loc[df_merged.isbn_new.isin(recommended_books_ids), :]['isbn'].unique()
        #return book title from df_books
        return df_books.loc[df_books.isbn.isin(recommended_books_ids_original), :]['book_title'].unique()
    else:
        # incase user has not rated more than 10 books we will recommend most popular books
        popular_books_final = df_ratings.groupby('isbn').count().sort_values(by = 
                                                                             'rating', ascending= False).iloc[0:10].index
        return df_books.loc[df_books.isbn.isin(popular_books_final), :]['book_title'].unique()
        

In [87]:
#passing a user_id who has rated more than 10 books
recommendor(28204)

array(['The Four Agreements: A Practical Guide to Personal Freedom',
       'Confessions of a Shopaholic (Summer Display Opportunity)',
       'Snow Falling on Cedars', 'Pay It Forward',
       'The Eyre Affair: A Novel', "The Handmaid's Tale : A Novel",
       'Holes (Yearling Newbery)', 'The Summons', 'A Fine Balance',
       'The Saving Graces: A Novel'], dtype=object)

In [88]:
#passing a user who has rated less than 10 books
recommendor(2)

array(['Wild Animus', 'Angels &amp; Demons', 'A Painted House',
       'The Secret Life of Bees', 'The Lovely Bones: A Novel',
       'The Red Tent (Bestselling Backlist)', 'The Da Vinci Code',
       'Divine Secrets of the Ya-Ya Sisterhood: A Novel',
       'Snow Falling on Cedars'], dtype=object)