# Book Rental Recommendation

## Project_04

In [1]:
# First importing the required library.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# For Removing the unneccesary warnings
import warnings
warnings.filterwarnings("ignore")

### 1. Read the books dataset and explore it.

In [3]:
df_user = pd.read_csv("BX-Users.csv", encoding= 'latin-1')

In [4]:
df_user.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
df_user.shape

(278859, 3)

In [6]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278859 non-null  object 
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.4+ MB


In [7]:
df_books = pd.read_csv('BX-Books.csv', encoding= 'latin-1')

In [8]:
df_books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [9]:
df_books.shape

(271379, 5)

### 2. Clean up NaN values.

In [10]:
# First checking the nan values in the data set 
df_user.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

In [11]:
# Droping the nan Values.
df_user = df_user.dropna(axis=0)

In [12]:
df_user.isnull().sum() # Now there is no nan values.

user_id     0
Location    0
Age         0
dtype: int64

### 3. Read the data where ratings are given by users.

In [13]:
df_book_ratings = pd.read_csv('BX-Book-Ratings.csv', encoding= 'latin-1', nrows=10000)

In [14]:
df_book_ratings.tail()

Unnamed: 0,user_id,isbn,rating
9995,243,425164403,0
9996,243,440224764,0
9997,243,440225701,0
9998,243,440226430,0
9999,243,440234743,0


In [15]:
df_book_ratings.shape

(10000, 3)

In [16]:
df_book_ratings.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [17]:
df_book_ratings.describe()

Unnamed: 0,user_id,rating
count,10000.0,10000.0
mean,265844.3796,1.9747
std,56937.189618,3.424884
min,2.0,0.0
25%,277478.0,0.0
50%,278418.0,0.0
75%,278418.0,4.0
max,278854.0,10.0


#### Now merging the two data set books and rating.

In [18]:
# Joining te data set 
df = pd.merge(df_book_ratings, df_books, on = 'isbn')
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


In [19]:
df.shape

(8701, 7)

### 4. Take a quick look at the number of unique users and books.

In [20]:
# Number of the unique user.
n_users = df['user_id'].nunique()

In [21]:
print("Number of users: {}".format(n_users))

Number of users: 828


In [22]:
# Number of the unique books.
n_books = df['isbn'].nunique()

In [23]:
print("Number of books: {}".format(n_books))

Number of books: 8051


### 5. Convert the user_id variable to numeric numbers in the correct order.

In [24]:
# convert user_id into the numeric number.
list_userid = df.user_id.unique()
print("length of isbn list: ", len(list_userid))

length of isbn list:  828


In [25]:
def userid_numeric(user_id):
    itemindex = np.where(list_userid==user_id)
    return itemindex[0][0]

In [26]:
# do the same with ISBN and it into the numeric number.
list_isbn = df.isbn.unique()
print("length of isbn list: ", len(list_isbn))

length of isbn list:  8051


In [27]:
list_isbn

array(['034545104X', '155061224', '446520802', ..., '425098834',
       '425163407', '425164403'], dtype=object)

In [28]:
def isbn_numeric_id(isbn):
    itemindex = np.where(list_isbn==isbn)
    return itemindex[0][0]

### 6. Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

In [29]:
df['user_id_order'] = df['user_id'].apply(userid_numeric)

In [30]:
df['isbn_order'] = df['isbn'].apply(isbn_numeric_id)

In [31]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_order
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


### 7. Re-index the columns to build a matrix.

In [32]:
ordered_col = ['user_id_order', 'isbn_order', 'rating', 'book_title', 'book_author', 'year_of_publication','publisher',
               'user_id', 'isbn']
df = df.reindex(columns = ordered_col)

In [33]:
df.head()

Unnamed: 0,user_id_order,isbn_order,rating,book_title,book_author,year_of_publication,publisher,user_id,isbn
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,276725,034545104X
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle,276726,155061224
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,276727,446520802
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,278418,446520802
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,276729,052165615X


### 8. Split your data into two sets (training and testing).

In [34]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.30, random_state = 10)

In [35]:
train.shape

(6090, 9)

In [36]:
test.head()

Unnamed: 0,user_id_order,isbn_order,rating,book_title,book_author,year_of_publication,publisher,user_id,isbn
7794,3,7160,0,Aerie Tik Tok of Oz: Defiant-Cn16dp,Baum,1988,Tor Books,278418,1559029897
5451,3,4839,0,"Wild Honey (Harlequin Super Romance, No 731)",Veronica Sattler,1997,Harlequin,278418,373707312
8288,53,7640,0,Click,Dan Whipple,2001,University Press of Colorado,278633,870816527
2853,465,2373,0,The Devil's Arithmetic,Jane Yolen,1990,Puffin Books,277814,140345353
5348,3,4736,0,Way We Wed (A Year Of Loving Dangerously) (Sil...,Pat Warren,2001,Silhouette,278418,373271409


In [37]:
test.shape

(2611, 9)

### 9. Make predictions based on user and item variables.

In [47]:
train_matrix = np.zeros((n_users, n_books))
for line in train.itertuples():
    train_matrix[line[1]-1, line[2]-1] = line[3]
    
test_matrix = np.zeros((n_users, n_books))
for line in test.itertuples():
    test_matrix[line[1]-1, line[2]-1] = line[3]    

In [48]:
from sklearn.metrics.pairwise import pairwise_distances 
user_correlation = pairwise_distances(train_matrix, metric= 'cosine')
item_correlation = pairwise_distances(train_matrix.T, metric= 'cosine')

In [49]:
user_correlation

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [50]:
def predict(ratings,correlation, type= 'user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        rating_diff = (ratings - mean_user_rating[:,np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + correlation.dot(rating_diff) / np.array([np.abs(correlation).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(correlation) / np.array([np.abs(correlation).sum(axis=1)])
        
    return pred    

In [51]:
user_prediction = predict(train_matrix, user_correlation, type = 'user')
item_prediction = predict(train_matrix, item_correlation, type = 'item')

In [60]:
item_prediction.shape

(828, 8051)

### 10. Use RMSE to evaluate the predictions.

In [64]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten() 
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, actual))

In [65]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_matrix)))

User-based CF RMSE: 7.615419189906161
Item-based CF RMSE: 7.6146548279559525
