In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

####  Read the books dataset and explore it


In [2]:
#reading in csv files
books=pd.read_csv("BX-Books.csv", encoding="utf-8") 
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,312956762,Die Hard: With a Vengeance - A Novel,D. Chiel,1995,St Martins Pr
1,034545006X,The Taking,J. D. Landis,2003,Ballantine Books
2,1579651372,Ruskin's Rose: A Venetian Love Story,Mimma Balia,2000,Artisan Publishers
3,416652700,The Pooh cook book: inspired by \Winnie-the-Po...,";Katie Stewart""",1971,Methuen
4,1853114103,Lord Have Mercy,"\\""Ron\""""",2001,Canterbury Press


In [3]:
books.tail()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
271353,1573222623,Drinking Coffee Elsewhere (Today Show Book Clu...,ZZ Packer,2003,Riverhead Books
271354,1573222348,Drinking Coffee Elsewhere (Alex Awards (Awards)),ZZ Packer,2003,Riverhead Books
271355,1573223786,Drinking Coffee Elsewhere,Zz Packer,2004,Riverhead Books
271356,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing
271357,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley


In [4]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271358 entries, 0 to 271357
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271358 non-null  object
 1   book_title           271358 non-null  object
 2   book_author          271356 non-null  object
 3   year_of_publication  271358 non-null  int64 
 4   publisher            271356 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.4+ MB


#### Clean up NaN values

In [5]:
books[books.isna().any(axis=1)]# records containing null values

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
70531,193169656X,Tyrant Moon,Elaine Corvidae,2002,
159822,1931696993,Finders Keepers,Linnea Sinclair,2001,
271356,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing
271357,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley


In [6]:
#dropping records
books=books.dropna()

In [7]:
books.shape# 4 records have been dropped 

(271354, 5)

#### Read the data where ratings are given by users



In [8]:
ratings=pd.read_csv("BX-Book-Ratings.csv",encoding="latin-1")# utf-8 encoding doesn't work

In [9]:
ratings.head()


Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1048575 non-null  int64 
 1   isbn     1048575 non-null  object
 2   rating   1048575 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [11]:
#taking the first 10,000 rows of the dataset because of Out of Memory error

ratings=ratings[0:10000]


In [12]:
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


#### Creating Merged Dataset

In [13]:
#merging with books dataset 
df_merged=pd.merge(ratings, books, on='isbn')

In [14]:
df_merged.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


In [15]:
# unique users and books
unique_user_count=df_merged.user_id.nunique()
unique_book_count=df_merged.isbn.nunique()

unique_user_count, unique_book_count

(828, 8051)


* Convert ISBN variables to numeric numbers in the correct order

* Convert the user_id variable to numeric numbers in the correct order

* Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1


In [16]:
unique_users_list=df_merged.user_id.unique()
unique_books_list=df_merged.isbn.unique()


In [17]:
#creating dictionary to map sequential values from 0 to n-1
unique_users_dict={user:index for index,user in enumerate(unique_users_list) }
unique_books_dict={book:index for index,book in enumerate(unique_books_list)}

In [18]:
unique_users_dict[276726]

1

In [19]:
#mapping function
def mapping_users(key):
    return unique_users_dict[key]
def mapping_books(key):
    return unique_books_dict[key]
    

In [20]:
# adding new index values to dataframe
df_merged['ordered_user_id']=df_merged['user_id'].apply(mapping_users)
df_merged['ordered_book_id']=df_merged['isbn'].apply(mapping_books)

In [21]:
df_merged.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,ordered_user_id,ordered_book_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


Re-index the columns to build a matrix

In [22]:
df_merged=df_merged.reindex(columns=['ordered_user_id','ordered_book_id','rating','book_title','book_author','year_of_publication','publisher'])

In [23]:
df_merged.head()

Unnamed: 0,ordered_user_id,ordered_book_id,rating,book_title,book_author,year_of_publication,publisher
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


#### Building Model

Split your data into two sets (training and testing)



In [24]:
df_train,df_test=train_test_split(df_merged,test_size=0.2)

Using Memory-Based Collaborative Filtering

In [26]:
# creating user-book matrix

train_user_book_matrix=np.zeros((unique_user_count, unique_book_count))

#indices of matrix are one behind.
#Ex: Rating position of User 1, Book 1 would be at [0,0] in the matrix


for df_train_row in df_train.itertuples():
    
    # Each df_train_row is one row of the training data: 
    # first row output: Pandas(Index=2156, ordered_user_id=109, ordered_book_id=1766, rating=0, book_title='Snow Country'...)
    train_user_book_matrix[df_train_row[1]-1,df_train_row[2]-1]=df_train_row[3]

    

In [27]:
#test matrix
test_user_book_matrix= np.zeros((unique_user_count, unique_book_count))
for df_test_row in df_test.itertuples():
    test_user_book_matrix[df_test_row[1]-1,df_test_row[2]-1]=df_test_row[3]

In [28]:
# calculating cosine similarity using Pairwise modeling, for User-based modeling as well as item-based modeling

#User-based similarity
user_sim=pairwise_distances(train_user_book_matrix,metric='cosine') #measures cosine of every pair of user vectors

#item-based similarity
item_sim=pairwise_distances(train_user_book_matrix.T,metric='cosine')# measures cosine of every pair of book vectors

In [29]:
print(user_sim)
print("shape of user_sim is ", user_sim.shape)
print(item_sim)
print("shape of every item_sim is", item_sim.shape)

[[0. 1. 1. ... 1. 1. 1.]
 [1. 0. 1. ... 1. 1. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 1. 0. 1.]
 [1. 1. 1. ... 1. 1. 0.]]
shape of user_sim is  (828, 828)
[[0. 1. 1. ... 1. 1. 1.]
 [1. 0. 1. ... 1. 1. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 1. 0. 1.]
 [1. 1. 1. ... 1. 1. 0.]]
shape if every item_sim is (8051, 8051)


#### Model Predictions

In [35]:
def model_prediction(matrix,memory_type):
    if memory_type=="user": # user-based recommendation
        average_user_rating=matrix.mean(axis=1) # calculating average rating of each user
        rating_diff=matrix-average_user_rating.reshape(-1,1)# subtracting each individual rating from their average
        
        #combining rating diff matrix with similarity matrix
        prediction= average_user_rating.reshape(-1,1) + user_sim.dot(rating_diff)/np.array([np.abs(user_sim).sum(axis=1)]).T
        
        #item-based recommendation
    elif memory_type=="item":
        prediction=matrix.dot(item_sim)/np.array([np.abs(item_sim).sum(axis=1)])
        
    return prediction

#### Model Evaluation

In [36]:
user_based_predictions=model_prediction(train_user_book_matrix,"user")
item_based_predictions=model_prediction(train_user_book_matrix,"item")

In [38]:
print(user_based_predictions)

[[-0.00139317 -0.00139317  0.0022344  ...  0.00948953 -0.00139317
  -0.00139317]
 [ 0.00403098 -0.00201497  0.0016126  ...  0.00886774 -0.00201497
  -0.00201497]
 [ 0.064223    0.05817517  0.06180387 ...  0.06906127  0.05817517
   0.05817517]
 ...
 [ 0.00403098 -0.00201497  0.0016126  ...  0.00886774 -0.00201497
  -0.00201497]
 [ 0.00403098 -0.00201497  0.0016126  ...  0.00886774 -0.00201497
  -0.00201497]
 [ 0.00403098 -0.00201497  0.0016126  ...  0.00886774 -0.00201497
  -0.00201497]]


In [39]:
print(item_based_predictions)

[[0.         0.00062112 0.0006212  ... 0.00062153 0.00062112 0.00062112]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.06012422 0.06012422 0.06013169 ... 0.06016418 0.06012422 0.06012422]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
