In [4]:
import pandas as pd  # Load and manipulate data, provides data structure in form of data
import numpy as np   # 
import scipy as sp

In [None]:
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
#Used for KNN algorithum, various ml algo are part of it
from scipy.spatial.distance import correlation

from sklearn.metrics.pairwise import pairwise_distances
#cal distance between two points or a major pairs of points
from contextlib import contextmanager
#For resource management

In [None]:
from scipy.sparse import csr_matrix

In [None]:
import matplotlib.pyplot as plt
%matplotlibinline

In [None]:
import warnings
# Suppresses every warning category for the rest of the session, so
# deprecation warnings from the libraries used below will not be shown.
warnings.filterwarnings('ignore')

## Reading CSV files

### 1.Books data

In [None]:
# Load the book catalogue (';'-separated). Rows with too many fields are skipped.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (requires pandas >= 1.3).
books = pd.read_csv('BX-Books.csv', sep=';', on_bad_lines='skip')
books.columns = ["ISBN", "title", "author", "yearOfPublication", "publisher", "imgUrls", "imgLm", "imgurlLm"]
books.head()

In [None]:
books.shape

In [None]:
books.dtypes

### 2. Books Rating Data

In [None]:
# Load the user/book rating triples (';'-separated). Rows with too many fields are skipped.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (requires pandas >= 1.3).
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', on_bad_lines='skip')
ratings.columns = ["userID", "ISBN", "Rating"]
ratings.head()

In [None]:
ratings.shape

In [None]:
ratings.dtypes

### 3. Users data

In [None]:
# Load the user table (';'-separated). Rows with too many fields are skipped.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (requires pandas >= 1.3).
users = pd.read_csv('BX-Users.csv', sep=';', on_bad_lines='skip')
users.columns = ['userID', 'Location', 'Age']
users.head()

In [None]:
users.dtypes

In [None]:
users.shape

#  1. Books data preprocessing

In [None]:
print(books.shape)

In [None]:
books.head()

In [None]:
books.dtypes

In [None]:
# Show full cell contents without truncation. `None` is the supported way to
# say "no limit"; the old `-1` sentinel is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
books.yearOfPublication.unique()

We can observe that:

1. 'DK Publishing Inc' and 'Gallimard' appear in place of the year for some rows.
2. Some years are 0.
3. A few year values are not valid, e.g. 2037.

In [None]:
# Location where year Of publication is DK Publishing Inc
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]

In [None]:
# Repair the row for ISBN 078946697X, whose publisher ended up in the year column.
# The frame's columns are named 'author' and 'title'; assigning to
# 'bookAuthor'/'bookTitle' would silently create two new, mostly-NaN columns
# and leave the broken values in place.
is_target_row = books.ISBN == '078946697X'
books.loc[is_target_row, 'yearOfPublication'] = 2000
books.loc[is_target_row, 'author'] = "Michael Teitelbaum"
books.loc[is_target_row, 'publisher'] = "Dk Publishing Inc"
# Title only: the raw field had the author glued on after a stray '";'.
books.loc[is_target_row, 'title'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"

In [None]:
# Repair the row for ISBN 0789466953.
# The frame's columns are named 'author' and 'title'; assigning to
# 'bookAuthor'/'bookTitle' would silently create two new, mostly-NaN columns
# and leave the broken values in place.
is_target_row = books.ISBN == '0789466953'
books.loc[is_target_row, 'yearOfPublication'] = 2000
books.loc[is_target_row, 'author'] = "James Buckley"
books.loc[is_target_row, 'publisher'] = "Dk Publishing Inc"
# Closing parenthesis restored at the end of the title.
books.loc[is_target_row, 'title'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"

In [None]:
books.loc[books.yearOfPublication == 'Gallimard',:]

In [None]:
# Repair the row for ISBN 2070426769, whose publisher ended up in the year column.
# The frame's columns are named 'author' and 'title'; assigning to
# 'bookAuthor'/'bookTitle' would silently create two new, mostly-NaN columns
# and leave the broken values in place.
is_target_row = books.ISBN == '2070426769'
books.loc[is_target_row, 'yearOfPublication'] = 2003
# Author name written with the real accented character instead of mojibake.
books.loc[is_target_row, 'author'] = "Jean-Marie Gustave Le Clézio"
books.loc[is_target_row, 'publisher'] = "Gallimard"
books.loc[is_target_row, 'title'] = "Peuple du ciel, suivi de 'Les Bergers"

In [None]:
# Convert the year column to numbers; values that cannot be parsed become NaN.
year_as_number = pd.to_numeric(books['yearOfPublication'], errors='coerce')
books['yearOfPublication'] = year_as_number

In [None]:
print(sorted(books['yearOfPublication'].unique()))

In [None]:
# Means value of YOP
books.yearOfPublication.mean()

In [None]:
# Treat years after 2006 and the placeholder year 0 as missing, then fill
# every missing year with the rounded mean of the remaining years.
#
# - `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# - The filled column is assigned back instead of calling
#   `books.yearOfPublication.fillna(..., inplace=True)`: that form is a chained
#   in-place update on an attribute-accessed Series, which is not guaranteed to
#   write through to `books`.
invalid_year = (books['yearOfPublication'] > 2006) | (books['yearOfPublication'] == 0)
books.loc[invalid_year, 'yearOfPublication'] = np.nan

mean_year = round(books['yearOfPublication'].mean())
books['yearOfPublication'] = books['yearOfPublication'].fillna(mean_year)


In [None]:
# Store the publication year as a 32-bit integer.
books['yearOfPublication'] = books['yearOfPublication'].astype(np.int32)

In [None]:
# Find books info which does not have any publisher name
books.loc[books.publisher.isnull(),:]

In [None]:
# Give these two ISBNs the placeholder publisher name 'other'.
placeholder_isbns = ['193169656X', '1931696993']
books.loc[books['ISBN'].isin(placeholder_isbns), 'publisher'] = 'other'

# 2. Users data pre-processing

### 1. Users data

In [None]:
users.userID.values

In [None]:
users.dtypes

In [None]:
# fetching unique user age from users data
print(sorted(users.Age.unique()))

We can see that some user ages fall in the ranges 0-5 and 90-244, which is not realistic.

The column's dtype is also float64, so we will convert it to an integer type.


In [None]:
users.Age.mean()

In [None]:
# Ages below 5 or above 90 are treated as invalid:
#   1. blank them out,
#   2. fill all missing ages with the mean of the remaining ages,
#   3. store the column as 32-bit integers.
age_out_of_range = (users['Age'] > 90) | (users['Age'] < 5)
users.loc[age_out_of_range, 'Age'] = np.nan

mean_age = users['Age'].mean()
users['Age'] = users['Age'].fillna(mean_age)
users['Age'] = users['Age'].astype(np.int32)

In [None]:
#Let's check the refined data
print(sorted(users.Age.unique()))

# 3. Rating data pre-processing

In [None]:
#Identifies the size and column details
print(ratings.shape)
print(list(ratings.columns))

In [None]:
ratings.head()

In [None]:
# No null value is present
ratings.isnull().sum()

### Rating distribution

In [None]:
# Bar chart of how many times each rating value occurs.
plt.rc("font", size=15)
# sort_index() puts the bars in rating order; value_counts(sort=False) alone
# does not guarantee any particular order of the rating values.
ax = ratings.Rating.value_counts().sort_index().plot(kind='bar')
ax.set_title("Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
# plt.show() renders the figure; plt.plot() with no arguments draws nothing.
plt.show()

--> We can observe that the ratings range from 0 to 10.

--> The count for rating 0 is higher than for the other ratings.

In [None]:
# DataFrame.shape is (rows, columns); len(frame) gives the same row count.
n_users = len(users)  # number of rows in the users table
n_books = len(books)  # number of rows in the books table
print(n_users, n_books, n_users * n_books, sep='\n')

In [None]:
# Keep only the ratings whose ISBN also appears in the books table.
isbn_in_catalogue = ratings['ISBN'].isin(books['ISBN'])
new_ratings = ratings[isbn_in_catalogue]


In [None]:
# Narrow new_ratings further to rows whose userID also appears in the users table.
user_is_known = new_ratings['userID'].isin(users['userID'])
new_ratings = new_ratings[user_is_known]


In [None]:
print(ratings.shape) # Already existing ratings data
print(new_ratings.shape)  # newly formed data with the help of comparison  

In [None]:
new_ratings.head() 

In [None]:
new_ratings.dtypes

In [None]:
# Sparsity = share of the (user x book) grid that has no rating.
# A high value is the usual situation in collaborative filtering.
observed_ratings = len(new_ratings)
grid_size = float(n_users * n_books)
sparsity = 1.0 - observed_ratings / grid_size
print(f"The sparsity level of book crossing is {sparsity * 100}%")

In [None]:
ratings.Rating.unique()

In [None]:
# Split the ratings in two:
#   rating_explict - rows with a non-zero rating
#   rating_implict - rows with a rating of 0
is_nonzero = new_ratings['Rating'] != 0
rating_explict = new_ratings[is_nonzero]
rating_implict = new_ratings[~is_nonzero]

In [None]:
# Users who appear in rating_explict, i.e. who gave at least one non-zero rating
user_exp_ratings = users[users.userID.isin(rating_explict.userID)]

# Users who appear in rating_implict, i.e. who have at least one rating of 0
# (a user can be in both subsets)
user_imp_ratings = users[users.userID.isin(rating_implict.userID)]

In [None]:
#checking shape
print(new_ratings.shape)
print(rating_explict.shape)
print(rating_implict.shape)

In [None]:
sns.countplot(data =rating_explict,x='Rating' )
plt.show()


#  Popularity Based Recommendation

In [None]:
# Popularity score per book = sum of all explicit ratings it received (grouped
# by ISBN). The seven highest-scoring books are what we would recommend to anyone.
# Note: despite its name, ratings_count holds rating *sums*, not counts.
ratings_count = rating_explict.groupby('ISBN')['Rating'].sum().to_frame()
top7 = ratings_count.sort_values(by='Rating', ascending=False).head(7)

print("Following are the top rated books")
top7.merge(books, left_index=True, right_on='ISBN')

# Collaborative filtering based Recommendation System

In [None]:
# Number of explicit ratings given by each userID.
count1 = rating_explict.userID.value_counts()

In [None]:
# Keep only the ratings from frequent users (those with at least 100 explicit
# ratings) and store the result back into rating_explict.
frequent_user_ids = count1[count1 >= 100].index
rating_explict = rating_explict[rating_explict['userID'].isin(frequent_user_ids)]

In [None]:
''' finding the count of unique book rating'''
# NOTE(review): this counts how often each Rating *value* occurs in
# rating_explict, not how many ratings each book (ISBN) received.
# If a per-book count was intended, the column would be 'ISBN' — confirm.
count = rating_explict['Rating'].value_counts()

In [None]:
# Keep the rows whose Rating value appears in `count` at least 100 times.
# NOTE(review): this assumes `count` is still the Rating value_counts from the
# previous cell, so the filter is on rating *values*, not on books — confirm
# this is the intended behaviour.
rating_explict = rating_explict[rating_explict['Rating'].isin(count[count >= 100].index)]

# Show the size and a preview instead of printing the entire frame.
print(rating_explict.shape)
rating_explict.head()

In [None]:
# Build the user x book rating matrix: one row per userID, one column per ISBN.
ratingMatrix = rating_explict.pivot(index='userID', columns='ISBN', values='Rating')
userID, ISBN = ratingMatrix.index, ratingMatrix.columns

print(ratingMatrix.shape)
ratingMatrix.head()

In [None]:
# Missing (user, book) pairs become 0, then the matrix is stored as int32.
ratingMatrix.fillna(0, inplace=True)
filled_as_int = ratingMatrix.astype(np.int32)
ratingMatrix = filled_as_int

In [None]:
ratingMatrix.head()

In [None]:
# Neighbour-search settings. Names assigned at cell (module) level are already
# global, so no `global` statement is needed here.
k = 10             # number of neighbours
metric = 'cosine'  # distance metric name

#  Collaborative filtering using KNN

## In order to find out which books are popular, we need to combine book data with rating data

In [None]:
# Reload the three raw CSV files. This rebinds `books`, `users` and `ratings`,
# so any cleaning applied to the earlier frames is not carried over.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (requires pandas >= 1.3).
books = pd.read_csv("BX-Books.csv", sep=';', on_bad_lines='skip')
books.columns = ["ISBN", "title", "author", "yearOfPublication", "publisher", "imgUrls", "imgLm", "imgurlLm"]

users = pd.read_csv('BX-Users.csv', sep=';', on_bad_lines='skip')
users.columns = ['userID', 'Location', 'Age']

ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', on_bad_lines='skip')
ratings.columns = ["userID", "ISBN", "Rating"]

In [None]:
# Attach book details to every rating via the shared ISBN column.
combine_book_rating = ratings.merge(books, on='ISBN')
# Book columns that are not needed for the rating analysis.
columns = ["author", "yearOfPublication", "publisher", "imgUrls", "imgLm", "imgurlLm"]

In [None]:
# Remove the columns listed in `columns` from the merged frame.
combine_book_rating = combine_book_rating.drop(columns=columns)
combine_book_rating.head()

## Group by book titles and create new column for total rating count

In [None]:
# Discard rows that have no title.
combine_book_rating = combine_book_rating.dropna(subset=['title'])

In [None]:
# Count the ratings per title and expose the result as a two-column frame:
# 'title' and 'TotRatingCount'.
book_ratingCount = (
    combine_book_rating.groupby('title')['Rating']
    .count()
    .rename('TotRatingCount')
    .reset_index()
)

In [None]:
book_ratingCount.head()

In [None]:
# Add each title's total rating count to every rating row.
# Inner join on 'title': only titles present in both frames are kept.
rating_with_totalratingCount = pd.merge(
    combine_book_rating,
    book_ratingCount,
    on='title',
    how='inner',
)

rating_with_totalratingCount.head()

In [None]:
# Show floats with three decimals, then summarise the per-title rating counts.
pd.set_option('display.float_format', '{:.3f}'.format)
count_summary = book_ratingCount['TotRatingCount'].describe()
print(count_summary)

The median book has been rated only once. Let's look at the top of the distribution

In [None]:
print(book_ratingCount['TotRatingCount'].quantile(np.arange(.9,1,.01)))
#it is to consider the range of top 10% with difference of 1%. i.e from 90% to 100% identfy the total rating count for each
#of the quantile

About 1% of the books received 50 or more ratings. Because we have so many books in our data,
we will limit it to the top 1%, and this will give us 7085 unique books.

In [None]:
# Minimum number of ratings a title needs to count as "popular".
popularity_threshold = 50
# `@name` inside query() refers to the Python variable of that name.
rating_popular_book = rating_with_totalratingCount.query('TotRatingCount >= @popularity_threshold')

In [None]:
print(rating_with_totalratingCount.shape)
rating_with_totalratingCount.head()

In [None]:
print(rating_popular_book.shape)
rating_popular_book.head()

In [None]:
rating_popular_book['ISBN'].nunique()#It gives the information of unique books are there with respect to ISBN

## Filter to users in US and Canada Only

In order to improve computing speed and not run into the "MemoryError" issue, we will limit our user data to those in the US and Canada, and then combine the user data with the total rating count data.

In [None]:
# Left join: keep every popular-book rating and attach the user's columns
# where the userID is found in `users`.
combined = pd.merge(rating_popular_book, users, on='userID', how='left')

In [None]:
# Keep ratings from users whose Location mentions "usa" or "canada".
# na=False makes rows with a missing Location evaluate to False; without it
# str.contains returns NaN for those rows and boolean indexing raises.
in_us_or_canada = combined['Location'].str.contains("usa|canada", na=False)
usCandaUserRating = combined[in_us_or_canada]
usCandaUserRating = usCandaUserRating.drop(columns="Age")
usCandaUserRating.head()

In [None]:
# A (userID, title) pair may appear only once for the pivot below.
# If duplicates exist, drop them and report how many rows were removed.
pair_keys = ['userID', 'title']
if usCandaUserRating.duplicated(pair_keys).any():
    initial_rows = usCandaUserRating.shape[0]  # row count before de-duplication
    print('Initial dataframe shape {0}'.format(usCandaUserRating.shape))

    usCandaUserRating = usCandaUserRating.drop_duplicates(pair_keys)
    current_rows = usCandaUserRating.shape[0]  # row count after de-duplication

    print('New dataframe shape {0}'.format(usCandaUserRating.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

In [None]:
# Title x user rating table (missing ratings become 0), plus a sparse CSR copy
# of its values for the nearest-neighbour model.
title_user_table = usCandaUserRating.pivot(index='title', columns='userID', values='Rating')
usCandaUserRating_pivot = title_user_table.fillna(0)
usCandaUserRating_matrix = csr_matrix(usCandaUserRating_pivot.values)

# Finding nearest neighbours

In [None]:
# Brute-force nearest-neighbour search using cosine distance, fitted on the
# sparse title x user matrix.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(usCandaUserRating_matrix)

## Test our model and make recommendations

In [None]:
# Pick a random title and list its nearest neighbours in rating space.
query_index = np.random.choice(usCandaUserRating_pivot.shape[0])
query_vector = usCandaUserRating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

# The loop previously iterated over `distance`, which is never defined
# (NameError); the array returned by kneighbors is `distances`.
neighbor_distances = distances.flatten()
neighbor_indices = indices.flatten()

print("Recommendations for {0}: \n".format(usCandaUserRating_pivot.index[query_index]))
# Position 0 is skipped, as in the original loop — presumably because the
# closest match is the query title itself; verify if the data has exact duplicates.
for i in range(1, len(neighbor_distances)):
    print("{0}:{1}, with distance of {2}:".format(
        i, usCandaUserRating_pivot.index[neighbor_indices[i]], neighbor_distances[i]))

In [None]:
usCandaUserRating_pivot2 =usCandaUserRating.pivot(index='userID',columns='title',values='Rating').fillna(0)

In [None]:
usCandaUserRating_pivot2.head()

In [None]:
usCandaUserRating_pivot2.shape

In [None]:
x = usCandaUserRating_pivot2.values.T
x.shape

In [None]:
import sklearn
# The class is named TruncatedSVD; `TruncatesSVD` does not exist and raises ImportError.
from sklearn.decomposition import TruncatedSVD