### Step 1: Importing required libraries

In [1]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

import warnings
warnings.filterwarnings('ignore')

# Step 2: Dataset Loading & Preprocessing

In [2]:
# Load the raw book-ratings CSV.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.

dataset = pd.read_csv("book.csv", encoding="ISO-8859-1")

In [3]:
# Display the first five rows of the dataset.

dataset.head()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [4]:
# Remove the 'Unnamed: 0' column shown in the preview of the raw file.

dataset = dataset.drop(columns=['Unnamed: 0'])

In [5]:
# Renaming columns by name rather than by position: the mapping does not
# depend on column order, and re-running this cell later (e.g. after Book_ID
# has been added) is a harmless no-op instead of a length-mismatch error.

dataset = dataset.rename(columns={
    'User.ID': 'User_ID',
    'Book.Title': 'Book_Title',
    'Book.Rating': 'Book_Rating',
})

In [6]:
# Number of distinct book titles in the dataset.
dataset['Book_Title'].nunique()

9659

In [7]:
# Create a Book_ID column: each distinct title gets the integer code that
# pandas assigns to its category.

dataset['Book_ID'] = (
    dataset['Book_Title']
    .astype('category')
    .cat.codes
)

In [8]:
# Preview the frame again to check the renamed columns and the new Book_ID column.
dataset.head()

Unnamed: 0,User_ID,Book_Title,Book_Rating,Book_ID
0,276726,Classical Mythology,5,1443
1,276729,Clara Callan,3,1440
2,276729,Decision in Normandy,6,1883
3,276736,Flu: The Story of the Great Influenza Pandemic...,8,2722
4,276737,The Mummies of Urumchi,6,7969


### Building matrix DataFrames with rating values

In [9]:
# User x book rating matrix: one row per User_ID, one column per Book_ID.
# pivot_table's default aggregation (mean) is applied if a user/book pair
# occurs more than once.
matrix_df = dataset.pivot_table(
    index='User_ID',
    columns='Book_ID',
    values='Book_Rating',
)

In [10]:
# Preview the rating matrix; user/book pairs without a rating are NaN.
matrix_df.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Impute missing ratings with each book's (column's) mean rating.
# DataFrame.mean() averages down the rows by default, giving one mean per Book_ID.
final_book = matrix_df.fillna(matrix_df.mean())

In [12]:
# Preview the matrix after filling gaps with per-book averages.
final_book.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,8.0,6.0,4.0,7.0,8.0,7.0,10.0,10.0,7.0,10.0,...,8.0,9.0,10.0,7.0,8.0,8.0,9.0,7.0,5.0,4.0
9,8.0,6.0,4.0,7.0,8.0,7.0,10.0,10.0,7.0,10.0,...,8.0,9.0,10.0,7.0,8.0,8.0,9.0,7.0,5.0,4.0
10,8.0,6.0,4.0,7.0,8.0,7.0,10.0,10.0,7.0,10.0,...,8.0,9.0,10.0,7.0,8.0,8.0,9.0,7.0,5.0,4.0
12,8.0,6.0,4.0,7.0,8.0,7.0,10.0,10.0,7.0,10.0,...,8.0,9.0,10.0,7.0,8.0,8.0,9.0,7.0,5.0,4.0
14,8.0,6.0,4.0,7.0,8.0,7.0,10.0,10.0,7.0,10.0,...,8.0,9.0,10.0,7.0,8.0,8.0,9.0,7.0,5.0,4.0


In [13]:
# Replacing NaN by user average.
# Vectorised instead of a Python lambda per row: transpose so every user is a
# column, fill each column with that user's mean rating (fillna aligns the
# Series of row means on the User_ID column labels), then transpose back.
final_user = matrix_df.T.fillna(matrix_df.mean(axis=1)).T

In [14]:
# Preview the matrix after filling gaps with per-user averages.
final_user.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,...,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429,5.571429
9,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
10,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
12,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
14,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,...,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333,5.333333


In [16]:
# User-user cosine similarity on the matrix imputed with per-user averages.
# NOTE(review): the values displayed below are all ~1.0 because the imputed
# rows are almost constant vectors; consider mean-centring the ratings before
# computing similarity so that the neighbour ranking is meaningful.

b = cosine_similarity(final_user)
# Zero the diagonal (self-similarity) - presumably so that a user is not
# selected as their own nearest neighbour.
np.fill_diagonal(b, 0)
similarity_with_user = pd.DataFrame(
    b,
    index=final_user.index,
    columns=final_user.index,
)
similarity_with_user.head()

User_ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.999994,0.999994,0.999994,0.999993,0.999994,0.999977,0.999994,0.999994,0.999994,...,0.999994,0.999994,0.999994,0.999971,0.999993,0.999994,0.999994,0.999955,0.999994,0.999992
9,0.999994,0.0,1.0,1.0,0.999999,1.0,0.999984,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999977,0.999999,1.0,1.0,0.999961,1.0,0.999998
10,0.999994,1.0,0.0,1.0,0.999999,1.0,0.999984,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999977,0.999999,1.0,1.0,0.999961,1.0,0.999998
12,0.999994,1.0,1.0,0.0,0.999999,1.0,0.999984,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999977,0.999999,1.0,1.0,0.999961,1.0,0.999998
14,0.999993,0.999999,0.999999,0.999999,0.0,0.999999,0.999982,0.999999,0.999999,0.999998,...,0.999999,0.999999,0.999999,0.999976,0.999998,0.999999,0.999999,0.99996,0.999999,0.999997


# 1. Content based filtering

This algorithm recommends products which are similar to the ones that a user has liked in the past.

![content based filtering image](https://github.com/ShrikantUppin/Recommendation-Engine/blob/main/content%20based%20filtering.png?raw=true)

The similarity between two users is the similarity between their rating vectors. A quantifying metric is needed in order to measure the similarity between the users' vectors. Some commonly used distance and similarity metrics are:

* Jaccard similarity

* Cosine similarity

* Pearson correlation coefficient

The cosine similarity between two nonzero user vectors is the cosine of the angle between them.

Two term-frequency vectors (for example, a profile vector and an item vector) may have many 0 values in common, meaning that the corresponding documents do not share many words, but this does not make them similar. We need a measure that focuses on the words that the two documents do have in common, and on the occurrence frequency of such words. In other words, we need a measure for numeric data that ignores zero-matches.

### Consider the example of Netflix

#### Profile vector

They save all the information related to each user in a vector form. This vector contains the past behavior of the user, i.e. the movies liked/disliked by the user and the ratings given by them. This vector is known as the profile vector. 

#### Item vector 

All the information related to movies is stored in another vector called the item vector. Item vector contains the details of each movie, like genre, cast, director, etc.

The content-based filtering algorithm finds the cosine of the angle between the profile vector and item vector, i.e. cosine similarity. Suppose A is the profile vector and B is the item vector, then the similarity between them can be calculated as:

![](https://github.com/ShrikantUppin/Recommendation-Engine/blob/main/cosine.png?raw=true)

In [None]:
# User-user cosine similarity on the matrix imputed with per-book averages.
# Rows AND columns are labelled from final_book.index: the matrix is computed
# from final_book, so its labels should not depend on final_user.

cosine = cosine_similarity(final_book)
# Zero the diagonal (self-similarity) - presumably so that a user is not
# selected as their own nearest neighbour.
np.fill_diagonal(cosine, 0)
similarity_with_book = pd.DataFrame(
    cosine,
    index=final_book.index,
    columns=final_book.index,
)
similarity_with_book.head()

In [None]:
def find_n_neighbours(df, n):
    """Return, for every row, the labels of its ``n`` highest-valued columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix; each row is ranked independently across its columns.
    n : int
        Number of top-scoring columns to keep per row. It must not exceed the
        number of columns in ``df``.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and columns ``top1`` ... ``top<n>``
        holding the column labels of ``df`` ordered from highest to lowest
        score in that row.
    """
    # Build the output labels once instead of once per row.
    rank_labels = ['top{}'.format(i) for i in range(1, n + 1)]

    def top_labels(row):
        # Sort the row from highest to lowest score and keep the first n labels.
        best = row.sort_values(ascending=False).iloc[:n].index
        return pd.Series(best, index=rank_labels)

    return df.apply(top_labels, axis=1)

In [None]:
# Top 30 neighbours of each user, ranked on the similarity computed from the
# user-average-imputed ratings.
sim_user_30_u = find_n_neighbours(similarity_with_user, n=30)
sim_user_30_u.head()

In [None]:
# Top 30 neighbours of each user, ranked on the similarity computed from the
# book-average-imputed ratings.
sim_user_30_m = find_n_neighbours(similarity_with_book, n=30)
sim_user_30_m.head(5)

In [None]:
def get_user_similar_books( user1, user2 ):
    """Return the books rated by both ``user1`` and ``user2``.

    The rows of ``Rating_avg`` belonging to each user are inner-joined on
    ``Book_ID`` and the result is then joined with ``movies`` on ``Book_ID``.

    NOTE(review): ``Rating_avg``, its ``userId`` column and ``movies`` are not
    defined in the visible part of this notebook, which names the user column
    ``User_ID`` - confirm these names exist before this function is called.
    """
    ratings_of_first = Rating_avg[Rating_avg.userId == user1]
    ratings_of_second = Rating_avg[Rating_avg.userId == user2]
    # Keep only the books that appear in both users' ratings.
    shared = ratings_of_first.merge(ratings_of_second, on="Book_ID", how="inner")
    return shared.merge(movies, on='Book_ID')