# <span style= "color:cyan"> BUILDING A RECOMMENDATION SYSTEM </SPAN>

Load Libraries

In [410]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from surprise import Reader, Dataset

from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

#### <span style= "color:orange"> Loading the dataset </SPAN>

In [310]:
def read_data(path, error_bad_lines = False, encoding = 'latin-1', sep=';', on_bad_lines = 'skip'):

    "A simple function that reads the data"
    
    data = pd.read_csv(path, error_bad_lines = error_bad_lines, encoding = encoding, sep = sep)
    return data

book_ratings = read_data(r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Book-Ratings.csv')
books = read_data(r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Books.csv')
users = read_data(r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Users.csv')

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


we have three datasets:
* `books`
* `users`
* `rating`

Let us explore them by viewing first five rows of each

In [311]:
""" calling on variable book_ratings to view the first 5 rows"""

book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [312]:
""" calling on variable books to view the first five rows"""

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [313]:
""" calling on variable users to view the first 5 rows"""

users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


#### <span style= "color:orange"> Preliminary Data understanding </SPAN>

In [314]:

def get_info_shape_stats(dataset, dataset_name):

    """A simple function to check the shape, info and descriptive statistics of the dataset"""
    
    print('The Dataset:', dataset_name )
    print(f"has {dataset.shape[0]} rows and {dataset.shape[1]} columns")
    print('---------------------------')
    print('---------------------------')
    print(dataset.info())
    print('---------------------------')
    print('----------------------------')
    print(dataset.describe())

In [315]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(book_ratings, 'Book Ratings')

The Dataset: Book Ratings
has 1149780 rows and 3 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
None
---------------------------
----------------------------
            User-ID   Book-Rating
count  1.149780e+06  1.149780e+06
mean   1.403864e+05  2.866950e+00
std    8.056228e+04  3.854184e+00
min    2.000000e+00  0.000000e+00
25%    7.034500e+04  0.000000e+00
50%    1.410100e+05  0.000000e+00
75%    2.110280e+05  7.000000e+00
max    2.788540e+05  1.000000e+01


In [316]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(books, 'Books')

The Dataset: Books
has 271360 rows and 8 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB
None
---------------------------
----------------------------
              ISBN      Book-Title      Book-Author  Year-Of-Publication  \
count       271360          271360           271359               271360   
unique      271360          24

* There are columns labelled None, with numerous null values, these will be analyzed during the data cleaning stage

In [317]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(users, 'Users')

The Dataset: Users
has 278858 rows and 3 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB
None
---------------------------
----------------------------
            User-ID            Age
count  278858.00000  168096.000000
mean   139429.50000      34.751434
std     80499.51502      14.428097
min         1.00000       0.000000
25%     69715.25000      24.000000
50%    139429.50000      32.000000
75%    209143.75000      44.000000
max    278858.00000     244.000000


In [318]:
def data_types(data, dataset_name):

    """A simple function to check the data types on th datasets """

    print("Dataset:",dataset_name, "has",len( data.select_dtypes(include='number').columns),
                "Numeric columns")
    
    print("and", len(data.select_dtypes(include='object').columns),
          "Categorical columns")

    print('*****************************************************')
    print('*****************************************************')

    print('Numerical Columns:', data.select_dtypes(include='number').columns)
    print('Categorical Coulumns:', data.select_dtypes(include='object').columns)

In [319]:
""" calling on the data_types function """

data_types(users, 'Users') 

Dataset: Users has 2 Numeric columns
and 1 Categorical columns
*****************************************************
*****************************************************
Numerical Columns: Index(['User-ID', 'Age'], dtype='object')
Categorical Coulumns: Index(['Location'], dtype='object')


In [320]:
""" calling on the data_types function """

data_types(books, 'Books')

Dataset: Books has 0 Numeric columns
and 8 Categorical columns
*****************************************************
*****************************************************
Numerical Columns: Index([], dtype='object')
Categorical Coulumns: Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')


In [321]:
""" calling on the data_types function """

data_types(book_ratings, 'Book Ratings')

Dataset: Book Ratings has 2 Numeric columns
and 1 Categorical columns
*****************************************************
*****************************************************
Numerical Columns: Index(['User-ID', 'Book-Rating'], dtype='object')
Categorical Coulumns: Index(['ISBN'], dtype='object')


#### <span style= "color:orange"> Data Cleaning </SPAN>

Duplicates

In [322]:
duplicates = []

def check_duplicates(data):

    """Function that iterates through the rows of our dataset to check whether they are duplicated or not"""
    
    for i in data.duplicated():
        duplicates.append(i)
    duplicates_set = set(duplicates)
    if(len(duplicates_set) == 1):
        print('The Dataset has No Duplicates')

    else:
        duplicates_percentage = np.round(((sum(duplicates)/len(data)) * 100 ), 2)
        print(f'Duplicated rows constitute of {duplicates_percentage} % of our dataset')

In [323]:
check_duplicates(book_ratings) # checking for duplicates in book_ratings

The Dataset has no Duplicates


In [324]:
check_duplicates(books) # checking for duplicates in books

The Dataset has no Duplicates


In [325]:
check_duplicates(users) # checking for duplicates in users

The Dataset has no Duplicates


Missing Values

In [326]:
def missing_values(data):

    """ Function for checking null values in percentage in relation to length of the dataset """

    if data.isnull().any().any() == False :

        print("There Are No Missing Values")

    else:

        missing_values = data.isnull().sum().sort_values(ascending=False)

        missing_val_percent = ((data.isnull().sum()/len(data)).sort_values(ascending=False))

        missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage %': missing_val_percent})

        return missing_df[missing_df['Percentage %'] > 0]

In [327]:
missing_values(book_ratings) # checking for missing values in book ratings

There Are No Missing Values


In [328]:
missing_values(books) # checking for missing values in books

Unnamed: 0,Missing Values,Percentage %
Image-URL-L,3,1.1e-05
Publisher,2,7e-06
Book-Author,1,4e-06


In [329]:
missing_values(users) # checking for missing values in users

Unnamed: 0,Missing Values,Percentage %
Age,110762,0.397199


In [330]:
def dropping_columns(data, columns):

    """A simple function to drop columns with missing values"""

    drop_column = data.drop(columns=columns, inplace = True)
    
    return drop_column

columns_to_drop = users[['Age']]

dropping_columns(users, columns_to_drop)

In [333]:
def drop_rows(data, columns):
    
    """A simple function to remove the rows of columns that have missing values """
    
    new_data = data.dropna(subset=columns, inplace=True)
    return new_data

col = ['Image-URL-L', 'Publisher', 'Book-Author']
drop_rows(books, col)

In [335]:
book_ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

#### <span style= "color:orange"> Feature Selection and EDA </SPAN>

In [346]:
def merge_dataframe(data_0, data_1, merge_column):
    """A function to merge the datasets based on a given column"""
    new_df = data_0.merge(data_1, on=merge_column)
    return new_df

df_rating = merge_dataframe(users, book_ratings, "User-ID")
df_rating

Unnamed: 0,User-ID,Location,ISBN,Book-Rating
0,2,"stockton, california, usa",0195153448,0
1,7,"washington, dc, usa",034542252,0
2,8,"timmins, ontario, canada",0002005018,5
3,8,"timmins, ontario, canada",0060973129,0
4,8,"timmins, ontario, canada",0374157065,0
...,...,...,...,...
1149775,278854,"portland, oregon, usa",0425163393,7
1149776,278854,"portland, oregon, usa",0515087122,0
1149777,278854,"portland, oregon, usa",0553275739,6
1149778,278854,"portland, oregon, usa",0553578596,0


In [347]:
missing_values(df_rating) # checking for missing values

There Are No Missing Values


In [348]:
check_duplicates(df_rating) # checking for duplicates

The Dataset has no Duplicates


In [349]:
get_info_shape_stats(df_rating, 'Merged DataFrame') # checking the dataset info

The Dataset: Merged DataFrame
has 1149780 rows and 4 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1149780 entries, 0 to 1149779
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   Location     1149780 non-null  object
 2   ISBN         1149780 non-null  object
 3   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 43.9+ MB
None
---------------------------
----------------------------
            User-ID   Book-Rating
count  1.149780e+06  1.149780e+06
mean   1.403864e+05  2.866950e+00
std    8.056228e+04  3.854184e+00
min    2.000000e+00  0.000000e+00
25%    7.034500e+04  0.000000e+00
50%    1.410100e+05  0.000000e+00
75%    2.110280e+05  7.000000e+00
max    2.788540e+05  1.000000e+01


In [351]:
""" merging the new dataset with the book dataset """
df_books = merge_dataframe(books, df_rating, 'ISBN')
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Location,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,"stockton, california, usa",0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,"timmins, ontario, canada",5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,"ottawa, ontario, canada",0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,"n/a, n/a, n/a",8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,"sudbury, ontario, canada",0


In [353]:
get_info_shape_stats(df_books, "Combined Dataset") # check merged dataset info

The Dataset: Combined Dataset
has 1031129 rows and 11 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031129 entries, 0 to 1031128
Data columns (total 11 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   ISBN                 1031129 non-null  object
 1   Book-Title           1031129 non-null  object
 2   Book-Author          1031129 non-null  object
 3   Year-Of-Publication  1031129 non-null  object
 4   Publisher            1031129 non-null  object
 5   Image-URL-S          1031129 non-null  object
 6   Image-URL-M          1031129 non-null  object
 7   Image-URL-L          1031129 non-null  object
 8   User-ID              1031129 non-null  int64 
 9   Location             1031129 non-null  object
 10  Book-Rating          1031129 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 94.4+ MB
None
---------------------------
----------------------

In [354]:

missing_values(df_books) # check for missing values

There Are No Missing Values


In [355]:
check_duplicates(df_books) # check for duplicates

The Dataset has no Duplicates


## Popularity Based Recommendation System

In [385]:
def calculate_popularity(df, column_name):

    """Calculates the popularity of values in a specific column of a dataframe"""

    popularity_df = pd.DataFrame(df[column_name].value_counts())
    return popularity_df

popularity_df = calculate_popularity(df_books, 'Book-Title')
popularity_df.head(20)

Unnamed: 0,Book-Title
Wild Animus,2502
The Lovely Bones: A Novel,1295
The Da Vinci Code,898
A Painted House,838
The Nanny Diaries: A Novel,828
Bridget Jones's Diary,815
The Secret Life of Bees,774
Divine Secrets of the Ya-Ya Sisterhood: A Novel,740
The Red Tent (Bestselling Backlist),723
Angels &amp; Demons,670


In [386]:

def filter_active_users(dataframe, threshold):

    """Filter the dataframe to include only users who have actively rated more than the threshold"""
    
    # Filter the DataFrame based on the count of each unique User-ID
    user_counts = dataframe['User-ID'].value_counts()
    filter = user_counts > threshold

    # Get the index values of the filtered rows
    filtered_index = filter[filter].index

    # Create a new DataFrame by selecting only the rows where User-ID is in the filtered index
    filtered_df = dataframe[dataframe['User-ID'].isin(filtered_index)]

    return filtered_df

df_filtered = filter_active_users(df_books, 300)
df_filtered.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Location,Book-Rating
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,"n/a, n/a, n/a",8
6,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,85526,"victoria, british columbia, canada",0
10,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,177458,"ottawa, ontario, canada",0
21,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,110912,"milpitas, california, usa",10
26,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,197659,"indiana, pennsylvania, usa",9


In [369]:
def calculate_rating_count(dataframe):

    """A Simple Function to Calculate the Number of Times each book has been rated"""

    # Group the dataframe by 'Book-Title' and count the occurrences of 'Book-Rating' for each title
    rating_count = dataframe.groupby('Book-Title')['Book-Rating'].count().reset_index()

    # Rename the 'Book-Rating' column to 'rating_count'
    rating_count.rename(columns={'Book-Rating': 'rating_count'}, inplace=True)

    # Merge the original dataframe with the 'rating_count' dataframe based on 'Book-Title'
    new_df = dataframe.merge(rating_count, on='Book-Title')

    # Display the first few rows of the merged dataframe
    return new_df

new_book_df = calculate_rating_count(df_filtered)
new_book_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Location,Book-Rating,rating_count
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,"n/a, n/a, n/a",8,3
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,85526,"victoria, british columbia, canada",0,3
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,177458,"ottawa, ontario, canada",0,3
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,110912,"milpitas, california, usa",10,2
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,197659,"indiana, pennsylvania, usa",9,2


In [390]:
def filter_rating_count(dataframe, threshold):
    
    """A Simple Funtion to Filter the dataframe based on a minimum rating count"""

    # Apply the filter to the 'dataframe' using the 'loc' function
    filtered_df = dataframe.loc[dataframe['rating_count'] >= threshold, :]

    # Display the first few rows of the filtered dataframe
    return filtered_df

final_df = filter_rating_count(new_book_df, 50)
final_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Location,Book-Rating,rating_count
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,11676,"n/a, n/a, n/a",9,88
6,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,36836,"raleigh, north carolina, usa",0,88
7,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,46398,"san antonio, texas, usa",9,88
8,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113270,"evanston, illinois, usa",0,88
9,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113519,"pleasanton, california, usa",0,88


In [391]:
get_info_shape_stats(final_df, 'Final DataFrame')

The Dataset: Final DataFrame
has 35871 rows and 12 columns
---------------------------
---------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35871 entries, 5 to 171959
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ISBN                 35871 non-null  object
 1   Book-Title           35871 non-null  object
 2   Book-Author          35871 non-null  object
 3   Year-Of-Publication  35871 non-null  object
 4   Publisher            35871 non-null  object
 5   Image-URL-S          35871 non-null  object
 6   Image-URL-M          35871 non-null  object
 7   Image-URL-L          35871 non-null  object
 8   User-ID              35871 non-null  int64 
 9   Location             35871 non-null  object
 10  Book-Rating          35871 non-null  int64 
 11  rating_count         35871 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 3.6+ MB
None
---------------------------
------

## Memory-Based Modelling

In [408]:
model_df = final_df[['User-ID','Book-Rating', 'Book-Author']]
reader = Reader()
data = Dataset.load_from_df(model_df, reader)

In [409]:
#Loading the data set
fggf

ValueError: could not convert string to float: 'Amy Tan'

In [397]:
# creating a user matrix for the book titles

user_item_matrix = final_df.pivot_table(
    index = 'User-ID',
    columns = 'Book-Title',
    values = 'Book-Rating',
    aggfunc = 'sum'
)
user_item_matrix = user_item_matrix.applymap(lambda x: 1 if x > 0 else 0)
user_item_matrix

Book-Title,1st to Die: A Novel,2nd Chance,4 Blondes,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Heartbreaking Work of Staggering Genius,A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),A Man in Full,A Map of the World,...,Wicked: The Life and Times of the Wicked Witch of the West,Wild Animus,Winter Moon,Winter Solstice,Wish You Well,Without Remorse,Wuthering Heights,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2276,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6251,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6543,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274301,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
274308,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
275970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
277427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [401]:
# using cosine similarity to calculate the relation between the vectors in the matrix

user_user_sim_matrix = pd.DataFrame(
    cosine_similarity(user_item_matrix)
)
user_user_sim_matrix.columns = user_item_matrix.index

user_user_sim_matrix

User-ID,2276,3363,4385,6251,6543,6575,7158,7346,8681,8936,...,270713,271284,273979,274004,274061,274301,274308,275970,277427,278418
0,1.000000,0.0,0.301511,0.000000,0.000000,0.043519,0.000000,0.090909,0.000000,0.123091,...,0.116052,0.0,0.000000,0.134840,0.0,0.000000,0.000000,0.000000,0.000000,0.0
1,0.000000,1.0,0.000000,0.000000,0.000000,0.102062,0.171499,0.106600,0.000000,0.000000,...,0.136083,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
2,0.301511,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.150756,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,0.0,0.000000,1.000000,0.161515,0.180579,0.000000,0.062869,0.078811,0.000000,...,0.000000,0.0,0.050572,0.046625,0.0,0.076139,0.000000,0.000000,0.055728,0.0
4,0.000000,0.0,0.000000,0.161515,1.000000,0.223607,0.062622,0.077850,0.097590,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.091287,0.069007,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,0.000000,0.0,0.000000,0.076139,0.000000,0.052705,0.000000,0.137620,0.000000,0.223607,...,0.140546,0.0,0.000000,0.122474,0.0,1.000000,0.105409,0.000000,0.048795,0.0
493,0.000000,0.0,0.000000,0.000000,0.000000,0.041667,0.070014,0.043519,0.000000,0.117851,...,0.111111,0.0,0.000000,0.064550,0.0,0.105409,1.000000,0.102062,0.154303,0.0
494,0.000000,0.0,0.000000,0.000000,0.091287,0.153093,0.000000,0.106600,0.133631,0.000000,...,0.136083,0.0,0.000000,0.000000,0.0,0.000000,0.102062,1.000000,0.094491,0.0
495,0.000000,0.0,0.000000,0.055728,0.069007,0.038576,0.000000,0.040291,0.000000,0.000000,...,0.051434,0.0,0.000000,0.059761,0.0,0.048795,0.154303,0.094491,1.000000,0.0


In [403]:
# change the data index to Customer ID 
user_user_sim_matrix['User-ID'] = user_item_matrix.index
user_user_sim_matrix = user_user_sim_matrix.set_index('User-ID')
user_user_sim_matrix 

User-ID,2276,3363,4385,6251,6543,6575,7158,7346,8681,8936,...,270713,271284,273979,274004,274061,274301,274308,275970,277427,278418
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2276,1.000000,0.0,0.301511,0.000000,0.000000,0.043519,0.000000,0.090909,0.000000,0.123091,...,0.116052,0.0,0.000000,0.134840,0.0,0.000000,0.000000,0.000000,0.000000,0.0
3363,0.000000,1.0,0.000000,0.000000,0.000000,0.102062,0.171499,0.106600,0.000000,0.000000,...,0.136083,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
4385,0.301511,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.150756,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
6251,0.000000,0.0,0.000000,1.000000,0.161515,0.180579,0.000000,0.062869,0.078811,0.000000,...,0.000000,0.0,0.050572,0.046625,0.0,0.076139,0.000000,0.000000,0.055728,0.0
6543,0.000000,0.0,0.000000,0.161515,1.000000,0.223607,0.062622,0.077850,0.097590,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.091287,0.069007,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274301,0.000000,0.0,0.000000,0.076139,0.000000,0.052705,0.000000,0.137620,0.000000,0.223607,...,0.140546,0.0,0.000000,0.122474,0.0,1.000000,0.105409,0.000000,0.048795,0.0
274308,0.000000,0.0,0.000000,0.000000,0.000000,0.041667,0.070014,0.043519,0.000000,0.117851,...,0.111111,0.0,0.000000,0.064550,0.0,0.105409,1.000000,0.102062,0.154303,0.0
275970,0.000000,0.0,0.000000,0.000000,0.091287,0.153093,0.000000,0.106600,0.133631,0.000000,...,0.136083,0.0,0.000000,0.000000,0.0,0.000000,0.102062,1.000000,0.094491,0.0
277427,0.000000,0.0,0.000000,0.055728,0.069007,0.038576,0.000000,0.040291,0.000000,0.000000,...,0.051434,0.0,0.000000,0.059761,0.0,0.048795,0.154303,0.094491,1.000000,0.0
