# <span style= "color:cyan"> BUILDING A RECOMMENDATION SYSTEM </SPAN>

Load Libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, SVDpp
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import scipy
import math
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'surprise'

#### <span style= "color:orange"> Loading the dataset </SPAN>

In [None]:
def read_data(path, error_bad_lines = False, encoding = 'latin-1', sep=';', on_bad_lines = 'skip'):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.
    error_bad_lines : bool, default False
        Legacy switch kept for backward compatibility. When True, malformed
        lines raise an error (equivalent to ``on_bad_lines='error'``).
    encoding : str, default 'latin-1'
        Text encoding of the file.
    sep : str, default ';'
        Field delimiter.
    on_bad_lines : str, default 'skip'
        What pandas should do with malformed lines; forwarded to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # `error_bad_lines` is no longer accepted by `pd.read_csv` in current
    # pandas releases, so it is translated into its `on_bad_lines` equivalent
    # instead of being forwarded.
    if error_bad_lines:
        on_bad_lines = 'error'

    return pd.read_csv(path, encoding=encoding, sep=sep, on_bad_lines=on_bad_lines)

# Load the three source tables (ratings, books, users) in one pass; the order
# of the paths matches the order of the names on the left-hand side.
book_ratings, books, users = (
    read_data(csv_path)
    for csv_path in (
        r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Book-Ratings.csv',
        r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Books.csv',
        r'C:\Users\user\Documents\Recommendation Systems\recommendation_system_project\BX-Users.csv',
    )
)

We have three datasets:
* `books`
* `users`
* `book_ratings`

Let us explore them by viewing first five rows of each

In [None]:
""" calling on variable book_ratings to view the first 5 rows"""

book_ratings.head()

In [None]:
""" calling on variable books to view the first five rows"""

books.head()

In [None]:
""" calling on variable users to view the first 5 rows"""

users.head()

#### <span style= "color:orange"> Preliminary Data understanding </SPAN>

In [None]:

def get_info_shape_stats(dataset, dataset_name):
    """Print the shape, column info and descriptive statistics of a dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The table to summarise.
    dataset_name : str
        Label printed in the report header.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    n_rows, n_cols = dataset.shape

    print('The Dataset:', dataset_name )
    print(f"has {n_rows} rows and {n_cols} columns")
    print('---------------------------')
    print('---------------------------')
    # `info()` writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    dataset.info()
    print('---------------------------')
    print('----------------------------')
    print(dataset.describe())

In [None]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(book_ratings, 'Book Ratings')

In [None]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(books, 'Books')

* Some columns are labelled None and contain numerous null values; these will be analyzed during the data cleaning stage.

In [None]:
"""calling on the function get_info_shape_stats"""

get_info_shape_stats(users, 'Users')

In [None]:
def data_types(data, dataset_name):
    """Report how many numeric and categorical (object) columns a dataset has.

    Prints the two counts, a separator, and then the column labels of each
    group. Nothing is returned.
    """
    # Select each column group once and reuse it for both the count and the listing.
    numeric_cols = data.select_dtypes(include='number').columns
    categorical_cols = data.select_dtypes(include='object').columns

    print("Dataset:", dataset_name, "has", len(numeric_cols), "Numeric columns")
    print("and", len(categorical_cols), "Categorical columns")

    separator = '*****************************************************'
    for _ in range(2):
        print(separator)

    print('Numerical Columns:', numeric_cols)
    print('Categorical Coulumns:', categorical_cols)

In [None]:
""" calling on the data_types function """

data_types(users, 'Users') 

In [None]:
""" calling on the data_types function """

data_types(books, 'Books')

In [None]:
""" calling on the data_types function """

data_types(book_ratings, 'Book Ratings')

#### <span style= "color:orange"> Data Cleaning </SPAN>

Duplicates

In [None]:
duplicates = []

def check_duplicates(data):
    """Report whether a dataset contains fully duplicated rows.

    Prints either a "no duplicates" message or the share of duplicated rows
    as a percentage of the dataset length, rounded to two decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to inspect.

    Returns
    -------
    None
        The result is written to standard output.
    """
    # The tally is local to this call, so the result for one dataset cannot
    # be contaminated by flags gathered while checking another one.
    n_duplicated = int(data.duplicated().sum())

    if n_duplicated == 0:
        # Also covers an empty dataset, which would otherwise divide by zero.
        print('The Dataset has No Duplicates')
    else:
        duplicates_percentage = np.round((n_duplicated / len(data)) * 100, 2)
        print(f'Duplicated rows constitute of {duplicates_percentage} % of our dataset')

In [None]:
check_duplicates(book_ratings) # checking for duplicates in book_ratings

In [None]:
check_duplicates(books) # checking for duplicates in books

In [None]:
check_duplicates(users) # checking for duplicates in users

Missing Values

In [None]:
def missing_values(data):
    """Summarise the missing values of a dataset, column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to inspect.

    Returns
    -------
    pandas.DataFrame or None
        None (after printing a message) when nothing is missing. Otherwise a
        frame indexed by column label, restricted to columns with at least
        one null, holding the null count ('Missing Values') and that count
        as a percentage of the number of rows ('Percentage %'), sorted from
        most to least affected.
    """
    null_counts = data.isnull().sum()

    if not null_counts.any():
        print("There Are No Missing Values")
        return None

    null_counts = null_counts.sort_values(ascending=False)
    # Scale to 0-100 so the values agree with the 'Percentage %' label.
    null_percent = null_counts / len(data) * 100

    missing_df = pd.DataFrame({'Missing Values': null_counts, 'Percentage %': null_percent})

    return missing_df[missing_df['Percentage %'] > 0]

In [None]:
missing_values(book_ratings) # checking for missing values in book ratings

In [None]:
missing_values(books) # checking for missing values in books

In [None]:
missing_values(users) # checking for missing values in users

In [None]:
def dropping_columns(data, columns):
    """Drop the given columns from a dataset in place.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to modify; it is mutated by this call.
    columns : label or list of labels
        Column label(s) to remove.

    Returns
    -------
    pandas.DataFrame
        The same (now modified) ``data`` object, so the result can be
        inspected or chained.
    """
    # An in-place drop returns None, so hand back the mutated frame itself
    # rather than the result of the call.
    data.drop(columns=columns, inplace=True)

    return data

# `drop` expects column labels, so pass the label itself rather than a
# DataFrame slice of the column.
columns_to_drop = ['Age']

dropping_columns(users, columns_to_drop)

In [None]:
def drop_rows(data, columns):
    """Remove, in place, every row that has a missing value in the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to modify; it is mutated by this call.
    columns : label or list of labels
        Columns whose missing values disqualify a row.

    Returns
    -------
    pandas.DataFrame
        The same (now modified) ``data`` object.
    """
    # An in-place dropna returns None, so hand back the mutated frame itself
    # rather than the result of the call.
    data.dropna(subset=columns, inplace=True)
    return data

# Rows of `books` lacking a value in any of these columns are discarded.
col = [
    'Image-URL-L',
    'Publisher',
    'Book-Author',
]
drop_rows(data=books, columns=col)

#### <span style= "color:orange"> Feature Selection and EDA </SPAN>

In [None]:
def merge_dataframe(data_0, data_1, merge_column):
    """Join two datasets on a shared column and return the combined frame.

    ``data_0`` is the left table, ``data_1`` the right one, and
    ``merge_column`` the key present in both. The merge uses the pandas
    defaults for everything else.
    """
    return pd.merge(data_0, data_1, on=merge_column)

# Pair every rating with the profile of the user who gave it.
df_rating = merge_dataframe(data_0=users, data_1=book_ratings, merge_column="User-ID")
df_rating

In [None]:
missing_values(df_rating) # checking for missing values

In [None]:
check_duplicates(df_rating) # checking for duplicates

In [None]:
get_info_shape_stats(df_rating, 'Merged DataFrame') # checking the dataset info

In [None]:
# Attach the book details to the user/rating table by joining on ISBN.
df_books = merge_dataframe(data_0=books, data_1=df_rating, merge_column='ISBN')
df_books.head()

In [None]:
get_info_shape_stats(df_books, "Combined Dataset") # check merged dataset info

In [None]:

missing_values(df_books) # check for missing values

In [None]:
check_duplicates(df_books) # check for duplicates

## Popularity Based Recommendation System

In [None]:
def calculate_popularity(df, column_name):
    """Count how often each value occurs in one column of a dataframe.

    Returns a single-column DataFrame indexed by the distinct values of
    ``column_name``, ordered from most to least frequent.
    """
    occurrences = df[column_name].value_counts()
    return occurrences.to_frame()

# Rank book titles by how many ratings they received and show the top 20.
popularity_df = calculate_popularity(df=df_books, column_name='Book-Title')
popularity_df.head(n=20)

In [None]:

def filter_active_users(dataframe, threshold):
    """Keep only the rows of users with more than ``threshold`` rows.

    Activity is measured as the number of rows per 'User-ID'. Users whose
    count is strictly greater than ``threshold`` are retained; all of their
    rows are returned with the original index preserved.
    """
    ratings_per_user = dataframe['User-ID'].value_counts()

    # User-IDs whose row count clears the threshold.
    active_user_ids = ratings_per_user[ratings_per_user > threshold].index

    belongs_to_active_user = dataframe['User-ID'].isin(active_user_ids)
    return dataframe[belongs_to_active_user]

# Restrict the data to users with more than 300 rows in the merged table.
df_filtered = filter_active_users(dataframe=df_books, threshold=300)
df_filtered.head()

In [None]:
def calculate_rating_count(dataframe):
    """Add a 'rating_count' column with the number of ratings per book title.

    The count is the number of non-null 'Book-Rating' values for each
    'Book-Title'. It is merged back onto the input on 'Book-Title', so every
    row carries the total for its title. Returns the merged frame.
    """
    # One row per title: ['Book-Title', 'rating_count'].
    per_title_counts = (
        dataframe.groupby('Book-Title')['Book-Rating']
        .count()
        .rename('rating_count')
        .reset_index()
    )

    return dataframe.merge(per_title_counts, on='Book-Title')

# Annotate each row with how often its book title was rated by the active users.
new_book_df = calculate_rating_count(dataframe=df_filtered)
new_book_df.head()

In [None]:
def filter_rating_count(dataframe, threshold):
    """Keep only rows whose 'rating_count' is at least ``threshold``.

    Returns the matching rows with all columns and the original index.
    """
    meets_threshold = dataframe['rating_count'] >= threshold
    return dataframe[meets_threshold]

# Keep books that were rated at least 50 times.
rating_more_50 = filter_rating_count(dataframe=new_book_df, threshold=50)
rating_more_50.head()

If you preview the `User-ID` and `Book-Title` columns, you will see that some users have rated the same book more than once. This can come about from reading the book multiple times and forming a different opinion each time. Let's preview the dataset that contains the two columns.

In [None]:
# Narrow the table to the (user, title) pair so repeated ratings can be spotted.
book_user_id_df = rating_more_50.loc[:, ['User-ID', 'Book-Title']]
book_user_id_df

In [None]:
check_duplicates(book_user_id_df)

Let's go ahead and create the final dataframe and remove the duplicates in the two columns 

In [None]:
# One row per (user, title): the first occurrence of each pair is kept.
final_df = rating_more_50.drop_duplicates(subset=['User-ID', 'Book-Title'], keep='first')
final_df.head()

In [None]:
get_info_shape_stats(final_df, 'Final DataFrame')

## Model Based Collaborative Filtering Recommender

>> Collaborative filtering is a method of making automatic predictions (i.e. filtering) about the interests of a user by collecting preferences or taste information from many users in aggregate (i.e. collaborating). There are two main approaches to collaborative filtering:

>> * Item - Item CF : "Users who like this item also liked..."
>> * User - Item CF : "Users who are similar to you also liked"
 
>> Model based collaborative filtering approach involves building machine learning algorithms to predict user's ratings. They involve dimensionality reduction methods that reduce high dimensional matrix containing abundant number of missing values with a much smaller matrix in a lower-dimensional space.
The goal of this section is to compare the SVD and SVDpp algorithms, try optimizing their parameters, and explore the results. Let's start by preparing our dataset for modelling.

In [None]:
# Give the modelling columns snake_case names. No columns are removed here;
# the relevant ones are selected later, when the data is handed to the model.
# The result is rebound instead of renaming in place, so the frame produced by
# the earlier filtering steps is not mutated.
final_df = final_df.rename(
    columns={'User-ID': 'user_id', 'ISBN': 'isbn', 'Book-Rating': 'book_rating'}
)

In [None]:
# Filter out the least active users: keep those with at least
# `user_ratings_threshold` ratings in `final_df`.
user_ratings_threshold = 3

filter_users = final_df['user_id'].value_counts()
has_enough_ratings = filter_users >= user_ratings_threshold
filter_users_list = filter_users.loc[has_enough_ratings].index.to_list()

active_user_rows = final_df['user_id'].isin(filter_users_list)
df_ratings_top = final_df.loc[active_user_rows]

print('Filter: users with at least %d ratings\nNumber of records: %d' % (user_ratings_threshold, len(df_ratings_top)))

In [None]:
# Keep only the most frequently rated books: the top fraction
# (`book_ratings_threshold_perc`) of distinct ISBNs, ranked by rating count.
book_ratings_threshold_perc = 0.1
distinct_book_count = df_ratings_top['isbn'].nunique(dropna=False)
book_ratings_threshold = distinct_book_count * book_ratings_threshold_perc

ratings_per_book = df_ratings_top['isbn'].value_counts()
filter_books_list = ratings_per_book.iloc[:int(book_ratings_threshold)].index.to_list()

is_top_book = df_ratings_top['isbn'].isin(filter_books_list)
df_ratings_top = df_ratings_top[is_top_book]

print('Filter: Top %d%% Most Frequently Rated Books\nNumber of records: %d' % (book_ratings_threshold_perc*100, len(df_ratings_top)))

# SVD (Singular Value Decomposition)

>> SVD is a widely used matrix decomposition method that works by reducing dimensionality of the user item matrix by extracting its latent factors and capturing underlying patterns

In [None]:
def evaluate_model(df, model_class, rating_scale=(1, 10), cv=3):
    """Cross-validate a Surprise algorithm on a ratings dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'isbn' and 'book_rating'.
    model_class : class
        Algorithm class; it is instantiated with no arguments.
    rating_scale : tuple, default (1, 10)
        Rating scale handed to the Surprise ``Reader``.
    cv : int, default 3
        Passed to ``cross_validate`` as its ``cv`` argument.

    Returns
    -------
    pandas.Series
        The mean over folds of each entry returned by ``cross_validate``.
    """
    rating_triples = df[['user_id', 'isbn', 'book_rating']]
    data = Dataset.load_from_df(rating_triples, Reader(rating_scale=rating_scale))

    fold_results = cross_validate(model_class(), data, cv=cv)

    return pd.DataFrame(fold_results).mean()

In [None]:
# Work on a copy so the filtered ratings table itself stays untouched.
df = df_ratings_top.copy()
svd_results = evaluate_model(df=df, model_class=SVD)
print("SVD Results:", svd_results, sep="\n")

# SVDpp

>> The SVDpp algorithm is an extension of SVD that takes implicit ratings into account. Implicit ratings refer to user interactions or behaviors that indirectly reflect their preferences or interests towards items in a recommender system. Unlike explicit ratings, implicit ratings are derived from user actions such as clicks, views, purchases, time spent, or other forms of interaction with items.

In [None]:
# Same evaluation as for SVD, on the same ratings copy, with the SVDpp algorithm.
svdpp_results = evaluate_model(df=df, model_class=SVDpp)
print("SVDpp Results:", svdpp_results, sep="\n")

The test_RMSE for SVD is much better. We will go ahead and do some hyperparameter tuning on the SVD model.

## Optimizing SVD Model

In [None]:
# Build the Surprise dataset from the filtered ratings.
df = df_ratings_top.copy()
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'isbn', 'book_rating']], reader)

# Every combination of these values is cross-validated (3 x 3 x 3 x 3 = 81 candidates).
param_grid = {
    'n_factors': [10, 100, 500],
    'n_epochs': [5, 20, 50],
    'lr_all': [0.001, 0.005, 0.02],
    'reg_all': [0.005, 0.02, 0.1]}

gs_model = GridSearchCV(
    algo_class = SVD,
    param_grid = param_grid,
    # Same number of folds as the default of evaluate_model, so the tuned
    # score is comparable with the untuned SVD / SVDpp scores.
    cv = 3,
    n_jobs = -1,
    joblib_verbose = 5)

gs_model.fit(data)

# `best_estimator` is an algorithm instance configured with the parameters that
# minimise the root mean squared error; the grid search does not train it on the
# whole dataset unless `refit` is enabled, so train it explicitly here.
best_SVD = gs_model.best_estimator['rmse']
best_SVD.fit(data.build_full_trainset())

print("Tuned SVD Model RMSE", gs_model.best_score['rmse'])
print("Best Paramers", gs_model.best_params['rmse'])

Great! We see a reduced RMSE, which is an indication of improved performance.

## Hybrid Recommendation System 

>>  A hybrid recommender system is a special type of recommender system that combines both content-based and collaborative filtering methods.

## LightFM 

>> LightFM is a hybrid matrix factorisation model representing users and items as linear combinations of their content features’ latent factors. The model outperforms both collaborative and content-based models in cold-start or sparse interaction data scenarios (using both user and item metadata), and performs at least as well as a pure collaborative matrix factorisation model where interaction data is abundant.

In [None]:
# `cm` is used below to build the sparse user x book rating matrix.
# `lf` (LightFM) is imported here but not used in the visible part of this cell.
from scipy.sparse import coo_matrix as cm
import lightfm as lf

# NOTE(review): `ratings` is not defined in any cell shown above. Presumably it
# is a ratings table with integer `user_id` / `book_id` columns and a `rating`
# column, derived from `df_ratings_top` (which has `user_id`, `isbn`,
# `book_rating`) -- confirm where it is created before running this cell.

# IDs were re-indexed to start from zero, so each matrix dimension is the
# largest id plus one.
numUsers = ratings.user_id.max()+1
numBooks = ratings.book_id.max()+1

# COO matrix holding each rating at position (user_id, book_id).
ratSparse = cm((ratings.rating, (ratings.user_id, ratings.book_id)),
               shape=(numUsers, numBooks))
