# Book-Oracle: Basic Recommendation System

- Develop a basic Recommendation System
- 26.11.2023
- Janina, Oliwia, Neha, Nina

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

#Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix, hstack
from sklearn.neighbors import NearestNeighbors

#NLP
import nltk

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Global matplotlib defaults: figure size, white figure/axes backgrounds, black axes border.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
# Register pandas' formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render floats with a thousands separator and two decimals.
# Only one formatter can be active at a time; a '%.3f' formatter that used to be set
# right before this line was overwritten immediately and never took effect, so it is gone.
pd.options.display.float_format = "{:,.2f}".format

# Presumably the random seed to use wherever randomness is involved (name-based; confirm usage).
RSEED = 42

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Load the ratings table (book, user and rating columns) from the project data folder.
df = pd.read_csv('data/kaggle_full_df.csv')

# Label missing countries explicitly. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column operates on an intermediate object
# and does not update `df` once pandas Copy-on-Write is active.
df['country'] = df['country'].fillna('unknown')
df.head(3)

Unnamed: 0,book_title,book_author,year_of_publication,publisher,image_url_m,common_identifier,user_id,isbn,book_rating,age,city,country
0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,2,195153448,0,18,stockton,usa
1,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,269782,801319536,7,30,edmonton,canada
2,Pay It Forward: A Novel,Catherine Ryan Hyde,2000,Simon &amp; Schuster,http://images.amazon.com/images/P/0684862719.0...,2392,269782,684862719,8,30,edmonton,canada


In [3]:
# Column dtypes and non-null counts, to spot columns that still contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005487 entries, 0 to 1005486
Data columns (total 12 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   book_title           1005487 non-null  object
 1   book_author          1005487 non-null  object
 2   year_of_publication  1005487 non-null  object
 3   publisher            1005487 non-null  object
 4   image_url_m          1005487 non-null  object
 5   common_identifier    1005487 non-null  int64 
 6   user_id              1005487 non-null  int64 
 7   isbn                 1005487 non-null  object
 8   book_rating          1005487 non-null  int64 
 9   age                  1005487 non-null  int64 
 10  city                 1005209 non-null  object
 11  country              1005487 non-null  object
dtypes: int64(4), object(8)
memory usage: 92.1+ MB


## TO DOs before Modelling

1. Resolve issue of users who gave multiple ratings
2. Resolve NaN in Country & City

In [4]:
# Count rows per (title, author, user) and list the combinations that occur more than once,
# i.e. users with several rating rows for the same book
(
    df.groupby(['book_title', 'book_author', 'user_id'])
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
    .query("Count > 1")
    .head(10)
)

Unnamed: 0,book_title,book_author,user_id,Count
551152,"Phonics Fun: Reading Program, Pack 4 (Clifford...",Francie Alexander,185233,12
578426,Ranma 1/2 (Ranma 1/2),Rumiko Takahashi,156111,6
272856,Flame Of Recca (Flame Of Recca),Nobuyuki Anzai,10354,5
42106,Adventures Of Huckleberry Finn,Mark Twain,240258,5
424530,Life And Teaching Of The Masters Of The Far Ea...,Baird T. Spalding,187763,4
145978,Chobits (Chobits),Clamp,9227,4
145980,Chobits (Chobits),Clamp,38023,4
145984,Chobits (Chobits),Clamp,196160,4
145985,Chobits (Chobits),Clamp,224904,4
431933,Little Women,Louisa May Alcott,203240,4


## Most Basic Recommendation System

Let's create the most basic recommendation system, based on EXPLICIT ratings (1-10) from readers in the USA & Canada

#### Subset data
- only EXPLICIT ratings and users from USA & Canada

In [5]:
# Two successive filters, applied in this order:
#   1. explicit ratings only (book_rating above 0)
#   2. users whose country string contains "usa" or "canada"
df = (
    df
    .loc[lambda frame: frame['book_rating'] > 0]
    .loc[lambda frame: frame['country'].str.contains("usa|canada")]
)

df.shape

(303032, 12)

#### Create a new variable: Rating Count

In [6]:
#Add a new column with the total rating count of each book; a book is identified by (title, author)
#assign() returns a new frame, so the column is not written onto the filtered slice created above
book_keys = ['book_title', 'book_author']
df = df.assign(rating_count=df.groupby(book_keys)['book_rating'].transform('count'))

#Show the books with the highest rating count, grouped by title and author to show unique books
(
    df.groupby(book_keys + ['rating_count'])
    .size()
    .reset_index(name='Count')
    .sort_values(by='rating_count', ascending=False)
    .head(5)
)

Unnamed: 0,book_title,book_author,rating_count,Count
86042,The Lovely Bones: A Novel,Alice Sebold,614,614
79268,The Da Vinci Code,Dan Brown,420,420
91346,The Secret Life Of Bees,Sue Monk Kidd,387,387
103977,Wild Animus,Rich Shapero,352,352
90242,The Red Tent (Bestselling Backlist),Anita Diamant,351,351


In [7]:
# Preview the first rows, now including the rating_count column
df.head(3)

Unnamed: 0,book_title,book_author,year_of_publication,publisher,image_url_m,common_identifier,user_id,isbn,book_rating,age,city,country,rating_count
1,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,269782,801319536,7,30,edmonton,canada,1
2,Pay It Forward: A Novel,Catherine Ryan Hyde,2000,Simon &amp; Schuster,http://images.amazon.com/images/P/0684862719.0...,2392,269782,684862719,8,30,edmonton,canada,26
3,Watership Down,Richard Adams,1976,Avon,http://images.amazon.com/images/P/0380002930.0...,3172,269782,140039589,10,30,edmonton,canada,99


#### Define book popularity threshold

In [8]:
# Keep only books that were rated at least `popularity_threshold` times
popularity_threshold = 50
df = df.query("rating_count >= @popularity_threshold")
df.shape

(51972, 13)

#### Define user activity threshold

In [9]:
#Subset only users with at least `user_activity_threshold` ratings
#(counted on the rows that are still in df at this point)
user_activity_threshold = 30

user_rating_counts = df['user_id'].value_counts()
active_users = user_rating_counts[user_rating_counts >= user_activity_threshold].index
df = df[df['user_id'].isin(active_users)]
df.shape

(4430, 13)

#### Create a Pivot Matrix

In [10]:
# Book x user rating table: one row per book title, one column per user id,
# 0 where a user has no rating for a book.
# pivot_table aggregates repeated (title, user) pairs with its default aggfunc (mean).
movie_features_df = (
    df.pivot_table(values='book_rating', index='book_title', columns='user_id')
    .fillna(0)
)
movie_features_df.head()

user_id,4017,6251,6575,7346,8454,13552,16795,21014,22625,23872,...,234828,235105,235282,236283,240567,241980,242083,255489,258534,270713
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,8.0,0.0,0.0,8.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,8.0,0.0,7.0,0.0,0.0,0.0
1St To Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
2Nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
A Bend In The Road,0.0,0.0,1.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Case Of Need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Train a KNN model for item-based collaborative filtering

In [11]:
# Sparse (CSR) copy of the book x user rating table
movie_features_df_matrix = csr_matrix(movie_features_df.to_numpy())

# Brute-force nearest-neighbour search with cosine distance between the rows (books)
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)

#### Assess predictions of the model

In [25]:
#Choose a random book from our dataset
# The generator is seeded with RSEED so the pick (and the printed recommendations) is the
# same on every Restart & Run All; use another seed to sample a different book.
n_recommendations = 5
rng = np.random.default_rng(RSEED)
query_index = int(rng.integers(movie_features_df.shape[0]))
print(query_index)

# Ask for one extra neighbour because the queried book is normally returned as its own
# nearest neighbour; never ask for more neighbours than there are books.
distances, indices = model_knn.kneighbors(
    movie_features_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=min(n_recommendations + 1, movie_features_df.shape[0]),
)

#Print predicted books
print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# Remove the queried book by index instead of assuming it sits at position 0:
# a book with an identical rating vector can be returned ahead of it.
neighbours = [
    (book_idx, dist)
    for book_idx, dist in zip(indices.flatten(), distances.flatten())
    if book_idx != query_index
][:n_recommendations]

for rank, (book_idx, dist) in enumerate(neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[book_idx], dist))

129
Recommendations for East Of Eden (Oprah'S Book Club):

1: Daughter Of Fortune, with distance of 0.4406141482223369:
2: Cold Mountain, with distance of 0.5394656862928108:
3: Fried Green Tomatoes At The Whistle Stop Cafe, with distance of 0.5804725022214652:
4: Plain Truth, with distance of 0.5835207496748096:
5: Sticks &Amp; Scones, with distance of 0.5876945912236327:


## Pipeline Architecture

## Sample Size

## Modelling

## Evaluation

## Error Analysis