# Book-Oracle

Book-Oracle is a book recommendation app that suggests books based on a reader's preferences.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np

#Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

#NLP
import nltk

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Global matplotlib defaults: 8x5 inch figures, white figure and axes
# backgrounds, black axes edges.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
pd.plotting.register_matplotlib_converters()
# Single float display format for pandas output: thousands separator and two
# decimals. Setting `display.float_format` more than once only keeps the last
# value, so it is configured exactly once here.
pd.options.display.float_format = "{:,.2f}".format

#Custom functions
from data_cleaning import clean_ratings, clean_users, clean_books

import warnings

# Presumably the random seed for later modelling steps; it is not used in the
# cells visible here -- TODO confirm.
RSEED = 42

# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the session.
warnings.simplefilter('ignore')

## Import Data

In [None]:
# Load the three raw CSV files from the data/ folder.
ratings_df = pd.read_csv('data/ratings.csv')
books_df = pd.read_csv('data/books.csv')
users_df = pd.read_csv('data/users.csv')

# Report the size of each raw table; DataFrame.shape is (rows, columns), so it
# can be unpacked straight into the two placeholders.
print("Ratings table has {} rows and {} columns".format(*ratings_df.shape))
print("Books table has {} rows and {} columns".format(*books_df.shape))
print("Users table has {} rows and {} columns".format(*users_df.shape))

## Data cleaning

#### Clean Books Dataset

In [None]:
# clean_books (project helper) hands back the cleaned books table together
# with a lookup that is later used to map ISBNs to a common identifier.
books_df, common_identifier_dict = clean_books(books_df)

# Size of the books table after cleaning (shape unpacks to rows, columns).
print("Books table has {} rows and {} columns".format(*books_df.shape))

In [None]:
# Preview the first three rows of the cleaned books table.
books_df.head(3)

#### Clean Ratings Dataset

In [None]:
# Clean the raw ratings with the project helper.
ratings_df = clean_ratings(ratings_df)

# Attach the shared book identifier by looking every ISBN up in the mapping
# returned by clean_books. With a plain dict, ISBNs that are not in the
# mapping come back as NaN (pandas Series.map semantics).
isbn_column = ratings_df['isbn']
ratings_df['common_identifier'] = isbn_column.map(common_identifier_dict)

print("Ratings table has {} rows and {} columns".format(*ratings_df.shape))

# Count the ratings whose identifier has no counterpart in books_df.
has_book = ratings_df['common_identifier'].isin(books_df['common_identifier'])
orphan_ratings = ratings_df[~has_book]
print("")
print("There are {} rows in ratings_df that do not have a common identifier in books_df".format(orphan_ratings.shape[0]))

# Show the three ratings with the smallest identifiers (ascending sort).
ratings_df.sort_values("common_identifier").head(3)

#### Clean Users Dataset

In [None]:
# Clean the raw users table with the project helper.
users_df = clean_users(users_df)

# Size of the users table after cleaning, followed by a three-row preview.
print("Users table has {} rows and {} columns".format(*users_df.shape))
users_df.head(3)

## Merge tables

In [None]:
# Build one flat table: books joined to their ratings on the common book
# identifier, then joined to the rating user's profile on user_id. Both joins
# are inner joins, so rows without a match on either side are dropped.
df = (
    books_df
    .merge(ratings_df, on='common_identifier', how='inner')
    .merge(users_df, on='user_id', how='inner')
)

# Total number of missing cells across all columns.
missing_cells = df.isna().sum().sum()
print("There are {} missing values in df".format(missing_cells))

# Number of rows that are exact repeats of an earlier row.
duplicate_rows = df.duplicated().sum()
print("There are {} duplicates in df".format(duplicate_rows))

df.head(3)

In [None]:
# Print a summary of the merged table: column dtypes, non-null counts and memory usage.
df.info()

## Save cleaned data

In [None]:
# Persist the merged table as CSV; index=False keeps the row index out of the file.
df.to_csv('data/kaggle_full_df.csv', index=False)

## Exploratory Data Analysis

In [None]:
# Check for outliers: summary statistics (count, mean, std, min, quartiles, max)
# give a first look at implausible minimum/maximum values.
df.describe()

In [None]:
# 2x2 overview grid: book ratings, user age, top countries, top cities.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(rating_ax, age_ax), (country_ax, city_ax) = axes

# Book rating distribution
sns.countplot(ax=rating_ax, x='book_rating', data=df, palette='hls')
rating_ax.set(title='Distribution of book ratings', xlabel='Book rating', ylabel='Count')

# Age distribution (one bar per distinct age value)
sns.countplot(ax=age_ax, x='age', data=df, palette='hls')
age_ax.set(title='Distribution of age', xlabel='Age', ylabel='Count')
age_ax.tick_params(axis='x', rotation=90)
# Place major ticks only at multiples of 5 so the rotated labels stay readable.
age_ax.xaxis.set_major_locator(plt.MultipleLocator(5))

# Top 15 countries by number of rows
top_countries = df['country'].value_counts().index[:15]
sns.countplot(ax=country_ax, y='country', data=df, palette='hls', order=top_countries)
country_ax.set(title='TOP 15 countries', xlabel='Count', ylabel='Country')

# Top 15 cities by number of rows
top_cities = df['city'].value_counts().index[:15]
sns.countplot(ax=city_ax, y='city', data=df, palette='hls', order=top_cities)
city_ax.set(title='TOP 15 cities', xlabel='Count', ylabel='City')

plt.subplots_adjust(hspace=0.5)

total_rows = df.shape[0]

# Share of rows whose user comes from an English speaking country.
english_speaking = ['usa', 'canada', 'united kingdom', 'australia', 'new zealand', 'ireland']
english_rows = df[df['country'].isin(english_speaking)].shape[0]
print("Percentage of users that come from English speaking countries: {:.2f}%".format(english_rows/total_rows*100))

# Share of implicit ratings, i.e. rows where book_rating equals 0.
implicit_rows = df[df['book_rating']==0].shape[0]
print("Percentage of ratings that are implicit (user interacted with a book, but hasn't given a rating): {:.2f}%".format(implicit_rows/total_rows*100))

In [None]:
# Share of users with more than 30 rows (ratings) in df.
ratings_per_user = df['user_id'].value_counts()
heavy_users = ratings_per_user[ratings_per_user>30]
print("Percentage of users that have rated more than 30 books: {:.2f}%".format(heavy_users.shape[0]/ratings_per_user.shape[0]*100))

# Share of books (by common identifier) with more than 50 rows (ratings) in df.
ratings_per_book = df['common_identifier'].value_counts()
popular_books = ratings_per_book[ratings_per_book>50]
print("Percentage of books that have more than 50 ratings: {:.2f}%".format(popular_books.shape[0]/ratings_per_book.shape[0]*100))

In [None]:
# How many books does a single user rate? Count the rows per user once and
# read the quantiles off that distribution.
ratings_per_user = df['user_id'].value_counts()

print("95% of users have rated {} or less books".format(ratings_per_user.quantile(0.95)))
print("90% of users have rated {} or less books".format(ratings_per_user.quantile(0.9)))
print("75% of users have rated {} or less books".format(ratings_per_user.quantile(0.75)))
print("50% of users have rated {} or less books".format(ratings_per_user.quantile(0.5)))

# Same distribution as a compact quantile table, including the lower quartile
# and the 99th percentile.
print("Quantiles of number of book ratings per user:")
user_quantile_table = ratings_per_user.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print(user_quantile_table)

In [None]:
# How many ratings does a single book receive? Count the rows per common
# identifier once and read the quantiles off that distribution.
ratings_per_book = df['common_identifier'].value_counts()

print("95% of books have {} or less ratings".format(ratings_per_book.quantile(0.95)))
print("90% of books have {} or less ratings".format(ratings_per_book.quantile(0.9)))
print("75% of books have {} or less ratings".format(ratings_per_book.quantile(0.75)))
print("50% of books have {} or less ratings".format(ratings_per_book.quantile(0.5)))

# Same distribution as a compact quantile table, including the lower quartile
# and the 99th percentile.
print("Quantiles of number of book ratings per book:")
book_quantile_table = ratings_per_book.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print(book_quantile_table)

In [None]:
# 2x2 grid: top row counts all ratings, bottom row only explicit ones.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
(titles_ax, authors_ax), (explicit_titles_ax, explicit_authors_ax) = axes

# Explicit ratings are the rows with a non-zero book_rating.
explicit_df = df[df['book_rating']!=0]

# Titles with the most ratings
top_titles = df['book_title'].value_counts().index[:15]
sns.countplot(ax=titles_ax, y='book_title', data=df, palette='hls', order=top_titles)
titles_ax.set(title='TOP 15 books with most ratings', xlabel='Count')

# Authors with the most ratings
top_authors = df['book_author'].value_counts().index[:15]
sns.countplot(ax=authors_ax, y='book_author', data=df, palette='hls', order=top_authors)
authors_ax.set(title='TOP 15 authors with most ratings', xlabel='Count')

# Titles with the most explicit ratings
top_explicit_titles = explicit_df['book_title'].value_counts().index[:15]
sns.countplot(ax=explicit_titles_ax, y='book_title', data=explicit_df, palette='hls', order=top_explicit_titles)
explicit_titles_ax.set(title='TOP 15 books with most explicit ratings', xlabel='Count')

# Authors with the most explicit ratings
top_explicit_authors = explicit_df['book_author'].value_counts().index[:15]
sns.countplot(ax=explicit_authors_ax, y='book_author', data=explicit_df, palette='hls', order=top_explicit_authors)
explicit_authors_ax.set(title='TOP 15 authors with most explicit ratings', xlabel='Count')

plt.subplots_adjust(hspace=0.5)

## Pipeline Architecture

## Sample Size

## Modelling

## Evaluation

## Error Analysis