# **Advanced machine learning for data science 2 - Project**

## <font color="red"> **Submission 2 - Book Recommendation System** </font>

### <font color="darksalmon"> Group 1 : Maxime Bisiaux - Vincent Fernandez - Audrey Laborde - Weilun Lin - Charlotte Padovani - Jie Su </font>

###  <font color="pink6"> Import and print the dataset </font>

<i> Link to the dataset: https://www.kaggle.com/datasets/alhanoofat/goodreadsbest1500books </i>

In [None]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the Goodreads CSV export into the working DataFrame
DATASET_FILE = "Goodreads_best1500books.csv"
df = pd.read_csv(DATASET_FILE)

In [None]:
# Relabel the first CSV column as 'index' (whatever its header was in the file)
first_column = df.columns[0]
df = df.rename(columns={first_column: 'index'})

In [None]:
# Show the first two rows as a quick check of the loaded columns
preview_rows = df.head(2)
print(preview_rows)

   index                                          book_name      author_name  \
0      0  Harry Potter and the Deathly Hallows (Harry Po...     J.K. Rowling   
1      1            The Hunger Games (The Hunger Games, #1)  Suzanne Collins   

    book_genre  year_published edition_language avg_rating no_of_raters  \
0      Fantasy            2007          English       4.61    2,530,201   
1  Young Adult            2008          English       4.33    5,856,382   

     score no_of_ppl_voted                                           book_url  
0  392,793           3,968  https://www.goodreads.com/book/show/136251.Har...  
1  289,899           2,958  https://www.goodreads.com/book/show/2767052-th...  


<b> Data dictionary </b>

<img src='https://drive.google.com/uc?id=1C9OHyVnkmxXBJZWkiOtJqiOjADqb3MJ0'/>

### <font color="pink4"> Data preparation/cleaning </font>

In [None]:
# Remove the "," thousands separators so the rater counts are plain digit strings
raters_text = df["no_of_raters"]
df["no_of_raters"] = raters_text.str.replace(",", "", regex=False)

In [None]:
# Clean the data
# Some rows hold stray words ("it", "liked", "really", "avg", "rating") where a
# number is expected. `Series.replace(<scalar>, None)` is version dependent:
# older pandas releases treat value=None as "not given" and silently copy the
# previous row's value, newer ones insert None (which `float()` cannot convert).
# Coercing to numeric marks every non-numeric entry as NaN explicitly and gives
# both columns a numeric dtype.
df["avg_rating"] = pd.to_numeric(df["avg_rating"], errors="coerce")

# Missing counts stay NaN here, so a later cast to a plain integer dtype needs
# a nullable integer type (e.g. "Int64") or an explicit fill.
df["no_of_raters"] = pd.to_numeric(df["no_of_raters"], errors="coerce")

### <font color="pink4"> Recommendation part - Cosine similarity (content-based system) and popularity-based system </font>

In [None]:
# The recommendation is built from three columns ("author_name", "book_genre"
# and "year_published"); missing entries in them become empty strings
features = ['author_name', 'book_genre', 'year_published']

df = df.fillna(value=dict.fromkeys(features, ''))

In [None]:
# Build the text description of one book by joining its three feature values
def combined_features(row):
    """Return author_name, book_genre and year_published of ``row`` as one space-separated string."""
    parts = (row['author_name'], row['book_genre'], row['year_published'])
    return " ".join(str(part) for part in parts)

# One combined text per row, stored next to the original columns
combined_text = df.apply(combined_features, axis=1)
df["combined_features"] = combined_text

In [None]:
# Token-count representation of the combined text of every book
cv = CountVectorizer()

book_documents = df["combined_features"]
count_matrix = cv.fit_transform(book_documents)
dense_counts = count_matrix.toarray()
print("Count Matrix:", dense_counts)

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Pairwise cosine similarity between all rows of the count matrix
similarity_matrix = cosine_similarity(count_matrix)
cosine_sim = similarity_matrix
print(cosine_sim)

[[1.         0.         0.         ... 0.28867513 0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.5       ]
 ...
 [0.28867513 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.5        ... 0.         0.         1.        ]]


In [None]:
# Title the recommendations are computed for
books_user_likes = "Running with Scissors"

def get_index_from_book_name(book_name, frame=None):
    """Return the row position of the first book titled exactly ``book_name``.

    The position (0-based row number) is returned rather than the value of the
    'index' column, because the result is used to select a row of the
    similarity matrix, which has one row per DataFrame row.

    Parameters
    ----------
    book_name : str
        Exact title to look up in the ``book_name`` column.
    frame : pandas.DataFrame, optional
        Table to search; defaults to the notebook-level ``df``.

    Raises
    ------
    IndexError
        If no row has this title.
    """
    frame = df if frame is None else frame
    positions = frame["book_name"].eq(book_name).values.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"No book titled {book_name!r} was found")
    return int(positions[0])

# Locate the liked book; the value selects its row in the similarity matrix
books_index = get_index_from_book_name(book_name=books_user_likes)

In [None]:
# Pair every row position with its similarity to the liked book, then rank the
# pairs from the highest to the lowest similarity
similar_books = [(position, score) for position, score in enumerate(cosine_sim[books_index])]
sorted_similar_books = list(similar_books)
sorted_similar_books.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
def get_book_name_from_index(index, frame=None):
    """Return the title stored at row position ``index``.

    The lookup is positional: callers pass positions produced by enumerating a
    row of the similarity matrix, so it must not depend on the DataFrame's
    index labels. It also avoids scanning the whole table for each lookup.

    Parameters
    ----------
    index : int
        0-based row position.
    frame : pandas.DataFrame, optional
        Table to read from; defaults to the notebook-level ``df``.

    Raises
    ------
    IndexError
        If ``index`` is outside the table.
    """
    frame = df if frame is None else frame
    return frame["book_name"].iloc[index]

# Print the titles of the first 16 entries of the similarity ranking
shown_books = sorted_similar_books[:16]
for position, _similarity in shown_books:
    print(get_book_name_from_index(position))

Running with Scissors
Dry
A Wolf at the Table
Possible Side Effects
Magical Thinking: True Stories
Don't Let's Go to the Dogs Tonight: An African Childhood
The Tender Bar
Youth
Middlesex
Atonement
The Devil in the White City: Murder, Magic, and Madness at the Fair That Changed America
The Other Boleyn Girl (The Plantagenet and Tudor Novels, #9)
The Five People You Meet in Heaven
Seabiscuit: An American Legend
Reading Lolita in Tehran: A Memoir in Books
The Eyre Affair (Thursday Next, #1)


### <font color="pink4"> Top 15 rating </font>

In [None]:
# This ranking only uses "book_genre" and "year_published"; missing entries in
# them become empty strings
features = ['book_genre', 'year_published']

df = df.fillna(value=dict.fromkeys(features, ''))

In [None]:
# Build the text description of one book from its genre and publication year
def combined_features2(row):
    """Return book_genre and year_published of ``row`` as one space-separated string."""
    parts = (row['book_genre'], row['year_published'])
    return " ".join(str(part) for part in parts)

# One genre/year text per row, stored next to the original columns
combined_text2 = df.apply(combined_features2, axis=1)
df["combined_features2"] = combined_text2

In [None]:
# Token-count representation of the genre/year text of every book
cv2 = CountVectorizer()

genre_year_documents = df['combined_features2']
count_matrix2 = cv2.fit_transform(genre_year_documents)
dense_counts2 = count_matrix2.toarray()
print("Count Matrix :", dense_counts2)

Count Matrix : [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Pairwise cosine similarity between all rows of the genre/year count matrix
similarity_matrix2 = cosine_similarity(count_matrix2)
cosine_sim2 = similarity_matrix2
print(cosine_sim2)

[[1.  0.  0.  ... 0.5 0.  0. ]
 [0.  1.  0.  ... 0.  0.  0. ]
 [0.  0.  1.  ... 0.  0.  1. ]
 ...
 [0.5 0.  0.  ... 1.  0.  0. ]
 [0.  0.  0.  ... 0.  1.  0. ]
 [0.  0.  1.  ... 0.  0.  1. ]]


In [None]:
# Title the rating-based recommendations are computed for
books_user_likes = "The Casual Vacancy"

def get_index_from_book_name(book_name, frame=None):
    """Return the row position of the first book titled exactly ``book_name``.

    The position (0-based row number) is returned rather than the value of the
    'index' column, because the result is used to select a row of the
    similarity matrix, which has one row per DataFrame row.

    Parameters
    ----------
    book_name : str
        Exact title to look up in the ``book_name`` column.
    frame : pandas.DataFrame, optional
        Table to search; defaults to the notebook-level ``df``.

    Raises
    ------
    IndexError
        If no row has this title.
    """
    frame = df if frame is None else frame
    positions = frame["book_name"].eq(book_name).values.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"No book titled {book_name!r} was found")
    return int(positions[0])

# Locate the liked book; the value selects its row in the similarity matrix
books_index = int(get_index_from_book_name(book_name=books_user_likes))

In [None]:
# Pair every row position with its genre/year similarity to the liked book,
# then rank the pairs from the highest to the lowest similarity
similar_books = [(position, score) for position, score in enumerate(cosine_sim2[books_index])]
sorted_similar_books = list(similar_books)
sorted_similar_books.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# book_name, avg_rating
# Both helpers look rows up by position: callers pass positions produced by
# enumerating a row of the similarity matrix, so the lookup must not depend on
# the DataFrame's index labels (and needs no scan of the whole table).

def get_book_name_from_index(index, frame=None):
    """Return the title stored at row position ``index``.

    ``frame`` defaults to the notebook-level ``df``. Raises IndexError when
    ``index`` is outside the table.
    """
    frame = df if frame is None else frame
    return frame["book_name"].iloc[index]

def get_avg_rating_from_index(index, frame=None):
    """Return the ``avg_rating`` value stored at row position ``index``.

    The value is returned as stored (callers convert it with ``float``).
    ``frame`` defaults to the notebook-level ``df``. Raises IndexError when
    ``index`` is outside the table.
    """
    frame = df if frame is None else frame
    return frame["avg_rating"].iloc[index]

# Collect [title, average rating] for the first 16 entries of the similarity
# ranking, then order them from the highest to the lowest rating
lst = [
    [get_book_name_from_index(position), float(get_avg_rating_from_index(position))]
    for position, _similarity in sorted_similar_books[:16]
]

lst.sort(key=lambda entry: entry[1], reverse=True)
print(lst)

[["The Orphan Master's Son", 4.07], ["Tell the Wolves I'm Home", 4.04], ['The Light Between Oceans', 4.02], ['The Snow Child', 3.97], ['My Brilliant Friend (The Neapolitan Novels, #1)', 3.94], ['The Round House', 3.93], ['The Silver Linings Playbook', 3.92], ['The Unlikely Pilgrimage of Harold Fry (Harold Fry, #1)', 3.91], ["Where'd You Go, Bernadette", 3.9], ['The Hundred-Year-Old Man Who Climbed Out of the Window and Disappeared (The Hundred-Year-Old Man, #1)', 3.83], ["Billy Lynn's Long Halftime Walk", 3.8], ['Flight Behavior', 3.78], ["Mr. Penumbra's 24-Hour Bookstore (Mr. Penumbra's 24-Hour Bookstore, #1)", 3.75], ['Beautiful Ruins', 3.68], ['NW', 3.44], ['The Casual Vacancy', 3.3]]


In [None]:
# book_name, avg_rating and no_of_raters

def get_book_info_from_index(index, frame=None):
    """Return ``(book_name, avg_rating, no_of_raters)`` for row position ``index``.

    The lookup is positional: callers pass positions produced by enumerating a
    row of the similarity matrix, so it must not depend on the DataFrame's
    index labels. Rating and rater count are converted with ``float``.

    Parameters
    ----------
    index : int
        0-based row position.
    frame : pandas.DataFrame, optional
        Table to read from; defaults to the notebook-level ``df``.

    Raises
    ------
    IndexError
        If ``index`` is outside the table.
    """
    frame = df if frame is None else frame
    title = frame["book_name"].iloc[index]
    rating = float(frame["avg_rating"].iloc[index])
    raters = float(frame["no_of_raters"].iloc[index])
    return title, rating, raters

# Gather (title, rating, raters) for the first 16 entries of the similarity
# ranking, order them by rating and then by number of raters (both descending)
# and print one line per book
ranked_infos = [
    get_book_info_from_index(position)
    for position, _similarity in sorted_similar_books[:16]
]

lst = sorted(ranked_infos, key=lambda info: (info[1], info[2]), reverse=True)

for title, rating, raters in lst:
    print(f"Title : {title}, Average rating : {rating}, Number of raters : {raters}")

Title : The Orphan Master's Son, Average rating : 4.07, Number of raters : 80520.0
Title : Tell the Wolves I'm Home, Average rating : 4.04, Number of raters : 116143.0
Title : The Light Between Oceans, Average rating : 4.02, Number of raters : 365061.0
Title : The Snow Child, Average rating : 3.97, Number of raters : 106890.0
Title : My Brilliant Friend (The Neapolitan Novels, #1), Average rating : 3.94, Number of raters : 160965.0
Title : The Round House, Average rating : 3.93, Number of raters : 91056.0
Title : The Silver Linings Playbook, Average rating : 3.92, Number of raters : 211119.0
Title : The Unlikely Pilgrimage of Harold Fry (Harold Fry, #1), Average rating : 3.91, Number of raters : 138892.0
Title : Where'd You Go, Bernadette, Average rating : 3.9, Number of raters : 417379.0
Title : The Hundred-Year-Old Man Who Climbed Out of the Window and Disappeared (The Hundred-Year-Old Man, #1), Average rating : 3.83, Number of raters : 188880.0
Title : Billy Lynn's Long Halftime Wal

### <font color="pink4"> Additional information </font>

In [None]:
# Import packages
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from wordcloud import WordCloud
from matplotlib import colors 

In [None]:
# Top 50 author rating
# Rank on the numeric value of the rating: the column may still hold text at
# this point, and text is ordered character by character, not by magnitude.
# Entries that are not numbers are ranked last.
rating_values = pd.to_numeric(df['avg_rating'], errors='coerce')
top_rated_labels = rating_values.sort_values(ascending=False).index[:50]
author_rating = df.loc[top_rated_labels]

fig = go.Figure(data=[go.Table(header=dict(values=['author_name', 'avg_rating']),
                 cells=dict(values=[author_rating['author_name'], author_rating['avg_rating']]))
                     ])
fig.show()

In [None]:
# Number of books per author in the dataset (authors that have the most books
# in the 21st century come first)
books_per_author = df["author_name"].value_counts()
books_per_author

Stephen King        16
Rick Riordan        16
Scott McElhaney     13
Charlaine Harris    12
Jim Butcher         10
                    ..
Naomi Novik          1
Jon   Stewart        1
Jenny Downham        1
E. Lockhart          1
Chuck Hogan          1
Name: author_name, Length: 873, dtype: int64

In [None]:
# Cleaning process and modification of the data types
def _to_integer_column(column):
    """Convert a column of counts (text with "," separators, or numbers) to integers.

    Entries that are not numeric become missing values. The result is a plain
    int64 column when nothing is missing and a nullable "Int64" column
    otherwise, so the cast never fails on missing data. Going through text
    first keeps the cell re-runnable after the column is already numeric.
    """
    digits = column.astype(str).str.replace(",", "", regex=False)
    numbers = pd.to_numeric(digits, errors="coerce")
    if numbers.notna().all():
        return numbers.astype("int64")
    return numbers.astype("Int64")

df["score"] = _to_integer_column(df["score"])
# `no_of_raters` used to be cast twice in a row; one cast is enough.
# NOTE(review): the repeated line may have been meant for `no_of_ppl_voted` - confirm.
df["no_of_raters"] = _to_integer_column(df["no_of_raters"])

In [None]:
# Top 50 books in the 21st century: the 50 highest values of 'score'
book_rating = df.sort_values(by='score', ascending=False).head(50)

score_columns = ['book_name', 'score', 'no_of_ppl_voted']
score_table = go.Table(header=dict(values=score_columns),
                       cells=dict(values=[book_rating[column] for column in score_columns]))
fig = go.Figure(data=[score_table])
fig.show()

In [None]:
# The most rated books: the 50 highest values of 'no_of_raters'
book_visited = df.sort_values(by='no_of_raters', ascending=False).head(50)

raters_columns = ['book_name', 'no_of_raters']
raters_table = go.Table(header=dict(values=raters_columns),
                        cells=dict(values=[book_visited[column] for column in raters_columns]))
fig = go.Figure(data=[raters_table])
fig.show()

In [None]:
# Top 10 fiction book recommendations by average rating
fiction_book = df[df['book_genre']=='Fiction']
# Rank on the numeric value of the rating: the column may still hold text at
# this point, and text is ordered character by character, not by magnitude.
# Entries that are not numbers are ranked last.
fiction_ratings = pd.to_numeric(fiction_book['avg_rating'], errors='coerce')
top_fiction_labels = fiction_ratings.sort_values(ascending=False).index[:10]
fiction_book_rec = fiction_book.loc[top_fiction_labels]
# Colour names are the CSS keywords, which contain no space ('lightblue', 'lightgrey')
fig = go.Figure(data=[go.Table(header=dict(values=['book_name', 'avg_rating'], fill_color='lightblue'),
                 cells=dict(values=[fiction_book_rec['book_name'], fiction_book_rec['avg_rating']], fill_color='lightgrey'))
                     ])
fig.show()