In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("book ds.csv", encoding="ISO-8859-1")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8563 entries, 0 to 8562
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             8563 non-null   object 
 1   series            4586 non-null   object 
 2   author            8563 non-null   object 
 3   rating            8563 non-null   float64
 4   language          8563 non-null   object 
 5   isbn              8563 non-null   object 
 6   genres            8563 non-null   object 
 7   bookFormat        8554 non-null   object 
 8   edition           831 non-null    object 
 9   pages             8534 non-null   object 
 10  publisher         8563 non-null   object 
 11  publishDate       8563 non-null   object 
 12  firstPublishDate  6228 non-null   object 
 13  awards            8563 non-null   object 
 14  numRatings        8563 non-null   int64  
 15  ratingsByStars    8563 non-null   object 
 16  likedPercent      8563 non-null   int64  


In [4]:
# Drop Duplicates
df = df.drop_duplicates()

In [5]:
# Reset index of data frame
df.reset_index(inplace = True)

In [8]:
# Generate an id for each book
book_id = []
for i in range(0, len(df)):
    book_id.append(i)

# Add book_id column to dataframe
df["book_id"] = book_id

In [9]:
df = df[["title", "author", "genres", "book_id"]]
df.head()

Unnamed: 0,title,author,genres,book_id
0,The Hunger Games,Suzanne Collins,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",0
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)","['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",1
2,Twilight,Stephenie Meyer,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",2
3,The Book Thief,Markus Zusak (Goodreads Author),"['Historical Fiction', 'Fiction', 'Young Adult...",3
4,Animal Farm,"George Orwell, Russell Baker (Preface), C.M. W...","['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",4


In [14]:
# Remove square brackets from genre column
df["genres"] = df['genres'].str.strip('[]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genres"] = df['genres'].str.strip('[]')


In [17]:
# Combine important columns: title and author
def combine_features(data):
    features = []
    for i in range(0, data.shape[0]):
        features.append(data["title"][i] + " " + data["author"][i])
    return features
    

In [18]:
df["combined_features"] = combine_features(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["combined_features"] = combine_features(df)


In [19]:
# Convert text from the new column to a matrix of word counts
cm = CountVectorizer().fit_transform(df["combined_features"])

In [20]:
# Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)

In [26]:
# Get the title of the book the reader likes
Title = df["title"][72]
Title

'Where the Red Fern Grows'

In [27]:
# Find the book id of the book that the user likes
book_id = df[df.title == Title]["book_id"].values[0]
book_id

72

In [29]:
# Create a list of tuples in the form (book_id, similarity score)
scores = list(enumerate(cs[book_id]))

In [30]:
# Sort the list of similar books in descending order

sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
sorted_scores = sorted_scores[1:]

In [31]:
# Create a loop to print the first 5 books from the sorted list
j = 0
print(f"The 5 most recommended books to '{Title}' are:\n ")
for item in sorted_scores:
    book_title = df[df.book_id == item[0]]["title"].values[0]
    print(j+1, book_title)
    j = j+1
    if j >= 5:
        break

The 5 most recommended books to 'Where the Red Fern Grows' are:
 
1 Summer of the Monkeys
2 The Red and the Black
3 The Chronoliths
4 The Piano Lesson
5 The Red Box
