#  Day 7: Building a Book Recommendation System

In [1]:
# Import Required Libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset
df = pd.read_csv('Goodreads-books.csv', error_bad_lines=False)



  df = pd.read_csv('Goodreads-books.csv', error_bad_lines=False)
Skipping line 3350: expected 12 fields, saw 13
Skipping line 4704: expected 12 fields, saw 13
Skipping line 5879: expected 12 fields, saw 13
Skipping line 8981: expected 12 fields, saw 13



In [3]:
# Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   bookID                                              title  \
0       1  Harry Potter and the Half-Blood Prince (Harry ...   
1       2  Harry Potter and the Order of the Phoenix (Har...   
2       4  Harry Potter and the Chamber of Secrets (Harry...   
3       5  Harry Potter and the Prisoner of Azkaban (Harr...   
4       8  Harry Potter Boxed Set  Books 1-5 (Harry Potte...   

                      authors  average_rating        isbn         isbn13  \
0  J.K. Rowling/Mary GrandPré            4.57  0439785960  9780439785969   
1  J.K. Rowling/Mary GrandPré            4.49  0439358078  9780439358071   
2                J.K. Rowling            4.42  0439554896  9780439554893   
3  J.K. Rowling/Mary GrandPré            4.56  043965548X  9780439655484   
4  J.K. Rowling/Mary GrandPré            4.78  0439682584  9780439682589   

  language_code    num_pages  ratings_count  text_reviews_count  \
0           eng          652        2095690               2759

In [4]:
# Dataset Information
print("\nDataset Info:")
print(df.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB
None


In [5]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
  num_pages           0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
dtype: int64


In [7]:
df = df[['title', 'authors', 'average_rating', 'language_code', '  num_pages', 'publisher']]

In [8]:
# Rename columns for easier access
df.rename(columns={'  num_pages': 'num_pages'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'  num_pages': 'num_pages'}, inplace=True)


In [9]:
# Combine relevant features for recommendations
df['combined_features'] = (
    df['title'] + ' ' +
    df['authors'] + ' ' +
    df['publisher'] + ' ' +
    df['language_code']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_features'] = (


In [10]:
# Preview combined features
print("\nCombined Features:")
print(df['combined_features'].head())


Combined Features:
0    Harry Potter and the Half-Blood Prince (Harry ...
1    Harry Potter and the Order of the Phoenix (Har...
2    Harry Potter and the Chamber of Secrets (Harry...
3    Harry Potter and the Prisoner of Azkaban (Harr...
4    Harry Potter Boxed Set  Books 1-5 (Harry Potte...
Name: combined_features, dtype: object


In [11]:
# Use TF-IDF Vectorizer to convert text into numerical features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [12]:
# Check the shape of the TF-IDF matrix
print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)



TF-IDF Matrix Shape: (11123, 18900)


In [13]:
# Calculate cosine similarity between all books
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [16]:
def recommend_books(title, cosine_sim=cosine_sim, df=df):
    # Check if the title exists in the dataset
    matches = df[df['title'].str.contains(title, case=False, na=False)]
    
    if matches.empty:
        print(f"No matches found for the title '{title}'. Please try another title.")
        return pd.DataFrame()  # Return an empty DataFrame
    
    # Get the index of the first matching book
    idx = matches.index[0]
    
    # Get the similarity scores for all books with the given book
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 10 most similar books
    sim_scores = sim_scores[1:11]
    
    # Get the indices of the recommended books
    book_indices = [i[0] for i in sim_scores]
    
    # Return the titles of the recommended books
    return df.iloc[book_indices][['title', 'authors', 'average_rating']]


In [17]:
# Test with a sample book
book_title = "The Hunger Games"
print(f"\nBooks similar to '{book_title}':")
recommended_books = recommend_books(book_title)
print(recommended_books)


Books similar to 'The Hunger Games':
No matches found for the title 'The Hunger Games'. Please try another title.
Empty DataFrame
Columns: []
Index: []


In [18]:
# Test with a new book title
book_title = "Harry Potter"
print(f"\nBooks similar to '{book_title}':")
recommended_books = recommend_books(book_title)
print(recommended_books)



Books similar to 'Harry Potter':
                                                   title  \
615    Harry Potter and the Half-Blood Prince (Harry ...   
1      Harry Potter and the Order of the Phoenix (Har...   
4      Harry Potter Boxed Set  Books 1-5 (Harry Potte...   
3      Harry Potter and the Prisoner of Azkaban (Harr...   
6           Harry Potter Collection (Harry Potter  #1-6)   
4415   Harry Potter and the Chamber of Secrets (Harry...   
2      Harry Potter and the Chamber of Secrets (Harry...   
10675  Harry Potter and the Goblet of Fire (Harry Pot...   
8873   Harry Potter and the Sorcerer's Stone (Harry P...   
1233   Harry Potter and the Prisoner of Azkaban (Harr...   

                          authors  average_rating  
615                  J.K. Rowling            4.57  
1      J.K. Rowling/Mary GrandPré            4.49  
4      J.K. Rowling/Mary GrandPré            4.78  
3      J.K. Rowling/Mary GrandPré            4.56  
6                    J.K. Rowling            