In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
book_genre = pd.read_csv('data2/archive/data.csv', encoding='latin-1', sep=',', on_bad_lines='warn')

In [7]:
book_genre.columns

Index(['isbn13', 'isbn10', 'title', 'subtitle', 'authors', 'categories',
       'thumbnail', 'description', 'published_year', 'average_rating',
       'num_pages', 'ratings_count'],
      dtype='object')

The necessary column names are taken into account 

In [14]:
books=book_genre[['isbn13', 'title', 'authors', 'published_year', 'categories','average_rating','ratings_count','thumbnail']]

In [16]:
books.head(2)

Unnamed: 0,isbn13,title,authors,published_year,categories,average_rating,ratings_count,thumbnail
0,9780002005883,Gilead,Marilynne Robinson,2004.0,Fiction,3.85,361.0,http://books.google.com/books/content?id=KQZCP...
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,2000.0,Detective and mystery stories,3.83,5164.0,http://books.google.com/books/content?id=gA5GP...


The books with less than 50 number of ratings are removed

In [19]:
books= books[books['ratings_count']>=50]

In [21]:
books.head(2)

Unnamed: 0,isbn13,title,authors,published_year,categories,average_rating,ratings_count,thumbnail
0,9780002005883,Gilead,Marilynne Robinson,2004.0,Fiction,3.85,361.0,http://books.google.com/books/content?id=KQZCP...
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,2000.0,Detective and mystery stories,3.83,5164.0,http://books.google.com/books/content?id=gA5GP...


Creation of Pivot Table

In [24]:
# Ensure columns are clean
books.columns = books.columns.str.strip()  # Remove leading/trailing spaces
books.columns = books.columns.str.lower()  # Convert to lowercase for consistency

In [26]:
# Create a pivot table to group by categories and titles, and calculate average ratings
table_pivot = books.pivot_table(
    columns='categories',            # Grouped by categories (genres)
    index='title',                   # Each book is indexed by its title
    values='average_rating',         # Values are the average ratings
    aggfunc='mean',                  # Aggregate by mean for each category
    fill_value=0                     # Fill missing values with 0
)


In [28]:
table_pivot

categories,87th Precinct (Imaginary place),Abandoned mines,Aboriginal Australians,Abused wives,Accidents,Acting,Actors,Actors and actresses,Actresses,Adirondack Mountains (N.Y.),...,True Crime,United States,Vice-Presidents,War,Women terrorists,"World War, 1914-1918",Yoknapatawpha County (Imaginary place),Young Adult Fiction,Zero (The number),Zoology
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Surely You're Joking, Mr. Feynman!""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""What Do You Care what Other People Think?""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
", said the shotgun to the head.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eBay For Dummies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
××× ×××§×¡×¨××,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
×× ××××× ××ª ××××©×¨,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
äºè¼ªæ¸ : è±æç,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
print(table_pivot.head())

categories                                   87th Precinct (Imaginary place)  \
title                                                                          
"Surely You're Joking, Mr. Feynman!"                                     0.0   
"What Do You Care what Other People Think?"                              0.0   
'Salem's Lot                                                             0.0   
'Tis                                                                     0.0   
, said the shotgun to the head.                                          0.0   

categories                                   Abandoned mines  \
title                                                          
"Surely You're Joking, Mr. Feynman!"                     0.0   
"What Do You Care what Other People Think?"              0.0   
'Salem's Lot                                             0.0   
'Tis                                                     0.0   
, said the shotgun to the head.                        

Training Model

Creates a sparse Matrix for compressing the data of the pivot table

In [25]:
from scipy.sparse import csr_matrix
book_sparse = csr_matrix(table_pivot)

Clustering algoritm which is Nearest Neighbors this is an unsupervised ml algo
(brute procedure)

In [51]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(table_pivot)
book_index = 237 
if book_index < len(table_pivot):
    selected_book = table_pivot.iloc[book_index, :].to_frame().T
    distances, suggestions = model.kneighbors(selected_book, n_neighbors=6)

In [32]:
suggestions

array([[  21, 4950,  237,  467, 1584,  453]], dtype=int64)

Gives titles to all the suggestions

In [35]:
for i in range(len(suggestions[0])): 
    print(table_pivot.index[suggestions[0][i]])  # Access the title of the recommended book


2061
Theory of Fun for Game Design
Agile Web Development with Rails
Behind Closed Doors
High performance MySQL
Beautiful Evidence


In [37]:
#stores book names
book_names = table_pivot.index

In [39]:
import pickle
pickle.dump(model,open('Recmodel/model.pkl','wb'))
pickle.dump(book_names,open('Recmodel/book_names.pkl','wb'))
pickle.dump(table_pivot,open('Recmodel/book_pivot.pkl','wb'))
pickle.dump(books,open('Recmodel/books.pkl','wb'))

Testing Model

In [42]:
# Function to recommend books based on genre and liked books
def recommend_books(user_genre, liked_books):
    books_list = []
    
    # Filter the books based on the selected genre from the original books DataFrame
    filtered_books = books[books['categories'].str.contains(user_genre, case=False, na=False)]
    
    # Ensure filtered_books is not empty
    if filtered_books.empty:
        return f"No books found for the genre '{user_genre}'."
    
    # Get book IDs of the liked books from the table_pivot index
    liked_books_ids = [np.where(table_pivot.index == book)[0][0] for book in liked_books if book in table_pivot.index]
    
    # Handle the case where no liked books are found
    if not liked_books_ids:
        return "None of the liked books are found in the dataset."
    
    # Calculate the mean vector of the liked books from the table_pivot
    liked_books_vector = table_pivot.iloc[liked_books_ids].mean(axis=0).values.reshape(1, -1)
    
    # Find the nearest neighbors based on the mean vector of liked books
    distance, suggestion = model.kneighbors(liked_books_vector, n_neighbors=6)

    # Get the recommended books from the table_pivot index
    for i in range(len(suggestion[0])):
        books_list.append(table_pivot.index[suggestion[0][i]])

    return books_list




In [44]:
table_pivot.index[33]

'A Breath of Snow and Ashes'

In [49]:
user_genre = "Fantasy"
liked_books = ["Harry Potter", "The Hobbit"]

recommend_books(user_genre, liked_books)




["Suki's Kimono",
 "Jessica's Bad Idea",
 'Black Beauty',
 'Summer of the Monkeys',
 'The Octopus',
 'Three Rotten Eggs']

In [None]:
print(books.columns)